def main():
    """Replay a restored PPO policy in TORCS for up to 1000 episodes.

    The checkpoint ``./save_model13/models/model_59557.ckpt`` is restored
    into a fresh TensorFlow session. Each episode steps the environment with
    the agent's actions until the environment reports ``done`` and then
    prints the episode's step count and accumulated reward.
    """
    ppo = PPOAgent()
    env = TorcsEnv(text=True, vision=False, throttle=True, gear_change=False)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, './save_model13/models/model_59557.ckpt')

        for i in range(1000):
            obs = env.reset()
            state = convert_obs(obs)
            score, step = 0, 0

            while True:
                action = ppo.choose_action(state)
                next_obs, reward, done, _ = env.step(action)
                # Pace the loop so the replay can be watched.
                time.sleep(0.05)
                next_state = convert_obs(next_obs)
                score += reward
                step += 1

                if done:
                    print(step, score)
                    # Leave the inner loop so the episode loop resets the
                    # environment AND rebuilds ``state`` from the new
                    # observation. Previously the reset observation was
                    # thrown away and the episode counter never advanced.
                    break

                state = next_state
def train_sac(sess, args, sac, saver):
    """Train ``sac`` on TORCS, logging and checkpointing as it goes.

    Args:
        sess: Session handed to ``saver.save``.
        args: Namespace read for ``s_dim``, ``a_dim``, ``buff_size``,
            ``total_ep``, ``max_ep_len`` and ``batch_size``.
        sac: Agent exposing ``get_action(state)`` and ``train(batch)``.
        saver: Object whose ``save(sess, path)`` writes the checkpoint.

    One line per episode is appended to ``performance.txt`` and the model is
    saved to ``ckpt/model`` every tenth episode.
    """

    def observation_vector(ob):
        # Flat sensor vector; wheel spin is scaled down by 100.
        return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                          ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                          ob.rpm))

    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    replay_buffer = ReplayBuffer(args.s_dim, args.a_dim, args.buff_size)

    for ep in range(args.total_ep):
        # Relaunch TORCS every 100 episodes because of the memory leak error.
        ob = env.reset(relaunch=True) if np.mod(ep, 100) == 0 else env.reset()
        s = observation_vector(ob)

        ep_rew, ep_len = 0.0, 0
        done = False
        while not done:
            if ep > 10:
                a = sac.get_action(s)
            else:
                # Warm-up episodes (0 through 10): full throttle, no steering.
                a = np.array([0.0, 1.0, 0.0])

            ob, r, done, _ = env.step(a)
            s2 = observation_vector(ob)

            ep_rew += r
            ep_len += 1
            # Hitting the length cap also ends the episode.
            done = True if ep_len >= args.max_ep_len else done

            replay_buffer.store(s, a, r, s2, float(done))
            s = s2

            outs = sac.train(replay_buffer.sample_batch(args.batch_size))

        rounded_reward = round(ep_rew, 4)
        print('episode: ', ep, ' | episode rewards: ', rounded_reward,
              ' | episode length: ', ep_len, ' | alpha/temperature: ',
              outs[9])

        fields = [str(ep), str(ep_len), str(rounded_reward),
                  str(round(outs[9], 4))]
        with open("performance.txt", "a") as myfile:
            myfile.write(" ".join(fields) + "\n")

        if ep % 10 == 0:
            saver.save(sess, 'ckpt/model')
def main():
    """Drive TORCS with a previously trained ``Supervise`` steering model.

    The experiment, data and model directories for the configured track must
    already exist; if one is missing, a message is printed and the function
    returns without starting TORCS. Otherwise the saved network is loaded and
    used to steer (throttle fixed at 0.16) until the episode ends or
    ``MAX_STEP`` steps have been taken.
    """
    track_no = 5
    experiment_name = "tensorboard-4"
    experiment_dir = "experiment-%s/" % experiment_name
    datas_dir = experiment_dir + "datas-track-no-%d/" % track_no
    models_dir = datas_dir + "model/"

    for required_dir in (experiment_dir, datas_dir, models_dir):
        if not os.path.exists(required_dir):
            print("%s dosen't exists" % required_dir)
            return

    state_dim = 4
    img_dim = [304, 412, 3]
    sess = tf.InteractiveSession()
    agent = Supervise(sess, state_dim, img_dim, models_dir)
    agent.load_network()

    MAX_STEP = 10000
    step = 0
    vision = True
    env = TorcsEnv(vision=vision, throttle=True, text_mode=False,
                   track_no=track_no, random_track=False, track_range=(5, 8))

    for i in range(1):
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # The last state slot carries the previous steering action
        # (0.0 before the first step).
        s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, 0.0))
        i_t = ob.img

        while step < MAX_STEP:
            action = agent.action(s_t, i_t)
            ob, reward, done, info = env.step([action, 0.16, 0])
            s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, action))
            i_t = ob.img
            print("Step", step, "Action", action, "Reward", reward)
            # Advance the counter: it was never incremented before, so the
            # MAX_STEP bound could not trigger and the log always showed 0.
            step += 1
            if done:
                break

    env.end()
def main():
    """Drive TORCS from the keyboard for four episodes, then exit.

    Every 50 ms the current value of the module-level ``action`` is sent to
    the environment and the last lap time is printed.
    NOTE(review): ``action`` is presumably updated by the ``get_action``
    keyboard callback -- confirm against its definition.
    """

    def state_vector(ob):
        # Same layout as the trajectory recorder expects; note speedY comes
        # before speedX here.
        return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedY,
                          ob.speedX, ob.speedZ, ob.wheelSpinVel / 100.0,
                          ob.rpm))

    env = TorcsEnv(vision=False, throttle=True, gear_change=False, text=False)

    # Register the callback once. Hooking inside the step loop added another
    # copy of the callback on every step.
    keyboard.hook(get_action)

    for ep in range(1, 5):
        done = False
        step, score = 0, 0

        if np.mod(ep, 5) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()
        state = state_vector(ob)

        while not done:
            time.sleep(0.05)
            next_ob, reward, done, info = env.step(action)
            state = state_vector(next_ob)
            print(next_ob.lastLapTime)
            score += reward
            step += 1

    # The CSV trajectory files are not opened in this version, so there is
    # nothing to close; the old close() calls raised NameError here.
    sys.exit()
def sample_one(self):
    """Roll out the sampled policy for one TORCS episode and cache the frames.

    The rollout stops when the environment reports ``done`` or after 300
    seconds of wall-clock time. TORCS is always shut down, even if the
    rollout raises. On success the collected observations, sonar readings,
    grayscale frames, actions and per-step travelled distance are stored on
    ``self.obs``, ``self.sonars``, ``self.grayscales``, ``self.actions`` and
    ``self.roll_distance``.

    Returns:
        None.
    """
    print()
    print('START PLOTTING MODULE'.center(80, '='))
    roll_distance = []
    print()
    print("TORCS Experiment Start".center(80, '='))
    env = TorcsEnv(vision=self.config.vision, throttle=self.config.throttle)

    try:
        ob = env.reset()
        sonar, grayscale = self.image_to_sonar(ob.img)
        sonar = np.reshape(sonar, [19])

        obs, actions, sonars, grayscales = [], [], [], []
        done = False  # has the episode ended?
        start_time = time.time()

        while not done and (time.time() - start_time < 300):
            obs.append(ob)
            sonars.append(sonar)
            grayscales.append(grayscale)

            # Policy input: 19 sonar values followed by the three speeds.
            state = np.concatenate(
                [sonar, np.array([ob.speedX, ob.speedY, ob.speedZ])], axis=0)
            action = self.sess.run(
                self.sampled_action,
                feed_dict={
                    self.observation_placeholder:
                    np.reshape(state, [1, self.observation_dim])
                })[0]

            ob, reward, done, info = env.step(action)
            sonar, grayscale = self.image_to_sonar(ob.img)
            sonar = np.reshape(sonar, [19])

            actions.append(action)
            roll_distance.append(env.distance_travelled)
    finally:
        env.end()  # This is for shutting down TORCS
        print("Finished TORCS session".center(80, '='))
        # Guard the lookup: if the rollout failed before its first step the
        # list is empty, and an IndexError here would hide the real error.
        if roll_distance:
            print('Final distance: ', roll_distance[-1], ' [m]')
        print('END PLOTTING MODULE'.center(80, '='))

    self.grayscales = grayscales
    self.sonars = sonars
    self.obs = obs
    self.actions = actions
    self.roll_distance = roll_distance
    return
def test_sac(sess, args, sac, saver):
    """Run one evaluation episode with the checkpoint at ``ckpt/model``.

    Args:
        sess: Session the checkpoint is restored into.
        args: Namespace read for ``max_ep_len``.
        sac: Agent exposing ``get_action(state, deterministic)``.
        saver: Object whose ``restore(sess, path)`` loads the checkpoint.

    Prints the accumulated reward and the episode length when done.
    """

    def observation_vector(obs):
        return np.hstack((obs.angle, obs.track, obs.trackPos, obs.speedX,
                          obs.speedY, obs.speedZ, obs.wheelSpinVel / 100.0,
                          obs.rpm))

    saver.restore(sess, 'ckpt/model')
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    # Start from a freshly launched TORCS instance.
    s = observation_vector(env.reset(relaunch=True))

    ep_rew, ep_len = 0.0, 0
    done = False
    while not done:
        # Second argument True selects deterministic actions at test time.
        ob, r, done, _ = env.step(sac.get_action(s, True))
        s = observation_vector(ob)
        ep_rew += r
        ep_len += 1
        if ep_len >= args.max_ep_len:
            done = True

    print('test time performance: | rewards: ', ep_rew, ' | length: ', ep_len)
def main():
    """Drive TORCS with a trained generator conditioned on a one-hot code.

    The first ``MIN_STEP_LIMIT`` steps send a zero action, the next
    ``PRE_STEP`` steps replay actions loaded from ``pre_actions_path``, and
    from then on the generator predicts the action from ResNet50 features,
    auxiliary inputs and the code vector. Output is written as
    single-argument ``print`` calls and the loop uses ``range``, so the
    function runs unchanged on Python 2 and Python 3.
    """
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    generator = Generator(sess, feat_dim, aux_dim, encode_dim, action_dim)
    base_model = ResNet50(weights='imagenet', include_top=False)
    feat_extractor = Model(
        input=base_model.input,
        output=base_model.get_layer('activation_40').output
    )

    try:
        generator.model.load_weights(param_path)
        print("Weight load successfully")
    except Exception:
        # Best effort: continue with untrained weights, but let
        # KeyboardInterrupt / SystemExit propagate.
        print("cannot find weight")

    env = TorcsEnv(throttle=True, gear_change=False)

    print("Start driving ...")
    try:
        ob = env.reset(relaunch=True)
        feat, aux = get_state(ob, aux_dim, feat_extractor)

        encode = np.zeros((1, encode_dim), dtype=np.float32)
        encode[0, code] = 1
        print("Encode: %s" % (encode[0],))

        pre_actions = np.load(pre_actions_path)["actions"]

        for i in range(MAX_STEP_LIMIT):
            if i < MIN_STEP_LIMIT:
                action = np.zeros(3, dtype=np.float32)
            elif i < MIN_STEP_LIMIT + PRE_STEP:
                action = pre_actions[i - MIN_STEP_LIMIT]
            else:
                action = generator.model.predict([feat, aux, encode])[0]

            ob, reward, done, _ = env.step(action)
            feat, aux = get_state(ob, aux_dim, feat_extractor)

            if i == MIN_STEP_LIMIT + PRE_STEP:
                print("Start deciding ...")

            status = (
                "Step:", i,
                "DistFromStart:", ob.distFromStart,
                "TrackPos:", ob.trackPos,
                "Damage:", ob.damage.item(),
                "Action: %.6f %.6f %.6f" % (action[0], action[1], action[2]),
                "Speed:", ob.speedX * 200,
            )
            # Same text the space-separated print statement produced.
            print(" ".join(str(part) for part in status))

            if done:
                break
    finally:
        # Shut TORCS down even if driving fails part-way.
        env.end()

    print("Finish.")
def testDDPG(sess, args, actor, critic):
    """Run a DDPG actor in TORCS for ``args['episode_count']`` episodes.

    Args:
        sess: Unused here; kept for interface compatibility.
        args: Mapping read for ``'episode_count'`` and ``'max_steps'``.
        actor: Network exposing ``predict(states)`` and ``s_dim``.
        critic: Unused here; kept for interface compatibility.

    A summary line is printed for every episode that reaches a terminal
    state within ``max_steps`` steps.
    """
    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    episode_count = args['episode_count']
    max_steps = args['max_steps']

    for i in range(episode_count):
        if np.mod(i, 100) == 0:
            # Relaunch TORCS every 100 episodes because of the memory leak.
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                       ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        ep_reward = 0
        ep_ave_max_q = 0  # never updated in test mode; reported as 0

        for j in range(max_steps):
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            # NOISE AT TEST TIME MAY BE REQUIRED TO STABILIZE ACTIONS
            a[0, :] += OU(x=a[0, :], mu=mu, sigma=sigma, theta=theta)

            ob, r, terminal, info = env.step(a[0])
            s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                           ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                           ob.rpm))
            ep_reward += r

            if terminal:
                # Average over the j + 1 steps actually taken. Dividing by j
                # raised ZeroDivisionError when the first step was terminal.
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, ep_ave_max_q / float(j + 1)))
                break
def play_trajectory(filepath, counts):
    """Replay the recorded expert actions from ``filepath`` in TORCS.

    Args:
        filepath: Passed to ``load_expert_trajectory``.
        counts: Number of times the action sequence is replayed.

    Every replay starts from a reset environment (relaunched on every third
    replay) and stops early if the environment reports ``done``.
    """
    # Defining torcs environment
    env = TorcsEnv(vision=VISION, throttle=True, gear_change=False)
    states, actions = load_expert_trajectory(filepath)

    for replay_idx in range(counts):
        print("Playing count : {}".format(replay_idx))

        relaunch_now = np.mod(replay_idx, 3) == 0
        ob = env.reset(relaunch=True) if relaunch_now else env.reset()

        state = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                           ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                           ob.rpm))

        for action in actions:
            # Apply the recorded action; only the ``done`` flag is used.
            ob, reward, done, info = env.step(action)
            if done:
                break
def env_setup(joystick_id):
    """Launch a vision-enabled TORCS environment and open a joystick.

    Args:
        joystick_id: Identifier handed to ``Controller``. It is now honoured;
            it used to be overwritten with 0 before use.

    NOTE(review): the visible body creates ``env``, ``ob`` and ``joypad`` but
    returns nothing -- confirm whether callers expect these back.
    """
    # Vision input enabled; the action is steering only (1-dim continuous).
    # For sensor input with steering + throttle use
    # TorcsEnv(vision=False, throttle=True) instead.
    env = TorcsEnv(vision=True, throttle=False)

    # Relaunch TORCS on reset (avoids the memory leak bug in torcs).
    ob = env.reset(relaunch=True)

    # Initialize the joystick controller for the requested device.
    joypad = Controller(joystick_id)
def test():
    """Drive one TORCS episode with the module-level ``model``.

    Returns:
        tuple: ``(count, reward_sum)`` -- the number of steps taken before
        the environment reported ``done`` and the reward accumulated.
    """
    env = TorcsEnv(vision=True, throttle=False)
    ob = env.reset(relaunch=True)

    count, reward_sum = 0, 0.0
    done = False
    while not done:
        # Camera frame is reshaped and scaled into [0, 1] for the model.
        frame = img_reshape(ob.img).astype('float32') / 255
        act = model.predict(frame)
        count += 1
        ob, reward, done, _ = env.step(act)
        reward_sum += reward

    env.end()
    print("Steps before crash: ", count, reward_sum)
    return count, reward_sum
def programmatic_game(tree_program, track_name='practgt2.xml'):
    """Drive TORCS for two episodes with a programmatic (tree) policy.

    Args:
        tree_program: Model whose ``predict([features])`` returns a sequence
            whose first row holds the three action components.
        track_name: Track file passed to ``TorcsEnv``.

    Distance raced and last lap time are logged every 1000 steps.
    """

    def features(ob):
        # Flat feature list in the order the tree program is fed.
        return ([ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm]
                + list(ob.wheelSpinVel / 100.0)
                + list(ob.track))

    episode_count = 2
    max_steps = 100000

    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False,
                   track_name=track_name)

    logging.info("TORCS Experiment Start with Priors on " + track_name)
    for i_episode in range(episode_count):
        # Relaunch TORCS on every episode because of the memory leak error.
        ob = env.reset(relaunch=True)
        newobs = features(ob)

        for j in range(max_steps):
            act_tree = tree_program.predict([newobs])
            action_prior = [act_tree[0][0], act_tree[0][1], act_tree[0][2]]

            ob, r_t, done, info = env.step(action_prior)
            # Rebuild the features from the observation just received.
            # Building them before the step made every decision after the
            # first use a one-step-old observation.
            newobs = features(ob)

            if np.mod(j, 1000) == 0:
                logging.info("Episode " + str(i_episode) + " Distance " +
                             str(ob.distRaced) + " Lap Times " +
                             str(ob.lastLapTime))

            if done:
                print('Done. Steps: ', j)
                break

    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
class A3C_Agent(Process):
    """Worker process holding a local A3C network and its own TORCS instance."""

    def __init__(self, rank, global_net, counter, lock, args):
        """Build the local network and environment, then do a first reset.

        Args:
            rank: Worker index; the TORCS port is ``3101 + rank``.
            global_net: Shared network the local one is reset from.
            counter: Shared counter read through ``counter.value()``.
            lock: Lock handed to the local network and kept on the agent.
            args: Stored on the agent as ``self.args``.
        """
        super().__init__()
        self.network = A3C_Network(29, 3, lock)

        self.global_net = global_net
        self.counter = counter
        self.lock = lock
        self.args = args

        # ``done`` starts True so the first reset() also resets the env.
        self.done = True
        self.name = ''

        self.env = TorcsEnv(
            port=3101 + rank,
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.reset()

    def reset(self, relaunch=None):
        """Synchronise with the global net and clear the rollout buffers.

        Args:
            relaunch: Forwarded to ``env.reset`` when the episode is done;
                defaults to the current value of ``self.done``.
        """
        relaunch = self.done if relaunch is None else relaunch

        # Synchronizing
        self.time_step = self.counter.value()
        self.network.reset(self.global_net, self.done)

        if self.done:
            self.state = self.env.reset(relaunch=relaunch,
                                        sampletrack=True,
                                        render=False)

        self.values, self.rewards = [], []
        self.log_probs, self.entropies = [], []

    def normalize_state(self):
        """Rescale ``self.state[1:20]`` and ``self.state[21:]`` in place.

        Entries 1..19 are mapped from [-0.01, 2] to [-1, 1] and entries from
        21 onward from [0, 1] to [-1, 1]; entries 0 and 20 are untouched.
        """

        def rescale(values, low, high):
            return (2 * ((values - low) / (high - low))) - 1

        state = self.state
        state[1:20] = rescale(state[1:20], -0.01, 2)
        state[21:] = rescale(state[21:], 0, 1)

    def internal_log(self, *msg):
        """Append ``msg`` as one line to this agent's log file."""
        log_path = f'../../logs/{self.name}.txt'
        with open(log_path, 'a+') as log_file:
            print(*msg, file=log_file)
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    """Train (or just run) a DDPG agent on the TORCS sensor state.

    Args:
        train_indicator: 1 to add exploration noise, update the networks and
            save weights; 0 to only run the current actor.

    Weights are loaded from ``actormodel.h5`` / ``criticmodel.h5`` when
    available and, in training mode, written back every third episode.
    """
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target Network HyperParameters
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    # (mu, theta, sigma) handed to OU.function for steer, accel, brake.
    ou_params = ((0.0, 0.60, 0.30), (0.5, 1.00, 0.10), (-0.1, 1.00, 0.05))

    def sensor_state(ob):
        return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                          ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0,
                          ob.rpm))

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    done = False
    step = 0
    epsilon = 1

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        # Relaunch TORCS every 3 episodes because of the memory leak error.
        ob = env.reset(relaunch=True) if np.mod(i, 3) == 0 else env.reset()
        s_t = sensor_state(ob)

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # Draw the noise terms in steer, accel, brake order.
            for idx, (mu, theta, sigma) in enumerate(ou_params):
                noise_t[0][idx] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original[0][idx], mu, theta, sigma)
            for idx in range(action_dim):
                a_t[0][idx] = a_t_original[0][idx] + noise_t[0][idx]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = sensor_state(ob)

            buff.add(s_t, a_t[0], r_t, s_t1, done)      # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states, actions, rewards, new_states, dones = (
                np.asarray([e[col] for e in batch]) for col in range(5))
            # Targets start as a separate action-shaped array and are
            # overwritten row by row below.
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                y_t[k] = (rewards[k] if dones[k]
                          else rewards[k] + GAMMA*target_q_values[k])

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if step % 30 == 0:
                print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0 and train_indicator:
            print("Now we save model")
            actor.model.save_weights("actormodel.h5", overwrite=True)
            with open("actormodel.json", "w") as outfile:
                json.dump(actor.model.to_json(), outfile)

            critic.model.save_weights("criticmodel.h5", overwrite=True)
            with open("criticmodel.json", "w") as outfile:
                json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
# num_inputs = env.observation_space.shape[0] num_inputs = 30 num_outputs = env.action_space.shape[0] model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE).to(device) if LOAD_MODEL: model.load_state_dict(torch.load(model_name)) print(model) optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE) frame_idx = 0 train_epoch = 0 best_reward = None # blockPrint() state = env.reset() early_stop = False while not early_stop: # generate trajectories after restarting everytime # state = envs.reset() log_probs, values, states, actions, rewards, masks, next_state = genenerateSeries( env, model) # log_probs,values,states,actions,rewards,masks,next_state = genenerateParallel(envs, model, state) frame_idx += PPO_STEPS next_state = torch.FloatTensor(next_state).to(device) _, next_value = model(next_state) returns = compute_gae(next_value, rewards, masks, values)
def playGame(train_indicator=0):    #1 means Train, 0 means simply Run
    """Run (or train) a DDPG agent on TORCS with a stochastic brake.

    Args:
        train_indicator: 1 to add exploration noise, update the networks and
            save weights every third episode; 0 to only run the actor.

    On roughly one step in ten the brake noise is redrawn with a higher
    mean. No weights are loaded here, so the actor starts from its initial
    parameters.
    """
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target Network HyperParameters
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    done = False
    step = 0
    epsilon = 1

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # Relaunch TORCS every 3 episodes because of the memory leak.
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                         ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                         ob.rpm))
        # Parenthesised single-argument form prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print(ob.track)

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake: about 10% of
            # the time the brake noise is redrawn around a positive mean.
            if random.random() <= 0.1:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00,
                                              0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                              ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)    # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # Action-shaped placeholder; every row is overwritten below.
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=0):    #1 means Train, 0 means simply Run
    """Run a trained DDPG actor in TORCS under a list of steering attacks.

    Args:
        train_indicator: 1 to add exploration noise and save weights every
            third episode; 0 to only run the actor.

    Each episode applies one entry of ``attacks`` (``[step, steering]``): at
    the given step the steering command is replaced by the given value. An
    episode ends on ``done``, when rpm drops to 0.09426 or below after step
    20 (penalised with -1000), or after step 201. One CSV line per episode
    is appended to ``logs/attack_<model_name>.csv``.
    """
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    TAU = 0.001     # Target Network HyperParameters
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 24  # of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    done = False
    step = 0
    epsilon = 1.0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except Exception:
        # Best effort: fall back to the freshly initialised networks.
        print("Cannot find the weight")

    plt.figure()
    model_name = "sample_v0"

    print("TORCS Experiment Start.")

    # One attack per episode: steering forced to -1.0 .. -0.1 at step 77.
    attacks = [[77, i / 10.0] for i in range(-10, 0)]

    # Never run more episodes than there are attacks. Indexing attacks[i]
    # for i up to episode_count raised IndexError on the 11th episode, so
    # env.end() below was never reached.
    for i in range(min(episode_count, len(attacks))):
        attack_step, attack_value = attacks[i]

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                         ob.speedY, ob.speedZ))

        total_reward = 0.

        for j in range(max_steps):
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            # Extra throttle over the first 20 steps when training.
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5

            if j == attack_step:
                print('cp attack on {} with {}'.format(attack_step,
                                                       attack_value))
                a_t[0][0] = attack_value

            ob, r_t, done, info = env.step(a_t[0])
            # Single-argument print: valid on both Python 2 and Python 3.
            print("step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2]))

            # Treat a stalled engine after the start phase as a failure.
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))

            buff.add(s_t, a_t[0], r_t, s_t1, done)    # Add replay buffer

            total_reward += r_t
            s_t = s_t1

            step += 1
            if done:
                break

            # Cap the attack episodes shortly after the attack window.
            if j > 200:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)), overwrite=True)
                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)), overwrite=True)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

        s = "{},{},{:.3f},{},{}\n".format(i, j, total_reward, attack_step,
                                          attack_value)
        with open('logs/attack_{}.csv'.format(model_name), 'a') as the_file:
            the_file.write(s)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
(1 - self.tau) * \ critic_target_weights[i] self.critic_target.set_weights(critic_target_weights) agent = DDPGAgent() env = TorcsEnv(vision=False, throttle=True, gear_change=False) print('testing sample agent on torcs') global_step = 0 for e in range(2000): step = 0 score = 0 if e % 5 == 0: observe = env.reset(relaunch=True) print("Now we save model") agent.actor.save_weights("ddpg_actor.h5", overwrite=True) agent.critic.save_weights("ddpg_critic.h5", overwrite=True) else: observe = env.reset() # get necessary information from the observation state = np.hstack((observe.angle, observe.track, observe.trackPos, observe.speedX, observe.speedY, observe.speedZ, observe.wheelSpinVel / 100.0, observe.rpm)) done = False while not done: step += 1 global_step += 1
def playGame(checkpoints=None, train_indicator=1, eps=1.0):    #1 means Train, 0 means simply Run
    """Train (or run) a vision-based DDPG agent on TORCS.

    Args:
        checkpoints: Suffix of the weight files to load
            (``actormodel_<checkpoints>.h5`` / ``criticmodel_<checkpoints>.h5``).
        train_indicator: 1 to add exploration noise, update the networks
            (once the replay buffer holds at least 1000 samples) and save
            weights every third episode; 0 to only run the actor.
        eps: Initial exploration scale; it decays by ``1 / EXPLORE`` per step.
    """
    BUFFER_SIZE = 40000
    BATCH_SIZE = 16
    GAMMA = 0.99
    TAU = 0.001     # Target Network HyperParameters
    LRA = 0.01      # Learning rate for Actor
    LRC = 0.05      # Learning rate for Critic

    vision = True

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = (64, 64, 3) if vision else 29  # image shape / sensor count

    np.random.seed(1337)

    EXPLORE = 1000000.
    episode_count = 2000
    max_steps = 8000000
    done = False
    step = 0
    epsilon = eps

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    summary_writer = tf.train.SummaryWriter('logs', graph_def=sess.graph_def)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA,
                         vision, summary_writer)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC,
                           vision)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer
    history = History()

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    log_file = open('train_log.log', 'w')

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel_{}.h5".format(checkpoints))
        # ``.format`` was misspelled here; the resulting AttributeError was
        # swallowed below, so the critic and both target networks were
        # never restored.
        critic.model.load_weights("criticmodel_{}.h5".format(checkpoints))
        actor.target_model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.target_model.load_weights(
            "criticmodel_{}.h5".format(checkpoints))
        print("Weight load successfully")
    except Exception:
        # Best effort: keep the freshly initialised networks.
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    max_reward = 0
    min_reward = 0

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # Relaunch TORCS every 3 episodes because of the memory leak.
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        if vision:
            history.fill((ob.img))
            s_t = history.get()
        else:
            s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                             ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                             ob.rpm))

        total_reward = 0.

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            if vision:
                a_t_original = actor.model.predict(
                    s_t.reshape((-1, ) + state_dim))
            else:
                a_t_original = actor.model.predict(
                    s_t.reshape(1, s_t.shape[0]))

            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.30, 0.30)
            # Throttle always gets a constant +0.1 on top of the noise.
            noise_t[0][1] = 0.1 + train_indicator * max(
                epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            if vision:
                last_s_t = history.get().copy()
                history.add((ob.img))
                next_s_t = history.get().copy()
                # Only every fourth transition is stored in vision mode.
                if np.mod(step, 4) == 0:
                    buff.add(last_s_t, a_t[0], r_t, next_s_t, done)
                s_t1 = history.get()
            else:
                s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                                  ob.speedY, ob.speedZ,
                                  ob.wheelSpinVel / 100.0, ob.rpm))
                buff.add(s_t, a_t[0], r_t, s_t1, done)

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # Action-shaped placeholder; every row is overwritten below.
            y_t = np.asarray([e[1] for e in batch])

            if vision:
                target_q_values = critic.target_model.predict([
                    new_states.reshape((-1, ) + state_dim),
                    actor.target_model.predict(new_states).reshape(
                        (-1, ) + (action_dim, ))
                ])
            else:
                target_q_values = critic.target_model.predict(
                    [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator and buff.count() >= 1000:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodel_{}.h5".format(i),
                                         overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel_{}.h5".format(i),
                                          overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        max_reward = max(max_reward, total_reward)
        min_reward = min(min_reward, total_reward)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward) + " EPS " + str(epsilon))
        print("Total Step: " + str(step) + ' Max: ' + str(max_reward) +
              ' Min: ' + str(min_reward))
        print("")

    env.end()  # This is for shutting down TORCS
    # Release the log file handle opened above.
    log_file.close()
    print("Finish.")
def playGame(train_indicator=1, safety_constrain_flag=True):  #1 means Train, 0 means simply Run
    """Run the option-level (hierarchical) controller in TORCS.

    Two pre-trained intra-option policies (overtaking / following) are each
    exposed as three options that differ only in how many primitive steps
    they run before terminating (10, 20 or 30). An ``OptionValueCritic``
    picks the option; when ``train_indicator`` is truthy the critic is
    trained from a replay buffer of option-level transitions.

    Args:
        train_indicator: truthy to train the option-value critic and record
            training curves, falsy to only run and record a test trace.
        safety_constrain_flag: forwarded unchanged to ``Low_level_controller``.
    """
    plt.ion()
    args = parser.parse_args()
    np.random.seed(1337)

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    def _flatten(ob):
        # Sensor vector (with opponents) as a single-row batch for the networks.
        vec = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
        return vec.reshape(1, vec.shape[0])

    # Define two intra-policies
    overtaking_policy = ActorNetwork(sess, args.state_size, args.action_size)
    following_policy = ActorNetwork(sess, args.state_size, args.action_size)
    try:
        overtaking_policy.model.load_weights("actormodel_overtaking.h5")
        overtaking_policy.target_model.load_weights("actormodel_overtaking.h5")
        following_policy.model.load_weights("actormodel_following.h5")
        following_policy.target_model.load_weights("actormodel_following.h5")
        print("Weight load successfully")
    except Exception:
        # Best effort: continue with freshly initialised weights.
        print("Cannot find the weight")

    # with learned following policy
    option_policies = [
        overtaking_policy, overtaking_policy, overtaking_policy,
        following_policy, following_policy, following_policy
    ]
    termination_steps = [10, 20, 30, 10, 20, 30]

    # Define option-value function Q_Omega(s,omega): estimate values upon arrival
    critic = OptionValueCritic(args.state_size, args.option_size,
                               args.discount, args.learning_rate_critic,
                               args.epsilon, args.epsilon_min,
                               args.epsilon_decay, args.tau)
    try:
        critic.load("option_value_model.h5")
        print("Critic Weight load successfully")
    except Exception:
        print("Cannot find the critic weight")

    history = np.zeros((args.nepisodes, 2))
    # Define a buffer space to store samples
    buff = ReplayBuffer(args.buffer_size)  #Create replay buffer
    # Generate a Torcs environment
    env = TorcsEnv(vision=args.vision, throttle=True, gear_change=False)
    print("TORCS Experiment Start.")

    cumreward_list = []
    average_step_reward_list = []
    damage_rate_list = []
    epsilon_list = []
    option_list = []
    trackPos_list = []
    speed_list = []
    epreward_list = []

    for episode in range(args.nepisodes):
        # Define variables to store values
        cumreward = 0.
        avgduration = 0.
        reward_option = 0
        total_options = 0
        damage_times = 0
        danger_time = 0
        collision_time = 0
        primitive_action_step = 0
        # Defined up front so the reporting below cannot hit an unbound name
        # when the step loop body never runs.
        reward_ep_per_step = 0.
        damage_rate = 0.
        step = 0

        if np.mod(episode, 3) == 0:
            #relaunch TORCS every 3 episode because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()
        state = _flatten(ob)

        for step in range(args.nsteps):
            total_options += 1
            option = critic.get_option(state, train_indicator)
            reward_option = 0
            # Remember where the option started: `state` is advanced on every
            # primitive step below, and the replay buffer must receive the
            # (start state, option, return, end state) transition.
            option_start_state = state
            for i in range(termination_steps[option]):
                primitive_action_step += 1
                action = option_policies[option].target_model.predict(state)
                print(action)
                action = Low_level_controller(action[0][0], action[0][1], ob,
                                              safety_constrain_flag)
                print("Option: {} Action:{}".format(option, action))
                ob, r_t_primitive, done, _ = env.step(action)
                # These exact reward values are counted as collision / danger
                # events for the damage-rate statistic.
                if r_t_primitive == -30.0:
                    collision_time += 1
                elif r_t_primitive == -5.0:
                    danger_time += 1
                damage_times = collision_time + danger_time
                option_list.append(option)
                trackPos_list.append(ob.trackPos)
                speed_list.append(ob.speedX)
                epreward_list.append(r_t_primitive)
                # Discounted return accumulated while the option is running.
                reward_option = reward_option + args.discount**(
                    i) * r_t_primitive
                state_ = _flatten(ob)
                state = state_
                if done:
                    break
            buff.add(option_start_state, option, reward_option, state_, done)
            cumreward += reward_option
            reward_ep_per_step = cumreward / primitive_action_step
            damage_rate = damage_times / primitive_action_step
            if done:
                break
            if train_indicator:
                batch = buff.getBatch(args.batch_size)
                critic.replay(batch)

        if episode % 10 == 0:
            critic.save("option_value_model.h5")

        if train_indicator:
            # Save the results
            cumreward_list.append(cumreward)
            average_step_reward_list.append(reward_ep_per_step)
            damage_rate_list.append(damage_rate)
            epsilon_list.append(critic.epsilon)
            sio.savemat(
                'results_both_learned.mat', {
                    'total_reward': cumreward_list,
                    'average_reward': average_step_reward_list,
                    'epsilon': epsilon_list,
                    'damage_rate': damage_rate_list
                })
        else:
            sio.savemat(
                'test1lf1r.mat', {
                    'ep_reward': epreward_list,
                    'option': option_list,
                    'trackPos': trackPos_list,
                    'speed': speed_list
                })
            print('damage rate is:', damage_rate)

        history[episode, 0] = step
        history[episode, 1] = avgduration

        # Live training curves. pyplot.hold() was removed in Matplotlib 3.0;
        # successive plot() calls already accumulate on the same axes.
        plt.figure(1)
        plt.subplot(311)
        plt.plot(episode, cumreward, 'ro')
        plt.xlabel('episode')
        plt.ylabel('Total reward per epsiode')
        plt.subplot(312)
        plt.plot(episode, cumreward / max(total_options, 1), 'bo')
        plt.xlabel('episode')
        plt.ylabel('Average reward per option')
        plt.subplot(313)
        plt.plot(episode, critic.epsilon, 'go')
        plt.xlabel('episode')
        plt.ylabel('epsilon')
        plt.draw()
        plt.show()
        plt.pause(0.001)

    env.end()  # This is for shutting down TORCS
    plt.savefig('test.png')
    print("Finish.")
def main(): global play, replay_buffer, q_act_net, q_target_net, step #sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) tstr = time.strftime('_%H_%M_%S_') first_run = False #don't use saved nets/buffer reset_buffer = True #don't use saved buffer #dqn params GAMMA = 0.99 TAU = .0001 #.0001 #exploration noise params act_noise_init = 0.5 #.75 #.25 act_noise_final = .01 # .25 act_noise_interval = 100000 rnd_range = 1 #lag augmentation packet_lost = 0 #0.01 #action smoothing augmentation lambda_spatial_q = 0 action_smoother = .33 action_limiter = .33 episode_count = 10000 max_steps = 5000 save_in_iters = 15000 #100000 #start training after accumulating train_start_num samples train_start_num = 1 * BATCH_SIZE caffe.set_mode_gpu() caffe.set_device(0) #balance track samples in replay buffer track_balance = .9 #n-steps dqn, steps=max_n_batches*batch size max_n_batches = 16 # use n-steps dqn for this ratio, rest 1-step n_step_ratio = .75 #priority buffer k_priority_try = 2 if n_step_ratio > 0 and max_n_batches > 0: n_steps_dqn = True else: n_steps_dqn = False #average speed c_speed = 35 # 50 #speed variance delta_speed = 5 # 7.5 #switch speed in swith_count_min = 50 #target frame time frame_rate = .4 #fail if lag is more then t_delta_fail = frame_rate * 1.75 #for rebound handling rebound_count_max = 5 start_run = 50 # for error handling after_start_check = 100 max_errors = 50 #solver for current net if not play: critic_solver = caffe.get_solver( current_dir + 'resnet_torcs/dqn_critic_solver.prototxt') if first_run: # target net: q_target_net = caffe.Net( current_dir + 'resnet_torcs/critic_batch_dqn.prototxt', current_dir + 'r18nb.caffemodel', caffe.TEST) # current net: q_act_net = caffe.Net( current_dir + 'resnet_torcs/critic_deploy_dqn.prototxt', caffe.TEST) if not play: ParamCopy(critic_solver.net.params, q_target_net.params) ParamCopy(q_act_net.params, q_target_net.params) else: #target net: q_target_net = caffe.Net( current_dir + 
'resnet_torcs/critic_batch_dqn.prototxt', 'qq_target.caffemodel', caffe.TEST) #current net: q_act_net = caffe.Net( current_dir + 'resnet_torcs/critic_deploy_dqn.prototxt', 'q_solver.caffemodel', caffe.TEST) if not play: ParamCopy(critic_solver.net.params, q_act_net.params) if not play and not reset_buffer: print 'loading replay_buffer buffer' replay_buffer = load_replay() replay_buffer.size_reduce(BUFFER_SIZE) print 'replay_buffer buffer loaded' print 'models loaded ***************************' if not play: assert q_target_net.blobs['state'].data.shape[0] == BATCH_SIZE assert q_act_net.blobs['state'].data.shape[0] == 1 assert critic_solver.net.blobs['state'].data.shape[0] == BATCH_SIZE assert q_target_net.blobs['state'].data.shape[1] == CHANNELS assert q_act_net.blobs['state'].data.shape[1] == CHANNELS assert critic_solver.net.blobs['state'].data.shape[1] == CHANNELS assert q_target_net.blobs['q_action'].data.shape[1] == DISCR_A assert q_act_net.blobs['q_action'].data.shape[1] == DISCR_A assert critic_solver.net.blobs['q_action'].data.shape[1] == DISCR_A max_reached_step = 150 #used for track balance images_history = [] #used for input image step = 0 #total number of simulation steps save_count = 0 #used for saving nets/buffer n_batch = 0 #used for n-steps q_loss = 0 #main loss # Generate a Torcs environment env = TorcsEnv(vision=True, throttle=False, observer=False) time_start = time.time() track_id = 0 #track #n-step temp vars n_steps_cont_from_prev = False prev_start_pos = -1 prev_track_id = -1 Qlast = -1 episod_steps = 0 n_steps_used = 0 batches_used = 0 #for error failure rest_fail = 0 rebound_events = 0 for i in range(episode_count): #balance tracks if episod_steps >= max_reached_step * track_balance: track = t_list[track_id] change_track( "/usr/local/share/games/torcs/config/raceman/quickrace.xml", track) print "Track: ", track, "track_id", track_id episod_steps = 0 print("Episode : " + str(i)) ob = env.reset(relaunch=True) s_t = None #input image 
total_reward = 0. #for randomizing velocity switch_count = swith_count_min + random.randint(0, swith_count_min) #for handling out-of-lane rebound = False rebound_count = 0 track_pos = 0 error_count = 0 act_prev = np.array([0.]) t_delta = 0 for j in range(max_steps): max_reached_step = max(max_reached_step, j) a_t = np.array([0.]) #action skip_state = False error_present = False #exploration noise params act_noise = act_noise_init + ( act_noise_final - act_noise_init) * min( step * 1. / act_noise_interval, 1.) rnd_noise = 1 if rnd_range > 1: rnd_noise = int( (rnd_range + 1) * max(1., float(act_noise_interval - step) / act_noise_interval)) #get action ======================================================= if s_t is None: action_index = random.randrange(DISCR_A) print '----------Random Action---------- action_index', action_index a_t[0] = ind2a(action_index, DISCR_A, DELTA_A) else: a_t[0] = qchoice(q_act_net, s_t, CHANNELS, DISCR_A, DELTA_A) #apply exploration noise if not play and random.random() <= act_noise: ind = a2ind(a_t[0], DISCR_A, DELTA_A) r = 1 if rnd_noise > 1: r = randint(1, rnd_noise) ind += randint(-r, r) ind = min(max(ind, 0), DISCR_A - 1) a_t[0] = ind2a(ind, DISCR_A, DELTA_A) #if still no action use random if a_t is None: action_index = random.randrange(DISCR_A) print 'rnd action_index', action_index a_t[0] = ind2a(action_index, DISCR_A, DELTA_A) #starting area if j < start_run: a_t[0] = 0 #action limiter if not play and abs( a_t[0]) > DELTA_A / 2 and random.random() < action_limiter: ind = a2ind(a_t[0], DISCR_A, DELTA_A) dind = ind - DISCR_A / 2 if dind > (DISCR_A - 1) / 4: dind = (DISCR_A - 1) / 4 if dind < -(DISCR_A - 1) / 4: dind = -(DISCR_A - 1) / 4 a_t[0] = ind2a(dind + DISCR_A / 2, DISCR_A, DELTA_A) #save action a_0_list.append(a_t) #fail on render delay if not play and t_delta > t_delta_fail and i > rest_fail + 10 and j >= after_start_check: error_present = True if error_count >= max_errors / 2: print 'delta fail **************************' 
rest_fail = i break else: error_count += 1 #randomize speed if (j % switch_count and not play) == 0: tag_speed_rnd = c_speed - delta_speed + random.uniform( 0, delta_speed * 2) else: tag_speed_rnd = c_speed #render delay compensation if t_delta > frame_rate: tag_speed = frame_rate / t_delta * tag_speed_rnd else: tag_speed = tag_speed_rnd #handle out-of-lane event if rebound: rebound_count = rebound_count_max else: rebound_count = max(0, rebound_count - 1) if (rebound_count > rebound_count_max / 2 and abs(track_pos) > .7) or rebound: angle = -observation.angle if angle * track_pos > 0 and abs(angle) > .2: a_t[0] = -sign(track_pos) * 4 * DELTA_A / 5 if angle * track_pos > 0 and abs(angle) <= .2: a_t[0] = -sign(track_pos) * 2 * DELTA_A / 5 if angle * track_pos < 0 and abs(angle) <= .15: a_t[0] = -sign(track_pos) * DELTA_A / 5 if angle * track_pos < 0 and abs(angle) > .15: a_t[0] = 0 if angle * track_pos < 0 and abs(angle) >= .35: a_t[0] = sign(track_pos) * DELTA_A / 5 tag_speed = min(tag_speed, 20) print "############ rebound, action", a_t[ 0], "V angle", angle, "###############" #smooth action if not play and action_smoother > 0 and random.random( ) < action_smoother: ind_prev = a2ind(act_prev[0], DISCR_A, DELTA_A) ind = a2ind(a_t[0], DISCR_A, DELTA_A) if abs(ind - ind_prev) > 1: print "smooth ind", ind, "->", np.rint(.5 * (ind_prev + ind)) ind = int(.5 * (ind_prev + ind)) a_t[0] = ind2a(ind, DISCR_A, DELTA_A) a_act = a_t #lag augemntaion if not play and random.random < packet_lost and t_delta < frame_rate: a_act = act_prev #===================== main enviroment step ========================================= obs0 = time.time() prev_rebound = rebound observation, r_t, done, rebound, _ = env.step(a_act, tag_speed) curr_time = time.time() t_delta = curr_time - time_start time_start = curr_time #==================================================================================== if rebound and not prev_rebound: rebound_events += 1 print 't_delta', t_delta, "step", j, 
"step time", curr_time - obs0, "tag_speed_rnd", tag_speed_rnd, "rebound_events", rebound_events if rebound: r_t = 0 if prev_rebound and r_t == 0: skip_state = True #speed failure, could be moved to gym_torcs if observation.speedX < .01 and j >= after_start_check and t_delta < t_delta_fail: skip_state = True error_present = True r_t = 0 if error_count >= max_errors: print 'speed too slow fail, speed', 300 * observation.speedX, '**************************' break else: error_count += 1 #make state ======================================================== image = observation.img images_history.append(image) while len(images_history) > CHANNELS + 1: images_history.pop(0) s_t1 = make_state(images_history, CHANNELS) track_pos = observation.trackPos #save stat reward_list.append(r_t) track_list.append(track_pos) yspeed_list.append(observation.speedY) #store data into replay buffer ====================================== do_store = not play and s_t is not None and s_t1 is not None and not skip_state if do_store: print 'add data, action', a_t[0], 'reward ', r_t w_p = j replay_buffer.add(s_t, a_t, r_t, s_t1, done, w_p, track_id, -1, -1) print '***** stored: track_pos', track_pos, 'angle', observation.angle,\ 'max_step', max_reached_step, 'Episode', i elif not play: print 'skipped state track_pos', track_pos, 'angle', observation.angle,\ 'max_step', max_reached_step, 'Episode', i #training ====================================== if not play and replay_buffer.num_experiences > train_start_num: #get batch using n-steps if previous batch was using n-step use_n_steps_now = n_steps_dqn if n_batch >= max_n_batches: use_n_steps_now = False n_batch = 0 if n_steps_cont_from_prev and use_n_steps_now and max_n_batches > 1: assert prev_start_pos >= 0 batch, n_steps_collected, prev_start_pos, prev_track_id =\ replay_buffer.getBatch4Pos(BATCH_SIZE, prev_start_pos, prev_track_id) n_step_continued = n_steps_collected else: n_step_continued = False if n_steps_used >= n_step_ratio * batches_used 
and not n_step_continued: use_n_steps_now = False #get batch if previous batch was *not* using n-step if not n_step_continued: batch, n_steps_collected, prev_start_pos, prev_track_id =\ replay_buffer.getBatch(BATCH_SIZE, max_n_batches, k_priority_try, n_steps=use_n_steps_now) #net training ============= q_loss, Qlast = train_on_batch(batch, q_target_net, critic_solver, DISCR_A, DELTA_A, BATCH_SIZE, GAMMA, n_steps_collected, n_step_continued, Qlast, lambda_spatial_q) #update n-step vars if n_steps_collected: n_batch += 1 n_steps_used += 1 else: n_batch = 0 batches_used += 1 n_steps_cont_from_prev = n_steps_collected and prev_start_pos >= 0 # target update ============== SoftUpdate(q_target_net.params, critic_solver.net.params, TAU) ParamCopy(q_act_net.params, critic_solver.net.params) save_count += 1 #save loss if not play: q_loss_list.append(q_loss) #update local vars s_t = s_t1 act_prev = a_t if done: s_t = None if not error_present: error_count = max(0, error_count - 1) total_reward += r_t episod_steps += 1 step += 1 if done: break #save nets and buffer if not play and save_count >= save_in_iters: print "start save", save_count, step save_count = 0 save_nets(q_act_net, q_target_net, step, replay_buffer) save_state(a_0_list, a_1_list, q_loss_list, reward_list, track_list, yspeed_list, tstr + str(step)) track_id = (track_id + 1) % len(t_list) print("TOTAL REWARD @ " + str(i) + " -th Episode : " + str(total_reward)) print("Total Step: " + str(step)) print("") print("Finishing torcs.") env.end() # This is for shutting down TORCS #save nets and buffer if not play: save_state(a_0_list, a_1_list, q_loss_list, reward_list, track_list, yspeed_list, tstr + str(step)) save_nets(q_act_net, q_target_net, step, replay_buffer, "_finished") print 'Finish'
def _connect_client(env, port, failure_message=None):
    """Retry until a snakeoil3 client connects and yields a first observation.

    Returns ``(client, ob)``. ``failure_message``, when given, is printed on
    every failed attempt.
    """
    while True:
        try:
            client = snakeoil3.Client(p=port, vision=False)  # Open new UDP in vtorcs
            client.MAX_STEPS = np.inf
            client.get_servers_input(0)  # Get the initial input from torcs
            obs = client.S.d  # Get the current full-observation from torcs
            ob = env.make_observation(obs)
        except Exception:
            # Exception (not a bare except) so Ctrl-C can still stop the loop.
            if failure_message is not None:
                print(failure_message)
            continue
        if ob is not None:
            return client, ob


def _sensor_state(ob):
    """Flatten an observation (including opponents) into one state vector."""
    return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                      ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm,
                      ob.opponents))


def playGame(f_diagnostics, train_indicator, agent, port=3101):  # 1 means Train, 0 means simply Run
    """Drive ``agent`` in TORCS through a snakeoil3 client on ``port``.

    Args:
        f_diagnostics: accepted for interface compatibility; not used here.
        train_indicator: 1 to act with exploration noise, store transitions
            via ``agent.perceive`` and periodically save the network / CSV
            analysis; 0 to run the deterministic policy only.
        agent: object providing ``noise_action``, ``action``, ``perceive``
            and ``saveNetwork``.
        port: UDP port of the TORCS server to connect to.

    Episode count, step limit and the epsilon schedule come from the
    module-level ``max_eps``, ``max_steps_eps``, ``total_explore`` and
    ``epsilon_start``.
    """
    # Generate a Torcs environment
    print("I have been asked to use port: ", port)
    env = TorcsEnv(vision=False, throttle=True, gear_change=False, main=1)
    client, ob = _connect_client(env, port)
    s_t = _sensor_state(ob)

    EXPLORE = total_explore
    episode_count = max_eps
    max_steps = max_steps_eps
    epsilon = epsilon_start
    done = False
    epsilon_steady_state = 0.01  # This is used for early stopping.
    totalSteps = 0
    running_avg_reward = 0.
    print("TORCS Experiment Start.")

    for i in range(episode_count):
        early_stop = 1
        total_reward = 0.
        info = {'termination_cause': 0}
        distance_traversed = 0.
        print('\n\nStarting new episode...\n')
        print("Initial memory consumption: ")

        for step in range(max_steps):
            # Take noisy actions during training
            if (train_indicator == 1):
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, epsilon_steady_state)
                a_t = agent.noise_action(s_t, epsilon)
            else:
                a_t = agent.action(s_t)
            try:
                ob, r_t, done, info = env.step(step, client, a_t, early_stop)
                if done:
                    break
                analyse_info(info, printing=False)
                s_t1 = _sensor_state(ob)
                distance_traversed += ob.speedX * np.cos(
                    ob.angle)  #Assuming 1 step = 1 second
                if (math.isnan(r_t)):
                    r_t = 0.0
                    for bad_r in range(50):
                        print('Bad Reward Found')
                    break  #Introduced by Anirban
                # Add to replay buffer only if training
                if (train_indicator):
                    agent.perceive(s_t, a_t, r_t, s_t1,
                                   done)  # Add experience to replay buffer
            except Exception as e:
                # The simulator connection failed mid-step: reconnect and
                # skip this step.
                print("Exception caught at port " + str(port) + " (episode " +
                      str(i) + "): " + str(e))
                client, ob = _connect_client(env, port)
                continue
            total_reward += r_t
            s_t = s_t1
            # Displaying progress every 15 steps.
            if ((np.mod(step, 15) == 0)):
                print("Episode", i, "Step", step, "Epsilon", epsilon,
                      "Action", a_t, "Reward", r_t)
            totalSteps += 1
            if done:
                break

        running_avg_reward = running_average(running_avg_reward, i + 1,
                                             total_reward)
        if train_indicator == 1:
            #Save network after every 20 episodes and store the data
            if np.mod(i, 20) == 0:
                agent.saveNetwork(i)
        #Saving training data for client for analysis
        if train_indicator == 1 and np.mod(i, 5) == 0:
            # Context manager closes the file here; no trailing close() is
            # needed (the old one failed when the file was never opened).
            with open(str(port) + ".csv", "a+") as f1:
                client.printAnalysis(f1, i)

        print("TOTAL REWARD @ " + str(i) +"-th Episode : Num_Steps= " + str(step) + "; Max_steps= " \
            + str(max_steps) +"; Reward= " + str(total_reward) + \
            "; Running average reward= " + str(running_avg_reward))
        print("Total Step: " + str(totalSteps))
        print("")
        print(info)

        try:
            if 'termination_cause' in info.keys(
            ) and info['termination_cause'] == 'hardReset':
                print('Hard reset by some agent')
            # Both the hard-reset and the normal path relaunch the simulator.
            ob, client = env.reset(client=client, relaunch=True)
        except Exception as e:
            print("Exception caught at point B at port " + str(port) +
                  " (episode " + str(i) + "): " + str(e))
            client, ob = _connect_client(
                env, port,
                failure_message="Exception caught at at point C at port " +
                str(port) + " (episode " + str(i) + "): " + str(e))
        s_t = _sensor_state(ob)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def sample_path(self, num_episodes=None):
    """
    MODIFIED FOR TORCS! Sample paths from a freshly created TORCS environment.

    Args:
        num_episodes: the number of episodes to be sampled.
            If falsy, sample until one batch of steps has been collected
            (size given by ``self.config.batch_size``).

    Returns:
        paths: a list of paths. Each path is a dictionary with
            path["observation"] a numpy array of ordered observations
            path["action"] a numpy array of the corresponding actions
            path["reward"] a numpy array of the corresponding rewards
        episode_rewards: total reward of every episode that ended by
            ``done`` or by reaching ``self.config.max_ep_len``
        episode_roll_distances: ``env.distance_travelled`` recorded at the
            end of each of those episodes

    The environment is always shut down, even if sampling raises.
    """

    def _to_vector(ob):
        # Policy input: track range sensors followed by the three speeds.
        return np.concatenate(
            [ob.track,
             np.array([ob.speedX, ob.speedY, ob.speedZ])], axis=0)

    episode = 0
    episode_rewards = []
    episode_roll_distances = []
    paths = []
    t = 0
    i = 0
    print()  # blank separator line (a bare `print` is a no-op in Python 3)
    print("TORCS Experiment Start".center(80, '='))
    env = TorcsEnv(vision=self.config.vision, throttle=self.config.throttle)
    print('Using a batch size of: ', self.config.batch_size)
    try:
        while (num_episodes or t < self.config.batch_size):
            i += 1
            print('t', t, 'i', i)
            #Avoid a memory leak in TORCS by relaunching
            # NOTE(review): this relaunches on every episode *except* each
            # 10th one -- confirm that is intended rather than the reverse.
            if np.mod(i, 10) == 0:
                state = env.reset()
            else:
                state = env.reset(relaunch=True)
            state = _to_vector(state)
            states, actions, rewards = [], [], []
            episode_reward = 0
            for step in range(self.config.max_ep_len):
                states.append(state)
                action = self.sess.run(
                    self.sampled_action,
                    feed_dict={
                        self.observation_placeholder:
                        np.reshape(states[-1], [1, self.observation_dim])
                    })[0]
                state, reward, done, info = env.step(action)
                state = _to_vector(state)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                t += 1
                if (done or step == self.config.max_ep_len - 1):
                    episode_rewards.append(episode_reward)
                    episode_roll_distances.append(env.distance_travelled)
                    break
                # Batch mode: stop mid-episode once enough steps are gathered.
                if (not num_episodes) and t == self.config.batch_size:
                    break
            path = {
                "observation": np.array(states),
                "reward": np.array(rewards),
                "action": np.array(actions)
            }
            paths.append(path)
            episode += 1
            if num_episodes and episode >= num_episodes:
                break
    finally:
        env.end()  # This is for shutting down TORCS
        print("Finished TORCS session".center(80, '='))
    return paths, episode_rewards, episode_roll_distances
# Fragment of an agent-driving routine: runs `episode_count` episodes of at
# most `max_steps` steps each with a steering-only Agent.
# NOTE(review): `reward`, `vision`, `episode_count` and `max_steps` are read
# here but not assigned in this fragment -- presumably set earlier; confirm.
done = False
step = 0  # total steps taken across all episodes
# Generate a Torcs environment
env = TorcsEnv(vision=vision, throttle=False)
agent = Agent(1)  # steering only
print("TORCS Experiment Start.")
for i in range(episode_count):
    print("Episode : " + str(i))
    if np.mod(i, 3) == 0:
        # Sometimes you need to relaunch TORCS because of the memory leak error
        ob = env.reset(relaunch=True)
    else:
        ob = env.reset()
    total_reward = 0.
    for j in range(max_steps):
        # The agent sees the previous step's reward and done flag.
        action = agent.act(ob, reward, done, vision)
        ob, reward, done, _ = env.step(action)
        #print(ob)
        total_reward += reward
        step += 1
        if done:
            break
def main():
    """Adversarial imitation training of the PPO agent in TORCS.

    Each outer episode collects up to ``MAX_STEP`` environment steps with the
    current policy, trains the discriminator on expert vs. agent
    (state, action) pairs, then updates PPO using the discriminator's rewards
    instead of the environment reward. ``MAX_STEP`` is shortened
    (906 -> 604 -> 302) once the environment score has exceeded a threshold
    often enough.
    """
    # Load expert data
    expert_states = np.load('./expert_state.npy')
    expert_actions = np.load('./expert_action.npy')

    # Env, model load
    env = TorcsEnv(vision=False, throttle=True, text=True, gear_change=False)
    ppo = PPOAgent()
    D = Discriminaor()
    saver = tf.train.import_meta_graph(
        './save_model10/max_score/max_model_111352.8971768222.ckpt.meta')
    score_buf, graph_d_reward = [], []
    MAX_STEP = 906

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(
            sess, './save_model10/max_score/max_model_111352.8971768222.ckpt')
        max_score, ep_score, max_ep_score, change_count = 0, 0, 0, 0

        for ep in range(NUM_EPISODE):
            action_buf, state_buf, reward_buf = [], [], []
            step, score = 0, 0
            done = False
            ep_score = 0
            # Relaunch every 20 episodes because of TORCS memory growth
            if np.mod(ep, 20) == 0:
                obs = env.reset(relaunch=True)
            else:
                obs = env.reset()
            state = convert_obs(obs)

            while not (step == MAX_STEP):
                if done:
                    # The simulator episode ended inside the collection
                    # window: optionally checkpoint, then restart it.
                    print('\nDone: {0}\n'.format(ep_score))
                    if ep_score > max_ep_score and ep_score > 25000 and MAX_STEP == 302:
                        max_score = score
                        saver.save(
                            sess, './save_model20/per_episode/epMAX_' +
                            str(step) + '_' + str(ep_score) + '.ckpt')
                        print(
                            '\n########## update max score and save model #########\n'
                        )
                    obs = env.reset()
                    state = convert_obs(obs)
                    max_ep_score = ep_score
                    ep_score = 0
                step += 1
                action = ppo.choose_action(state)
                next_obs, reward, done, _ = env.step(action)
                state_buf.append(state)
                action_buf.append(action)
                reward_buf.append(reward)
                score += reward
                ep_score += reward
                next_state = convert_obs(next_obs)
                state = next_state
                print('\r{}/{}'.format(step, MAX_STEP), flush=True, end='')
            score_buf.append(score)

            agent_s = np.vstack(state_buf)
            agent_a = np.vstack(action_buf)

            # Discriminator Train
            for _ in range(2):
                # Sample a contiguous expert window as long as the rollout.
                start_idx = np.random.randint(low=0, high=4799)
                start_idx = start_idx * 280
                sampled_expert_s = expert_states[start_idx:start_idx +
                                                 len(state_buf), :]
                sampled_expert_a = expert_actions[start_idx:start_idx +
                                                  len(action_buf), :]
                D.train(expert_s=sampled_expert_s,
                        expert_a=sampled_expert_a,
                        agent_s=agent_s,
                        agent_a=agent_a)

            d_rewards = D.get_rewards(agent_s=agent_s, agent_a=agent_a)
            # np.asscalar was removed from NumPy; .item() is what it called.
            d_reward_buf = [r.item() for r in d_rewards]
            graph_d_reward.append(sum(d_reward_buf))

            # Bootstrap from the value estimate unless the rollout ended on a
            # terminal step, then accumulate discounted returns backwards.
            if done:
                last_value = 0.0
            else:
                last_value = ppo.get_value(next_state)
            discounted_reward = []
            for r in d_reward_buf[::-1]:
                last_value = r + GAMMA * last_value
                discounted_reward.append(last_value)
            discounted_reward.reverse()

            batch_discount_reward = np.array(discounted_reward)[:, np.newaxis]
            ppo.update(agent_s, agent_a, batch_discount_reward)

            if ep % 50 == 0 and ep > 0:
                # Environment-score curve
                fig = plt.figure(figsize=(16, 8))
                plt.xlabel('EP')
                plt.ylabel('SCORE')
                plt.plot(list(range(len(score_buf))),
                         score_buf,
                         c='r',
                         lw=1,
                         ls='-')
                fig.savefig('./save_model20/graph/env_reward_graph.png')
                fig.clear()
                plt.clf()
                # Discriminator-reward curve
                fig = plt.figure(figsize=(16, 8))
                plt.xlabel('EP')
                plt.ylabel('SCORE')
                plt.plot(list(range(len(graph_d_reward))),
                         graph_d_reward,
                         c='b',
                         lw=1,
                         ls='-')
                fig.savefig('./save_model20/graph/d_reward_graph.png')
                fig.clear()
                plt.clf()
                print('\n@@@@@@@@@@ save model(per 200 ep) @@@@@@@@@@\n')

            print(
                '\nEp: {0}\tScore(Env): {1:.6}\tReward(D): {2:.6}\tStep: {3}\n'
                .format(ep, score, sum(d_reward_buf), step))

            # Shorten the collection window after enough high-scoring rollouts.
            if score > 90000 and MAX_STEP == 906:
                change_count += 1
                if change_count == 100:
                    MAX_STEP = 604
                    ep_score, max_ep_score = 0, 0
            elif score > 60000 and MAX_STEP == 604:
                change_count += 1
                if change_count == 200:
                    MAX_STEP = 302
                    ep_score, max_ep_score = 0, 0

    os.system('pkill torcs')
def img_reshape(input_img):
    """Return ``input_img`` with axis 0 moved last, flipped top-to-bottom and
    reshaped to a single-image batch of shape ``(1,) + img_dim``."""
    axis0_last = np.transpose(input_img, (1, 2, 0))
    upside_down = np.flipud(axis0_last)
    batch_shape = (1, img_dim[0], img_dim[1], img_dim[2])
    return np.reshape(upside_down, batch_shape)


# Empty accumulators for the aggregated dataset.
images_all = np.zeros((0, img_dim[0], img_dim[1], img_dim[2]))
actions_all = np.zeros((0, action_dim))
rewards_all = np.zeros((0, ))

# Per-step records gathered while the teacher drives.
img_list, action_list, reward_list = [], [], []

env = TorcsEnv(vision=True, throttle=False)
ob = env.reset(relaunch=True)

print('Collecting data...')
for i in range(steps):
    # The very first action is neutral; afterwards the teacher decides.
    act = np.array([0.0]) if i == 0 else get_teacher_action(ob)
    if i % 100 == 0:
        print(i)
    ob, reward, done, _ = env.step(act)
    img_list.append(ob.img)
    action_list.append(act)
    reward_list.append(np.array([reward]))
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    """Run (and optionally train) a DDPG agent in TORCS, logging every step
    of each episode to ``practise_progress/<timestamp>/ Episode <i>.csv``.

    Args:
        train_indicator: truthy to add exploration noise and update the
            actor/critic networks; falsy to only run the current policy.
    """
    BUFFER_SIZE = 100000  # replay buffer capacity
    BATCH_SIZE = 32  # samples per training batch
    GAMMA = 0.99  # discount factor
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Lerning rate for Critic
    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input
    np.random.seed(1337)  # fixed seed so runs are repeatable
    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    done = False
    step = 0
    epsilon = 1

    def _sensor_state(ob):
        # Flatten the sensor readings into the state vector.
        return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                          ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                          ob.rpm))

    # Tensorflow GPU memory policy: allocate on demand
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Hard-limit GPU memory use to 0.4
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except Exception:
        # Best effort: continue with freshly initialised weights.
        print("Cannot find the weight")

    theTime = datetime.datetime.now()  # current system time
    theTime = theTime.strftime('%y-%m-%d_%H:%M:%S')  # used in the log folder name
    folder_path = "practise_progress/" + theTime + "/"  # Linux only
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("folder created")
    else:
        print("folder existed")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episode because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()
        s_t = _sensor_state(ob)
        total_reward = 0.

        csvfileHeader = "practise_progress/" + theTime + "/" + " Episode " + str(i) + ".csv"
        fileHeader = ["Step", "TrackPos", "SpeedX", "SpeedY", "SpeedZ",
                      "Action_Steering", "Action_Acceleration", "Action_Brake",
                      "Reward", "Loss"]
        # `with` guarantees the log is closed however the episode ends;
        # previously it was closed only when the simulator reported `done`.
        with open(csvfileHeader, "w") as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(fileHeader)

            for j in range(max_steps):
                loss = 0
                epsilon -= 1.0 / EXPLORE
                a_t = np.zeros([1, action_dim])
                noise_t = np.zeros([1, action_dim])

                a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
                # Exploration noise, scaled by the decaying epsilon and
                # disabled when not training.
                noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
                noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
                noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

                a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
                a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
                a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

                ob, r_t, done, info = env.step(a_t[0])
                s_t1 = _sensor_state(ob)
                buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

                # Do the batch update
                batch = buff.getBatch(BATCH_SIZE)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                # Built from the actions only to obtain an array of the right
                # shape; every row is overwritten with the target below.
                y_t = np.asarray([e[1] for e in batch])

                target_q_values = critic.target_model.predict(
                    [new_states, actor.target_model.predict(new_states)])
                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                if (train_indicator):
                    loss += critic.model.train_on_batch([states, actions], y_t)
                    a_for_grad = actor.model.predict(states)
                    grads = critic.gradients(states, a_for_grad)
                    actor.train(states, grads)
                    actor.target_train()
                    critic.target_train()

                total_reward += r_t
                s_t = s_t1

                # Logged row: global step, track position, X/Y/Z speed
                # (scaled by 300), the three action outputs, reward, loss.
                csvData = [step, ob.trackPos, ob.speedX * 300, ob.speedY * 300,
                           ob.speedZ * 300, a_t[0, 0], a_t[0, 1], a_t[0, 2],
                           r_t, loss]
                writer.writerow(csvData)
                print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

                step += 1
                if done:
                    break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run
    """Drive TORCS with a previously trained actor network (no learning).

    Builds the actor, tries to restore its weights from ``actormodel.h5``
    and then lets the policy drive for ``episode_count`` episodes of at
    most ``max_steps`` steps.  The raw network output is sent to the
    environment: no exploration noise is added and nothing is trained.

    Args:
        train_indicator: Kept for signature compatibility with the training
            variants of this function; it is not read here.
    """
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    vision = False

    episode_count = 1
    max_steps = 1000  #100000
    step = 0  # steps taken across all episodes

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    # The 4th argument is 1 here; the training variants pass BATCH_SIZE in
    # that position -- presumably a batch size, confirm in ActorNetwork.
    actor = ActorNetwork(sess, state_dim, action_dim, 1, TAU, LRA)

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    def build_state(ob):
        """Flatten the sensor readings of one observation into a vector."""
        return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                          ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                          ob.rpm))

    try:
        #Now load the weight
        print("Now we load Actor model's weights")
        try:
            actor.model.load_weights("actormodel.h5")
        except Exception as err:
            # Not a bare ``except``: Ctrl-C / SystemExit must still stop the
            # program.  Running with untrained weights stays best-effort,
            # but the actual reason is now reported.
            print("Cannot find the weight: " + str(err))
        else:
            print("Weight load successfully")

        print("TORCS Experiment Start.")
        for i in range(episode_count):
            if np.mod(i, 3) == 0:
                #relaunch TORCS every 3 episode because of the memory leak error
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()

            s_t = build_state(ob)
            total_reward = 0.

            for j in range(max_steps):
                a_t_original = actor.model.predict(
                    s_t.reshape(1, s_t.shape[0]))
                ob, r_t, done, info = env.step(a_t_original[0])
                s_t = build_state(ob)

                total_reward += r_t

                if np.mod(j, 100) == 0:
                    print("Episode", i, "Step", step, "Action",
                          a_t_original[0], "Reward", r_t)

                step += 1
                if done:
                    break

            print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
                  str(total_reward))
            print("Total Step: " + str(step))
            print("")
    finally:
        # Shut TORCS down even when the run is interrupted or fails.
        env.end()  # This is for shutting down TORCS
    print("Finish.")
target_critic_model = None update_op, action_gradient_holder = get_actor_update_operation(actor_model) gradient_op = get_gradient_operation(critic_model) sess.run(tf.global_variables_initializer()) buffer = pd.DataFrame(columns=['previous observation', 'action', 'reward', 'observation', 'done']) def safe_norm(x): xmax = np.max(x) return np.linalg.norm(x / xmax) * xmax for episode in range(4000): print('Episode: ', episode) if episode %1 ==0: ob = env.reset(relaunch=True) # with torcs relaunch (avoid memory leak bug in torcs) else: ob = env.reset() for move in range(10000): if TARGET_MODEL: action = act(target_actor_model, observation_formatter(ob)) else: action = act(actor_model, observation_formatter(ob)) action = action.flatten() new_ob, reward, done, _ = env.step(action) reward = reward/400 print('\nq-value: ', target_critic_model.predict(observation_formatter(ob, action))) print('reward: ', reward, '\n') if np.isnan(reward): break buffer.loc[len(buffer), :] = [ob, action, reward, new_ob, done]
self.last_state = None self.last_pi = 0 self.last_a = 0 sess = tf.Session() avpNet = AvpNet(sess) vspNet = VspNet(sess) sess.run(tf.global_variables_initializer()) game_ = Game(sess, avpNet, vspNet) coach = Coach(game_, avpNet) env = TorcsEnv(vision=True, throttle=False) obs = env.reset() steer_angle = 0.0 reward = 0.0 max_eps_steps = 10000 episode_count = 2000 for i in range(episode_count): if np.mod(i, 3) == 0: obs = env.reset(relaunch=True) #relaunch TORCS every 3 episode because of the memory leak error else: obs = env.reset() for _ in range(max_eps_steps): coach.step += 1 image = obs.img image = np.reshape(image, (64, 64, 3))
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    """Run (or train) the Keras DDPG agent in TORCS with a scripted attack.

    Every episode the actor drives the car; at step 80 the steering command
    is forced to -1.0 ("cp attack!") and at step 83 a screenshot is taken
    with ``scrot``.  An episode ends when the environment reports ``done``,
    when the engine rpm drops to 0.09426 or below after step 20 (reward is
    reduced by 1000), or once ``j`` exceeds 500.  After each episode one row
    is appended to ``logs/pm_adv_test.csv`` and the reward curve is written
    to ``train_plots/``.

    Args:
        train_indicator: 1 adds exploration noise, updates both networks and
            saves weights every third episode; 0 only runs the policy.
    """
    time.sleep(1)
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 24  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # NOTE: only the commented-out model-based attack search below reads
    # pre_model; the load is kept so start-up behaviour is unchanged.
    pre_model = load_model("weights_rescale_all-0000.hdf5")
    # x = np.array([ 4.82767379e-01, 5.92105016e-02, 3.61700505e-01, 2.74807483e-01,
    #     2.31401995e-01, 2.07236990e-01, 1.95800006e-01, 1.89892501e-01,
    #     1.84837490e-01, 1.81293502e-01, 1.77807003e-01, 1.74377009e-01,
    #     1.71005994e-01, 1.66384503e-01, 1.61247000e-01, 1.52030498e-01,
    #     1.35238498e-01, 1.11962005e-01, 8.79574940e-02, 4.76383008e-02,
    #     4.78339800e-01, 6.97819047e-01, 4.60800716e-01, 5.00754069e-01,
    #     -1.00000000e+00, 9.99979496e-01, 8.71338917e-13])
    # x_s = np.array([x, x])
    # pre_y = pre_model.predict(x_s)
    # print(x_s[0])
    # print(pre_y[0])

    #Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
    except Exception as err:
        # Not a bare ``except``: Ctrl-C / SystemExit must still propagate.
        # Continuing without the weights stays best-effort, but the real
        # cause is now shown.
        print("Cannot find the weight: " + str(err))
    else:
        print("Weight load successfully")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            #relaunch TORCS every 3 episode because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        attack_valid = 1
        # Floor division keeps the Python 2 result (the value grows by 0.01
        # every ten episodes) on Python 3 as well.
        gap = (i // 10) / 100.0
        attack_step = -1
        attack_target = 0
        for j in range(max_steps):
            # if j == 50:
            #     time.sleep(0.099)
            #     continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code do the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            # Extra throttle during the first 20 steps while training.
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5
            # os.system("scrot saved_pic/{}.png".format(j))
            # Scripted attack: force the steering command at step 80 ...
            if j == 80:
                print("cp attack!")
                a_t[0][0] = -1.0
            # ... and capture the screen three steps later.
            if j == 83:
                os.system("scrot saved_pic/{}.png".format(j))
            # if a_t[0][0] > 0:
            #     a_t[0][0] = -0.3
            # else:
            #     a_t[0][0] = 0.3
            # print("%.2f"%a_t[0][0])
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            #     a_t[0][1] = 0
            # if(step == 60):
            #     a_t[0][0] = 1.0

            # s_t_scaled = rescale_state(s_t)
            # # print(s_t[0])
            # s_t_0 = restore_state(s_t_scaled)
            # # print(s_t_0[0])
            # new_a_t = actor.model.predict(s_t_0.reshape(1, s_t_0.shape[0]))
            # s_t_scaled_list = np.array([np.copy(s_t_scaled) for val in range(21)])
            # actions = np.array([np.copy(a_t[0]) for val in range(21)])
            # for val in range(21):
            #     actions[val][0] = -1.0 + val/10.0
            # # print(actions)
            # x_0 = np.hstack((s_t_scaled_list, actions))
            # # print(x_0.shape, s_t_scaled_list.shape, actions.shape)
            # pre_y = pre_model.predict(x_0)
            # # print(x_0[0])
            # # print(pre_y[0])

            # steer_index = int(a_t[0][0]*10.0 + 10.0)
            # for pre_step in range(2):
            #     restore_new_Y = restore_states(pre_y)
            #     actions = actor.model.predict(restore_new_Y)
            #     x_step1 = np.hstack((pre_y, actions))
            #     pre_y = pre_model.predict(x_step1)

            # for index in range(21):
            #     diff = calsulate_d(pre_y[index]) - calsulate_d(pre_y[steer_index])
            #     pro = np.random.random()
            #     if diff > gap and attack_valid == 1 and pro > 0.8 and j > 50:
            #         a_t[0][0] = -1.0 + index/10.0
            #         print("adv!", diff, "pro:", pro)
            #         attack_step = j
            #         attack_target = a_t[0][0]
            #         attack_valid -= 1
            # dis_list = np.array([(calsulate_d(st) - calsulate_d(pre_y[steer_index])) for st in pre_y])
            # print("{:.2f}".format(max(dis_list)*100000))
            # print("{}".format(max(dis_list)*100000))

            # s_t_scaled = np.copy(s_t1)
            # s_t_scaled[0] = rescale_data(s_t_scaled[0], 0.5)
            # s_t_scaled[20] = rescale_data(s_t_scaled[20], 2.5)
            # s_t_scaled[21] = rescale_data(s_t_scaled[21], 0.7)
            # s_t_scaled[22] = rescale_data(s_t_scaled[22], 0.7)
            # s_t_scaled[23] = rescale_data(s_t_scaled[23], 0.7)
            # actions = actor.model.predict(s_t_scaled.reshape(1, s_t_scaled.shape[0]))
            # print(actions[0][0])

            # ob, r_t, done, info = env.step(new_a_t[0])
            ob, r_t, done, info = env.step(a_t[0])
            # Single-argument call form: valid on Python 2 and Python 3.
            print("step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2]))
            # print(a_t[0][0])
            # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm)
            # if(r_t < -50):
            #     r_t -= 10000
            #     done = True
            # Low rpm after the start phase: penalise and end the episode.
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1  # only read by the commented-out perturbation below
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))
            # action_states = []
            # for i in range(-5, 6):
            #     s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1])
            #     print(np.linalg.norm(s_t1_new - s_t1))
            #     s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            # cur_step_sample = [s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done]
            # cur_sample.append(cur_step_sample)

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # Built from the actions only to get an array to fill in; every
            # row is overwritten with its target below.
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

            if j > 500:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, step // 10000),
                                         overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, step // 10000),
                                          overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        s = "{},{},{},{},{},{:.3f}\n".format(gap, attack_step, attack_target,
                                             i, j, total_reward)
        attack_valid = 1
        attack_step = -1
        attack_target = 0
        with open('logs/pm_adv_test.csv', 'a') as the_file:
            the_file.write(s)
        overall_scores.append(total_reward)
        plt.clf()
        plt.plot(overall_scores)
        plt.savefig("train_plots/{}_{}.jpg".format(model_name,
                                                   step // 10000))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        #     pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    """Train or evaluate the checkpoint-based DDPG agent in TORCS.

    Restores the latest checkpoint from ``saved_networks/`` when one exists,
    then runs the episode loop.  Episode 0 is cut off after a single step.
    While training, a checkpoint is written whenever the environment reports
    a lap time below the best seen so far together with ``num_lap == 10``.

    Args:
        train_indicator: 1 trains (1000 episodes, exploration noise, network
            updates); 0 only runs the policy for 20 episodes.
    """
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.00005   #Learning rate for Actor
    LRC = 0.0005    #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 200000.
    episode_count = 1000 if train_indicator else 20
    max_steps = 4000
    step = 0
    epsilon = 1 if train_indicator else 0
    min_laptime = 10000000

    # Arguments passed to OU.function after the raw action component, one
    # tuple per action component (in the order of the action vector).
    ou_settings = (
        (0.0, 0.60, 0.30),
        (0.5, 1.00, 0.10),
        (-0.1, 1.00, 0.05),
    )

    def as_state(obs):
        """Stack the sensor readings of one observation into a flat vector."""
        return np.hstack((obs.angle, obs.track, obs.trackPos, obs.speedX,
                          obs.speedY, obs.speedZ, obs.wheelSpinVel/100.0,
                          obs.rpm))

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # loading networks
    print("Now we load the weight")
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        #relaunch TORCS every 3 episode because of the memory leak error
        ob = env.reset(relaunch=True) if i % 3 == 0 else env.reset()

        s_t = as_state(ob)
        total_reward = 0.

        for _ in range(max_steps):
            loss = 0
            if train_indicator:
                # Linear decay, floored at 0.10.
                epsilon = max(epsilon - 1.0 / EXPLORE, 0.10)

            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))

            # Draw all noise terms first, then add them to the raw action.
            for idx, extra in enumerate(ou_settings):
                noise_t[0][idx] = train_indicator * max(epsilon, 0) * \
                    OU.function(a_t_original[0][idx], *extra)
            for idx in range(len(ou_settings)):
                a_t[0][idx] = a_t_original[0][idx] + noise_t[0][idx]

            ob, r_t, done, info = env.step(a_t[0], train_indicator)
            s_t1 = as_state(ob)

            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states, actions, rewards, new_states, dones = (
                np.asarray([sample[col] for sample in batch])
                for col in range(5))
            # Same shape as the actions; each row is replaced by its target.
            y_t = np.asarray([sample[1] for sample in batch])

            target_q_values = critic.target_predict(
                new_states, actor.target_predict(new_states))

            for k in range(len(batch)):
                target = rewards[k]
                if not dones[k]:
                    target = rewards[k] + GAMMA*target_q_values[k]
                y_t[k] = target

            if train_indicator:
                loss += critic.train_on_batch(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if step % 100 == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon,
                      "Action", a_t, "Reward", r_t, "Loss", loss)  #, "curLapTime", ob.curLapTime)

            step += 1
            # Episode 0 is abandoned after its first step.
            if i == 0 or done:
                break

        # Keep a checkpoint only for a new best lap time.
        if train_indicator and i > 0 \
                and env.lapTime < min_laptime and env.num_lap == 10:
            min_laptime = env.lapTime
            print("Now we save model")
            saver.save(sess, 'saved_networks/' + 'network' + '-ddpg-{}'.format(i))

        print("TOTAL REWARD @ " + str(i) +"-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def main():
    """Train (or replay) the PyTorch DDPG agent in TORCS and print run time.

    With ``train_indicator`` set to False the saved actor is loaded from
    ``modelPATH`` and driven for one episode.  Otherwise the actor/critic
    pair is trained for ``max_episode`` episodes with OU noise on steering
    and acceleration, logging to TensorBoard and saving the actor after
    every episode.

    Returns:
        0 after an evaluation run, None after a training run.
    """
    s_time = timeit.default_timer()
    global iteration
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    memory = ReplayBuffer()
    epsilon = 1  # only read by the commented-out noise decay below
    train_indicator = True
    modelPATH = os.path.join('.', "models", 'E0011.pt')
    q, q_target = QNet(state_dim, action_dim), QNet(state_dim, action_dim)
    q_target.load_state_dict(q.state_dict())
    mu, mu_target = MuNet(state_dim), MuNet(state_dim)
    mu_target.load_state_dict(mu.state_dict())

    steer_noise = OUN(np.zeros(1), theta=0.6)
    accel_noise = OUN(np.zeros(1), theta=0.6)

    mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
    q_optimizer = optim.Adam(q.parameters(), lr=lr_q)

    #tensorboard writer
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join("logs", "ddpg_torch", current_time + 'E0011t')
    writer = SummaryWriter(log_dir)
    samplestate = torch.rand(1, 29)
    sampleaction = torch.rand(1, 2)

    #writer.add_graph(mu,samplestate)
    writer.add_graph(q, (samplestate, sampleaction))
    # The original had a bare ``writer.close`` here, which only looks up the
    # method without calling it.  The writer is still needed during
    # training, so it is closed at each exit of this function instead.

    def build_state(ob):
        """Flatten the sensor readings of one observation into a vector."""
        return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                          ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0,
                          ob.rpm))

    if not train_indicator:
        # Evaluation: drive one episode with the saved actor, no noise.
        mu = torch.load(modelPATH)
        mu.eval()
        ob = env.reset()
        score = 0
        for n_step in range(100000):
            s_t = build_state(ob)
            a_t = mu(torch.from_numpy(s_t.reshape(1, -1)).float()).detach().numpy()
            ob, r_t, done, _ = env.step(a_t[0])
            score += r_t
            if done:
                print("score:", score)
                break
        env.end()
        writer.close()
        return 0

    for n_epi in range(max_episode):
        print("Episode : " + str(n_epi) + " Replay Buffer " + str(memory.size()))
        if np.mod(n_epi, 3) == 0:
            #relaunch TORCS every 3 episode because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        a_t = np.zeros([1, action_dim])
        s_t = build_state(ob)
        score = 0
        q_value_writer(q, mu, s_t, writer, 'Episode Start Q value')
        q_value_writer(q_target, mu_target, s_t, writer, 'Episode Start target Q value')
        #t_start = timeit.default_timer()
        for n_step in range(max_step):
            #epsilon -= 1.0/EXPLORE
            a_origin = mu(torch.from_numpy(s_t.reshape(1, -1)).float())
            if train_indicator:  #add noise for train
                # sn = max(epsilon,0)*steer_noise()
                sn = steer_noise()
                # an = max(epsilon,0)*accel_noise()
                an = accel_noise()
                a_s = a_origin.detach().numpy()[0][0] + sn
                a_t[0][0] = np.clip(a_s, -1, 1)  # fit in steer arange
                a_a = a_origin.detach().numpy()[0][1] + an
                a_t[0][1] = np.clip(a_a, 0, 1)  # fit in accel arange
                #record noise movement
                if iteration % 10 == 0:
                    writer.add_scalar('Steer noise', sn, iteration)
                    writer.add_scalar('Accel_noise', an, iteration)
            else:
                # was ``detatch()``: a typo that raises AttributeError
                a_t = a_origin.detach().numpy()
            ob, r_t, done, _ = env.step(a_t[0])

            score += r_t
            s_t1 = build_state(ob)
            # ``a_t`` is allocated once per episode and overwritten in place
            # each step, so ``a_t[0]`` is a view that later steps would
            # change.  Store a copy so each transition keeps its own action
            # (ReplayBuffer.put is not visible here and may keep the
            # reference as-is).
            memory.put((s_t, a_t[0].copy(), r_t, s_t1, done))
            s_temp = copy.deepcopy(s_t)  # for end q value log
            s_t = s_t1

            if train_indicator and memory.size() > train_start_size:
                train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer, writer)
                soft_update(mu, mu_target)
                soft_update(q, q_target)
            iteration += 1

            if done:
                q_value_writer(q, mu, s_temp, writer, 'Episode End Q value')
                q_value_writer(q_target, mu_target, s_temp, writer, 'Episode End target Q value')
                break
        #t_end = timeit.default_timer()

        print("TOTAL REWARD @ " + str(n_epi) +"-th Episode : Reward " + str(score))
        print("Total Step: " + str(n_step))
        print("")
        #print('{}steps, {} time spent'.format(i,t_end-t_start))
        torch.save(mu, modelPATH)

    env.end()
    writer.close()
    e_time = timeit.default_timer()
    print("Total step {} and time spent {}".format(iteration, e_time-s_time))