def test_sac(sess, args, sac, saver):
    saver.restore(sess, 'ckpt/model')
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    ob = env.reset(relaunch=True)  # relaunch TORCS because of the memory leak error
    s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                   ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
    done = False
    ep_rew = 0.0
    ep_len = 0
    while not done:
        # deterministic actions at test time
        a = sac.get_action(s, True)
        ob, r, done, _ = env.step(a)
        s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                       ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        ep_rew += r
        ep_len += 1
        if ep_len >= args.max_ep_len:
            done = True
    print('test time performance: | rewards: ', ep_rew, ' | length: ', ep_len)
def train_sac(sess, args, sac, saver):
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    replay_buffer = ReplayBuffer(args.s_dim, args.a_dim, args.buff_size)

    for ep in range(args.total_ep):
        if np.mod(ep, 100) == 0:
            # relaunch TORCS every N episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()
        s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                       ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        done = False
        ep_rew = 0.0
        ep_len = 0

        while not done:
            # for the first episodes, just step on the gas and drive straight
            if ep > 10:
                a = sac.get_action(s)
            else:
                a = np.array([0.0, 1.0, 0.0])
            ob, r, done, _ = env.step(a)
            s2 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                            ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            ep_rew += r
            ep_len += 1
            if ep_len >= args.max_ep_len:
                done = True
            replay_buffer.store(s, a, r, s2, float(done))
            s = s2
            batch = replay_buffer.sample_batch(args.batch_size)
            outs = sac.train(batch)

        print('episode: ', ep, ' | episode rewards: ', round(ep_rew, 4),
              ' | episode length: ', ep_len, ' | alpha/temperature: ', outs[9])
        with open("performance.txt", "a") as myfile:
            myfile.write(str(ep) + " " + str(ep_len) + " " +
                         str(round(ep_rew, 4)) + " " + str(round(outs[9], 4)) + "\n")
        if ep % 10 == 0:
            # save model
            saver.save(sess, 'ckpt/model')
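# Hedged sketch: the ReplayBuffer used by train_sac above is not shown here.
# A minimal version matching the call sites (ReplayBuffer(s_dim, a_dim, size),
# store(s, a, r, s2, done), sample_batch(batch_size)) might look like this;
# the field names in the returned dict are assumptions, not the original API.
import numpy as np

class ReplayBuffer:
    def __init__(self, s_dim, a_dim, size):
        self.s = np.zeros((size, s_dim), dtype=np.float32)
        self.s2 = np.zeros((size, s_dim), dtype=np.float32)
        self.a = np.zeros((size, a_dim), dtype=np.float32)
        self.r = np.zeros(size, dtype=np.float32)
        self.d = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, s, a, r, s2, done):
        # Overwrite the oldest transition once the buffer is full
        self.s[self.ptr], self.a[self.ptr] = s, a
        self.r[self.ptr], self.s2[self.ptr], self.d[self.ptr] = r, s2, done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(s=self.s[idxs], a=self.a[idxs], r=self.r[idxs],
                    s2=self.s2[idxs], d=self.d[idxs])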
def __init__(self, port=3101):
    self.loss = None
    self.name = 'car'
    self.env = TorcsEnv(vision=False, throttle=False, port=port)
    ob = self.env.reset(relaunch=False)
    obs_shape = self.process_obs(ob)
    self.max_timesteps = 1000
    self.demos_loaded = False
    super(ImitationCar, self).__init__(self.env, dim_action=1, dim_obs=2)
def __init__(self, load_from=None, will_train=True):
    self.env = TorcsEnv(
        path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
    self.args = SAC_args()
    self.buffer = ReplayBuffer(self.args.buffer_size)

    action_dim = self.env.action_space.shape[0]
    state_dim = self.env.observation_space.shape[0]
    hidden_dim = 256
    self.action_size = action_dim
    self.state_size = state_dim

    self.value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
    self.target_value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
    self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
    self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
    self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
    self.target_value_net.load_state_dict(self.value_net.state_dict())

    self.value_criterion = nn.MSELoss()
    self.soft_q_loss1 = nn.MSELoss()
    self.soft_q_loss2 = nn.MSELoss()

    self.value_opt = optim.Adam(self.value_net.parameters(), lr=self.args.lr)
    self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(), lr=self.args.lr)
    self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(), lr=self.args.lr)
    self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=self.args.lr)

    if will_train:
        current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
        self.plot_folder = f'plots/{current_time}'
        self.model_save_folder = f'model/{current_time}'
        make_sure_dir_exists(self.plot_folder)
        make_sure_dir_exists(self.model_save_folder)
        self.cp = Checkpoint(self.model_save_folder)

    if load_from is not None:
        try:
            self.load_checkpoint(load_from)
        except FileNotFoundError:
            print(f'{load_from} not found. Running default.')
    else:
        print('Starting from scratch.')
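# Hedged sketch: ValueNetwork above is one of the undefined helper modules.
# A minimal two-hidden-layer MLP matching the (state_dim, hidden_dim)
# constructor used above (an assumption, not the original class) would be:
import torch.nn as nn

class ValueNetwork(nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, 1))  # scalar state value

    def forward(self, state):
        return self.net(state)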
def __init__(self):
    self.n = 4  # number of agents
    self.env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    # every agent shares the single-car observation and action spaces
    self.observation_space = self.env.observation_space
    self.action_space = self.env.action_space
    self.step_count = 0
    self.agent_list = []
    self.initialize_agents()
def main():
    count = 0
    env = TorcsEnv(vision=False, throttle=True, gear_change=False, text=False)

    # action_fd = open('./trajectory/action15.csv', 'w', newline='')
    # observation_fd = open('./trajectory/observation15.csv', 'w', newline='')
    # action_writer = csv.writer(action_fd, delimiter=',')
    # observation_writer = csv.writer(observation_fd, delimiter=',')

    # Register the keyboard callback once; it updates the global `action`
    keyboard.hook(get_action)

    for ep in range(1, 5):
        done = False
        step, score = 0, 0

        if np.mod(ep, 5) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()
        state = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedY, ob.speedX,
                           ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        # observation_writer.writerow(state.tolist())

        while not done:
            time.sleep(0.05)
            # Record the action values in action.csv
            # observation_writer.writerow(state.tolist())
            # action_writer.writerow([action[0], action[1], action[2]])
            next_ob, reward, done, info = env.step(action)
            next_state = np.hstack((next_ob.angle, next_ob.track, next_ob.trackPos,
                                    next_ob.speedY, next_ob.speedX, next_ob.speedZ,
                                    next_ob.wheelSpinVel / 100.0, next_ob.rpm))
            state = next_state
            print(next_ob.lastLapTime)
            score += reward
            step += 1

    # These files are only opened when trajectory recording is enabled above
    # action_fd.close()
    # observation_fd.close()
    sys.exit()
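# Hedged sketch: get_action and the module-level `action` used by main() above
# are not shown. Assuming the `keyboard` package's hook callback (which receives
# a KeyboardEvent with .name and .event_type), a minimal arrow-key mapping to
# [steering, throttle, brake] could look like this (names are assumptions):
import numpy as np

action = np.array([0.0, 0.0, 0.0])  # [steering, throttle, brake]

def get_action(event):
    pressed = event.event_type == 'down'
    if event.name == 'left':
        action[0] = 0.3 if pressed else 0.0
    elif event.name == 'right':
        action[0] = -0.3 if pressed else 0.0
    elif event.name == 'up':
        action[1] = 1.0 if pressed else 0.0
    elif event.name == 'down':
        action[2] = 0.8 if pressed else 0.0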
def __init__(self, eval_inst, seed=10, train_steps=10):
    super().__init__(seed=seed)
    self.env = TorcsEnv()
    self.eval_inst = eval_inst
    self.train_steps = train_steps
    self.state_size = (128, 128, 1)
    print('state size', self.state_size)
    self.action_size = self.env.action_space.n
    print('action space', self.env.action_space)
    print('action size', self.env.action_space.n)
def env_setup(joystick_id):
    #### Generate a Torcs environment
    # With vision input, the action is steering only (1-dim continuous action)
    env = TorcsEnv(vision=True, throttle=False)
    # Without vision input, the action is steering and throttle (2-dim continuous action)
    # env = TorcsEnv(vision=False, throttle=True)

    ob = env.reset(relaunch=True)  # with torcs relaunch (avoids the memory leak bug in torcs)
    # ob = env.reset()  # without torcs relaunch

    #### Initialize a joystick controller
    joypad = Controller(joystick_id)
def _thunk():
    # env = gym.make(ENV_ID)
    # env = Pend2.PendulumEnv()
    # env = AliengoGym.AlienGoEnv(render=False)
    env = TorcsEnv(vision=True, throttle=True, gear_change=False)
    return env
def test_policy(track_name, seed):
    vision = False
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False,
                   track_name=track_name)
    nn_agent = NeuralAgent(track_name=track_name)

    # Now load the weights
    logging.info("Now we load the weights")
    try:
        nn_agent.actor.model.load_weights(
            "./model_1343/actormodel_" + str(seed) + '_' + str(900) + ".h5")
        nn_agent.critic.model.load_weights(
            "./model_1343/criticmodel_" + str(seed) + '_' + str(900) + ".h5")
        nn_agent.actor.target_model.load_weights(
            "./model_1343/actormodel_" + str(seed) + '_' + str(900) + ".h5")
        nn_agent.critic.target_model.load_weights(
            "./model_1343/criticmodel_" + str(seed) + '_' + str(900) + ".h5")
        logging.info("Weights loaded successfully")
    except (IOError, OSError):
        logging.info("Cannot find the weights")

    nn_agent.rollout(env)
    return None
def run(): """Build networks, create environment and train agent.""" # Generate a Torcs environment env = TorcsEnv(vision=False, throttle=True, gear_change=False) with tf.Session() as sess: np.random.seed(args['seed']) tf.set_random_seed(args['seed']) # Actor and actor target n_params = 0 actor = ActorNetwork(sess=sess, scope='actor_net', state_size=args['state_size'], action_size=args['action_size'], batch_size=args['batch_size'], lr=args['actor_lr'], n_params=n_params) n_params += actor.get_num_params() actor_target = ActorNetwork(sess=sess, scope='actor_net_target', state_size=args['state_size'], action_size=args['action_size'], batch_size=args['batch_size'], lr=args['actor_lr'], n_params=n_params) # Critic and critic target n_params += actor_target.get_num_params() critic = CriticNetwork(sess=sess, scope='critic_net', state_size=args['state_size'], action_size=args['action_size'], lr=args['critic_lr'], n_params=n_params) n_params += critic.get_num_params() critic_target = CriticNetwork(sess=sess, scope='critic_net_target', state_size=args['state_size'], action_size=args['action_size'], lr=args['critic_lr'], n_params=n_params) # Restore network params saver = tf.train.Saver() saver.restore(sess, os.path.join(os.path.join(args['resources'], "network"), args['file']+'_model')) # Train DDPG on Torcs test(sess, env, actor, actor_target, critic, critic_target)
def playGame(finetune=0):
    demo_dir = "/home/mathew/Documents/RL/human_0/"
    param_dir = "/home/mathew/Documents/RL/wgail_info_params_0/"
    pre_actions_path = "/home/mathew/Documents/RL/human_0/pre_actions.npz"

    feat_dim = [7, 13, 1024]
    img_dim = [50, 50, 3]
    aux_dim = 10
    # encode_dim is 2 for pass-and-turn, 4 for pass-and-turn in a single trajectory
    # encode_dim = 2
    encode_dim = 4
    action_dim = 3

    np.random.seed(1024)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Initialize the env
    env = TorcsEnv(throttle=True, gear_change=False)

    # Define the model
    pre_actions = np.load(pre_actions_path)["actions"]
    agent = TRPOAgent(env, sess, feat_dim, aux_dim, encode_dim,
                      action_dim, img_dim, pre_actions)

    # Load expert (state, action) pairs
    demo = np.load(demo_dir + "demo.npz")

    # Now load the weights
    print("Now we load the weights")
    try:
        if finetune:
            agent.generator.load_weights(param_dir + "params_0/generator_model_37.h5")
            agent.discriminator.load_weights(param_dir + "params_0/discriminator_model_37.h5")
            agent.baseline.model.load_weights(param_dir + "params_0/baseline_model_37.h5")
            agent.posterior.load_weights(param_dir + "params_0/posterior_model_37.h5")
            agent.posterior_target.load_weights(param_dir + "params_0/posterior_target_model_37.h5")
        else:
            agent.generator.load_weights(param_dir + "params_bc/params_3/generator_bc_model.h5")
        print("Weights loaded successfully")
    except (IOError, OSError):
        print("Cannot find the weights")

    print("TORCS Experiment Start.")
    agent.learn(demo)
    print("Finish.")
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target network hyperparameter
    LRA = 0.0001    # Learning rate for actor
    LRC = 0.001     # Learning rate for critic

    action_dim = 4  # unused in this DQN setup
    state_dim = 29  # number of sensor inputs

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=False, gear_change=False)

    nb_actions = 3  # discrete actions: left, straight, right

    model = Sequential()
    model.add(Flatten(input_shape=(window_length, 29)))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))
    print(model.summary())

    memory = SequentialMemory(limit=1000000, window_length=window_length)
    policy = BoltzmannQPolicy(tau=1.)
    processor = MyProcessor()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=100, enable_dueling_network=True,
                   dueling_type='avg', target_model_update=1e-2,
                   policy=policy, processor=processor)
    dqn.compile(RMSprop(lr=1e-3), metrics=['mae'])

    # Resume from previously saved weights
    dqn.load_weights('duel_dqn_{}_weights.h5f'.format(ENV_NAME))
    dqn.fit(env, nb_steps=500000, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    dqn.save_weights('duel_dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=False)
def testDDPG(sess, args, actor, critic):
    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    episode_count = args['episode_count']
    max_steps = args['max_steps']

    for i in range(episode_count):
        if np.mod(i, 100) == 0:
            # relaunch TORCS every N episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                       ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(max_steps):
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            # noise at test time may be required to stabilize actions
            a[0, :] += OU(x=a[0, :], mu=mu, sigma=sigma, theta=theta)
            ob, r, terminal, info = env.step(a[0])
            s2 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                            ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            s = s2
            ep_reward += r

            if terminal:
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, ep_ave_max_q / float(j)))
                break
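# Hedged sketch: OU(...) above is an Ornstein-Uhlenbeck exploration-noise term
# that is not defined in this snippet. A minimal implementation with the same
# keyword signature (an assumption, not the original) is one Euler step of an
# OU process pulling the action toward mu:
import numpy as np

def OU(x, mu, sigma, theta):
    return theta * (mu - x) + sigma * np.random.randn(*np.shape(x))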
def main(args):
    args.seed = 0
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.task == "DuskDrive":
        env = gym.make('flashgames.DuskDrive-v0')
        env.configure(remotes=1)
    elif args.task == "Torcs":
        from gym_torcs import TorcsEnv
        env = TorcsEnv(vision=True, throttle=False)
    elif args.task == "Torcs_novision":
        from gym_torcs import TorcsEnv
        env = TorcsEnv(vision=False, throttle=False)

    sess = get_session(str(args.gpu))
    env = setup(env, args)
    collect(env, sess, args)
def sample_one(self):
    """MODIFIED SAMPLING FOR TORCS!"""
    print()
    print('START PLOTTING MODULE'.center(80, '='))
    roll_distance = []
    print()
    print("TORCS Experiment Start".center(80, '='))
    env = TorcsEnv(vision=self.config.vision, throttle=self.config.throttle)
    try:
        ob = env.reset()
        sonar, grayscale = self.image_to_sonar(ob.img)
        sonar = np.reshape(sonar, [19])
        state = np.concatenate(
            [sonar, np.array([ob.speedX, ob.speedY, ob.speedZ])], axis=0)
        obs, states, actions, rewards, sonars, grayscales = [], [], [], [], [], []
        done = False  # has the episode ended?
        start_time = time.time()
        while not done and (time.time() - start_time < 300):
            states.append(state)
            obs.append(ob)
            sonars.append(sonar)
            grayscales.append(grayscale)
            state = np.concatenate(
                [sonar, np.array([ob.speedX, ob.speedY, ob.speedZ])], axis=0)
            action = self.sess.run(
                self.sampled_action,
                feed_dict={
                    self.observation_placeholder:
                        np.reshape(state, [1, self.observation_dim])
                })[0]
            ob, reward, done, info = env.step(action)
            sonar, grayscale = self.image_to_sonar(ob.img)
            sonar = np.reshape(sonar, [19])
            actions.append(action)
            rewards.append(reward)
            roll_distance.append(env.distance_travelled)
    finally:
        env.end()  # this shuts down TORCS
        print("Finished TORCS session".center(80, '='))
        print('Final distance: ', roll_distance[-1], ' [m]')
        print('END PLOTTING MODULE'.center(80, '='))

    # Keep some of the frames for plotting:
    self.grayscales = grayscales
    self.sonars = sonars
    self.obs = obs
    self.actions = actions
    self.roll_distance = roll_distance
    return
def main():
    ppo = PPOAgent()
    env = TorcsEnv(text=True, vision=False, throttle=True, gear_change=False)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, './save_model13/models/model_59557.ckpt')
        for i in range(1000):
            obs = env.reset()
            state = convert_obs(obs)
            score, step = 0, 0
            while True:
                action = ppo.choose_action(state)
                next_obs, reward, done, _ = env.step(action)
                time.sleep(0.05)
                next_state = convert_obs(next_obs)
                score += reward
                step += 1
                if done:
                    print(step, score)
                    env.reset()
                    step, score = 0, 0
                else:
                    state = next_state
def retrain(self):
    for episode in range(n_episode):
        self.env = TorcsEnv(vision=True, throttle=False)
        ob = self.env.reset(relaunch=True)
        reward_sum = 0
        i = 0
        print("# Episode: %d start" % episode)

        for i in range(steps):
            act = self.model.predict(self.img_reshape(ob.img / 255))
            ob, reward, done, _ = self.env.step(act)
            if done:
                break
            else:
                self.D.append([self.img_reshape(ob.img / 255), act,
                               np.array([reward])])
                reward_sum += reward

        print("# step: %d reward: %f " % (i, reward_sum))
        self.env.end()
        # stop once the car survives a full episode
        if i == (steps - 1):
            break
        self.train()
        self.save()
def play_trajectory(filepath, counts):
    # Define the torcs environment
    env = TorcsEnv(vision=VISION, throttle=True, gear_change=False)
    states, actions = load_expert_trajectory(filepath)

    for i in range(counts):
        print("Playing count: {}".format(i))
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()
        state = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                           ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        for action in actions:
            # take the recorded action and observe reward and next state
            ob, reward, done, info = env.step(action)
            if done:
                break
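# Hedged sketch: load_expert_trajectory is not shown in this snippet. Assuming
# the expert rollout was stored as an .npz archive with 'states' and 'actions'
# arrays (an assumption about the file layout), a minimal loader would be:
import numpy as np

def load_expert_trajectory(filepath):
    data = np.load(filepath)
    return data['states'], data['actions']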
def main():
    # Check that the necessary directories exist
    track_no = 5
    experiment_name = "tensorboard-4"
    experiment_dir = "experiment-%s/" % experiment_name
    datas_dir = experiment_dir + "datas-track-no-%d/" % track_no
    models_dir = datas_dir + "model/"

    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return
    if not os.path.exists(datas_dir):
        print("%s doesn't exist" % datas_dir)
        return
    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return

    state_dim = 4
    img_dim = [304, 412, 3]

    sess = tf.InteractiveSession()
    agent = Supervise(sess, state_dim, img_dim, models_dir)
    agent.load_network()

    MAX_STEP = 10000
    step = 0
    vision = True
    env = TorcsEnv(vision=vision, throttle=True, text_mode=False,
                   track_no=track_no, random_track=False, track_range=(5, 8))

    for i in range(1):
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, 0.0))
        i_t = ob.img

        while step < MAX_STEP:
            action = agent.action(s_t, i_t)
            ob, reward, done, info = env.step([action, 0.16, 0])
            s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, action))
            i_t = ob.img
            print("Step", step, "Action", action, "Reward", reward)
            step += 1  # advance the step counter toward MAX_STEP
            if done:
                break

    env.end()
def collectData(self):
    # Collect state-action pairs from the expert for imitation learning
    self.env = TorcsEnv(vision=True, throttle=False)
    ob = self.env.reset(relaunch=True)

    print('Collecting data from expert ... ')
    for i in range(steps):
        if i == 0:
            act = np.array([0.0])
        else:
            act = self.get_teacher_action(ob)
        print("act %f" % act)
        if i % 100 == 0:
            print("step:", i)

        ob, reward, done, _ = self.env.step(act)

        if i % 10 == 0:
            self.save_img(ob.img, i)
        # Normalize RGB values to [0, 1] before storing the frame
        self.D.append([self.img_reshape(ob.img / 255), act, np.array([reward])])
        if len(self.D) > memory:
            self.D.popleft()

    print("step: %d" % steps)
    self.env.end()
def main():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    generator = Generator(sess, feat_dim, aux_dim, encode_dim, action_dim)
    base_model = ResNet50(weights='imagenet', include_top=False)
    feat_extractor = Model(input=base_model.input,
                           output=base_model.get_layer('activation_40').output)

    try:
        generator.model.load_weights(param_path)
        print("Weights loaded successfully")
    except (IOError, OSError):
        print("Cannot find the weights")

    env = TorcsEnv(throttle=True, gear_change=False)

    print("Start driving ...")
    ob = env.reset(relaunch=True)
    feat, aux = get_state(ob, aux_dim, feat_extractor)

    encode = np.zeros((1, encode_dim), dtype=np.float32)
    encode[0, code] = 1
    print("Encode:", encode[0])

    pre_actions = np.load(pre_actions_path)["actions"]

    for i in range(MAX_STEP_LIMIT):
        if i < MIN_STEP_LIMIT:
            action = np.zeros(3, dtype=np.float32)
        elif i < MIN_STEP_LIMIT + PRE_STEP:
            action = pre_actions[i - MIN_STEP_LIMIT]
        else:
            action = generator.model.predict([feat, aux, encode])[0]

        ob, reward, done, _ = env.step(action)
        feat, aux = get_state(ob, aux_dim, feat_extractor)

        if i == MIN_STEP_LIMIT + PRE_STEP:
            print("Start deciding ...")
        print("Step:", i, "DistFromStart:", ob.distFromStart,
              "TrackPos:", ob.trackPos, "Damage:", ob.damage.item(),
              "Action: %.6f %.6f %.6f" % (action[0], action[1], action[2]),
              "Speed:", ob.speedX * 200)

        if done:
            break

    env.end()
    print("Finish.")
def train(device):
    # Hyper-parameters
    coeff_entropy = 0.00001
    lr = 5e-4
    mini_batch_size = 64
    horizon = 2048
    nupdates = 10
    nepoch = 5000
    clip_value = 0.2
    train = True
    render = False

    # Initialize env
    env = TorcsEnv(port=3101,
                   path="/usr/local/share/games/torcs/config/raceman/quickrace.xml")
    insize = env.observation_space.shape[0]
    outsize = env.action_space.shape[0]

    policy = MLPPolicy(insize, action_space=outsize)
    policy.to(device)
    if os.path.exists('policy.pth'):
        policy.load_state_dict(torch.load('policy.pth', map_location=device))
        print('Loading complete!')

    if train:
        optimizer = Adam(lr=lr, params=policy.parameters())
        mse = MSELoss()

        # Start training
        for e in range(nepoch):
            # Generate trajectories
            relaunch = e % 100 == 0
            observations, actions, logprobs, returns, values, rewards = \
                generate_trajectory(env, policy, horizon, is_render=render,
                                    obs_fn=None, progress=True, device=device,
                                    is_relaunch=relaunch)
            print('Episode %s reward is %s' % (e, rewards.sum()))
            memory = (observations, actions, logprobs, returns[:-1], values)

            # Update using PPO
            policy_loss, value_loss, dist_entropy = ppo_update(
                policy, optimizer, mini_batch_size, memory, nupdates,
                coeff_entropy=coeff_entropy, clip_value=clip_value,
                device=device)

            print('\nEpisode: {}'.format(e))
            print('Total reward {}'.format(rewards.sum()))
            print('Entropy', dist_entropy)
            print('Policy loss', policy_loss)
            print('Value loss', value_loss)

            if np.mod(e + 1, 10) == 0:
                print("saving model")
                torch.save(policy.state_dict(), 'policy.pth')
def run(self):
    ### Create the TORCS environment
    env = TorcsEnv(vision=False, throttle=True)

    ### Start the run according to the supplied arguments
    if self.algorithm == "dqn" and self.modus == "train":
        agent = DQNAgent(env, self.track, self.numOfEpisodes)
        agent.trainAgent()
    elif self.algorithm == "dqn" and self.modus == "test":
        agent = DQNAgent(env, self.track, self.numOfEpisodes)
        agent.testAgent()
    elif self.algorithm == "ddpg" and self.modus == "train":
        agent = DDPGAgent(env, self.track, self.numOfEpisodes)
        agent.trainAgent()
    elif self.algorithm == "ddpg" and self.modus == "test":
        agent = DDPGAgent(env, self.track, self.numOfEpisodes)
        agent.testAgent()
def __init__(self):
    self.critic = self.build_critic()
    if CONTINUOUS is False:
        self.actor = self.build_actor()
    else:
        self.actor = self.build_actor_continuous()

    self.env = TorcsEnv(vision=True, throttle=False, gear_change=False)
    print(self.env.action_space, 'action_space',
          self.env.observation_space, 'observation_space')
    self.episode = 0
    self.observation = self.env.reset()
    self.val = False
    self.reward = []
    self.reward_over_time = []
    self.name = self.get_name()
    self.writer = SummaryWriter(self.name)
    self.gradient_steps = 0
def play_training(training=True, load_model=True):
    with tf.device("/cpu:0"):
        global_episodes = tf.Variable(0, dtype=tf.int32,
                                      name='global_episodes', trainable=False)
        # trainer = tf.train.RMSPropOptimizer(learning_rate=1e-4, decay=0.99, epsilon=1)
        trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
        master_network = AC_Network(s_size, a_size, 'global', None, False)
        if training:
            # num_workers = multiprocessing.cpu_count()  # one worker per available CPU thread
            num_workers = 4
        else:
            num_workers = 1
        workers = []
        for i in range(num_workers):
            workers.append(
                Worker(
                    TorcsEnv(vision=True, throttle=False, gear_change=False,
                             port=3101 + i),
                    i, s_size, a_size, trainer, model_path, global_episodes,
                    False))
        saver = tf.train.Saver()

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        initialize_variables(saver, sess, load_model)
        # Asynchronous magic happens here: start the "work" process for each
        # worker in a separate thread. Bind `worker` as a default argument so
        # each lambda captures its own worker rather than the loop variable.
        worker_threads = []
        for worker in workers:
            worker_work = lambda worker=worker: worker.work(
                max_episode_length, gamma, sess, coord, saver, training)
            t = threading.Thread(target=worker_work)
            t.start()
            sleep(0.5)
            worker_threads.append(t)
        coord.join(worker_threads)  # wait until the specified threads have stopped
def test():
    env = TorcsEnv(vision=True, throttle=False)
    ob = env.reset(relaunch=True)
    reward_sum = 0.0
    done = False
    count = 0
    while not done:
        act = model.predict(img_reshape(ob.img).astype('float32') / 255)
        count += 1
        ob, reward, done, _ = env.step(act)
        reward_sum += reward
    env.end()
    print("Steps before crash: ", count, reward_sum)
    return count, reward_sum
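# Hedged sketch: img_reshape is not defined in this snippet. Assuming the model
# consumes single camera frames with a leading batch axis (an assumption about
# the network input, not the original helper), a minimal version would be:
import numpy as np

def img_reshape(img):
    # Add a batch dimension: (H, W, C) -> (1, H, W, C)
    return np.asarray(img)[np.newaxis, ...]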
class A3C_Agent(Process):
    def __init__(self, rank, global_net, counter, lock, args):
        super(A3C_Agent, self).__init__()
        self.network = A3C_Network(29, 3, lock)
        self.global_net = global_net
        self.counter = counter
        self.lock = lock
        self.args = args
        self.done = True
        self.name = ''
        self.env = TorcsEnv(
            port=3101 + rank,
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.reset()

    def reset(self, relaunch=None):
        if relaunch is None:
            relaunch = self.done
        # Synchronize with the global network
        self.time_step = self.counter.value()
        self.network.reset(self.global_net, self.done)
        if self.done:
            self.state = self.env.reset(relaunch=relaunch, sampletrack=True,
                                        render=False)
        self.values = []
        self.rewards = []
        self.log_probs = []
        self.entropies = []

    def normalize_state(self):
        def normalize_from(x, a, b):
            return (2 * ((x - a) / (b - a))) - 1

        self.state[1:20] = normalize_from(self.state[1:20], -0.01, 2)
        self.state[21:] = normalize_from(self.state[21:], 0, 1)

    def internal_log(self, *msg):
        with open(f'../../logs/{self.name}.txt', 'a+') as file:
            print(*msg, file=file)
def programmatic_game(tree_program, track_name='practgt2.xml'):
    episode_count = 2
    max_steps = 100000
    window = 5

    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False,
                   track_name=track_name)

    logging.info("TORCS Experiment Start with Priors on " + track_name)
    for i_episode in range(episode_count):
        # relaunch TORCS every episode because of the memory leak error
        ob = env.reset(relaunch=True)
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        newobs = [item for sublist in tempObs[:-1] for item in sublist]

        for j in range(max_steps):
            act_tree = tree_program.predict([newobs])
            action_prior = [act_tree[0][0], act_tree[0][1], act_tree[0][2]]

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            newobs = [item for sublist in tempObs[:-1] for item in sublist]

            ob, r_t, done, info = env.step(action_prior)
            if np.mod(j, 1000) == 0:
                logging.info("Episode " + str(i_episode) + " Distance " +
                             str(ob.distRaced) + " Lap Times " +
                             str(ob.lastLapTime))
            if done:
                print('Done. Steps: ', j)
                break

    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
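# Hedged sketch: programmatic_game only requires tree_program to expose a
# predict([obs]) method returning a (1, 3) action row. Any multi-output
# regressor satisfies that interface; a scikit-learn decision tree fitted on
# (observation, action) pairs is one stand-in (this pairing and the dummy data
# are assumptions, not the original training code):
import numpy as np
from sklearn.tree import DecisionTreeRegressor

X = np.random.rand(100, 29)  # flattened 29-dim observations (dummy data)
y = np.random.rand(100, 3)   # [steer, accel, brake] targets (dummy data)
tree_program = DecisionTreeRegressor(max_depth=6).fit(X, y)
act = tree_program.predict([X[0]])  # shape (1, 3), as programmatic_game expects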
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target network hyperparameter
    LRA = 0.0001    # Learning rate for actor
    LRC = 0.001     # Learning rate for critic

    action_dim = 3  # Steering / Acceleration / Brake
    state_dim = 29  # number of sensor inputs

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    print("Now we load the weights")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weights loaded successfully")
    except (IOError, OSError):
        print("Cannot find the weights")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code applies a stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to the replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 30) == 0:
                print("Episode", i, "Step", step, "Action", a_t,
                      "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
from gym_torcs import TorcsEnv
from sample_agent import Agent
import numpy as np

vision = True

episode_count = 10
max_steps = 50
reward = 0
done = False
step = 0

# Generate a Torcs environment
env = TorcsEnv(vision=vision, throttle=False)

agent = Agent(1)  # steering only

print("TORCS Experiment Start.")
for i in range(episode_count):
    print("Episode : " + str(i))

    if np.mod(i, 3) == 0:
        # Sometimes you need to relaunch TORCS because of the memory leak error
        ob = env.reset(relaunch=True)
    else:
        ob = env.reset()

    total_reward = 0.
    for j in range(max_steps):
        action = agent.act(ob, reward, done, vision)
        ob, reward, done, _ = env.step(action)
        total_reward += reward
        step += 1
        if done:
            break

    print("TOTAL REWARD @ " + str(i) + "-th Episode : " + str(total_reward))

env.end()  # This is for shutting down TORCS
print("Finish.")
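# Hedged sketch: the sample Agent imported above ships with gym_torcs. If it is
# unavailable, a stand-in with the same act() signature (this random-steering
# version is an assumption, not necessarily the original sample_agent) is:
import numpy as np

class Agent:
    def __init__(self, dim_action):
        self.dim_action = dim_action

    def act(self, ob, reward, done, vision_on):
        # Random steering in [-1, 1]; ignores the observation entirely
        return np.tanh(np.random.randn(self.dim_action))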
# critic_model = load_model('critic_model_{}.h5'.format(ITERATIONS))
# update_op, action_gradient_holder = get_actor_update_operation(actor_model)
# gradient_op = get_gradient_operation(critic_model)
# else:
import sys
sys.path.append(os.path.abspath('./gym_torcs'))
print(sys.path)
from gym_torcs import TorcsEnv

#### Generate a Torcs environment
# Steering only (1-dim continuous action):
# env = TorcsEnv(vision=False, throttle=False)
# Steering and throttle (2-dim continuous action):
env = TorcsEnv(vision=False, throttle=True)
# ob = env.reset()  # without torcs relaunch

# Generate the agent networks
actor_model = basic_actor_model()
critic_model = basic_critic_model()
if TARGET_MODEL:
    target_actor_model = basic_actor_model()
    target_critic_model = basic_critic_model()
else:
    target_actor_model = None
    target_critic_model = None

update_op, action_gradient_holder = get_actor_update_operation(actor_model)