# Exploration scale shared by every call below; it shrinks a little per call.
epsilon = 1


def policy_output_to_stochastic_action(output, action_space):
    """Add epsilon-scaled greedy noise to a policy output and clip the result.

    Every call squeezes axis 0 out of ``output`` and then lowers the
    module-level ``epsilon`` by ``1.0 / DDPG_CFG.greedy_accel_noise_steps``.
    The noise added to each of the three action components is
    ``max(epsilon, 0) * greedy_function(component, ...)``, so no noise is
    added once ``epsilon`` has dropped to zero or below.

    Args:
        output: policy output; its axis 0 is removed with ``np.squeeze``.
        action_space: object exposing ``low`` and ``high`` clipping bounds.

    Returns:
        The noisy action clipped to ``[action_space.low, action_space.high]``.
    """
    global epsilon
    action = np.squeeze(output, axis=0)
    epsilon -= 1.0 / DDPG_CFG.greedy_accel_noise_steps

    # Floor the scale at zero so a negative epsilon cannot invert the noise.
    scale = max(epsilon, 0)
    steer_noise = scale * greedy_function(action[0], 0.0, 0.60, 0.30)
    accel_noise = scale * greedy_function(action[1], 0.5, 1.00, 0.10)
    brake_noise = scale * greedy_function(action[2], -0.1, 1.00, 0.05)
    noisy_action = np.array([steer_noise, accel_noise, brake_noise]) + action

    return np.clip(noisy_action, action_space.low, action_space.high)


if __name__ == "__main__":
    tf.logging.info(
        "@@@ start ddpg training gym_torcs @@@ start time:{}".format(
            time.ctime()
        )
    )

    # Generate a Torcs environment
    env_train = torcs_env_wrapper(
        vision=False,
        throttle=True,
        gear_change=False,
        port=3101,
    )
    train(env_train, agent_action, eval_mode=False)
# Excerpt of a gym_torcs DDPG training script whose line breaks have been
# lost: everything below sits on one physical line, and because that line
# starts with '#', Python would treat all of it as a comment as written.
#
# Contents, in order:
#   1. The tail of a function whose header is not visible here: it clips
#      `stochastic_action` to [action_space.low, action_space.high] with
#      np.clip and returns the result.
#   2. An `if __name__ == "__main__":` entry point that logs a start banner,
#      builds a torcs environment on port 3101 (vision=False), creates a
#      UO_Process noise process from three-element mu/theta/sigma/x0 arrays
#      with dt=1e-2, and calls
#      train(env_train, env_train, agent_action, noise_process).
#
# NOTE(review): the same `env_train` object is passed as both the first and
# the second argument of train() -- confirm this is intended.
# NOTE(review): the trailing `tf.nn.conv2d_transpose()` is called with no
# arguments and its result is unused; it looks like a stray line and would
# presumably raise once train() returns -- confirm and remove.
# stochastic_action = output + noise_process.sample() # bound to torcs scope bounded = np.clip(stochastic_action, action_space.low, action_space.high) return bounded if __name__ == "__main__": tf.logging.info( "@@@ start ddpg training gym_torcs @@@ start time:{}".format( time.ctime())) # Generate a Torcs environment env_train = torcs_env_wrapper(vision=False, throttle=True, gear_change=False, port=3101) # env_eval = torcs_env_wrapper(vision=True, throttle=True, gear_change=False,port=8888) #TODO rewrite. #steer, accel, brake .after greedy noise. #valid noise value can make gradients happy. mu = np.array([0, 0, 0]) # x0=np.array([0, 0.5, -0.1]) theta = np.array([0.15, 0.15, 0.15]) sigma = np.array([0.3, 0.3, 0.3]) # x0 = np.array([0.1, 0.3, 0.1]) #TODO start equal exploration on steer, brake, accel. # x0 = np.array([-0.2, 0.0, 0.2]) x0 = np.array([-0.2, -0.2, 0.2]) noise_process = UO_Process(mu=mu, x0=x0, theta=theta, sigma=sigma, dt=1e-2) train(env_train, env_train, agent_action, noise_process) tf.nn.conv2d_transpose()
# Excerpt of a BipedalWalker-v2 DDPG training script whose line breaks have
# been lost: everything below sits on one physical line.
#
# Contents, in order:
#   1. `return bounded` -- the last statement of a function whose header and
#      body are not visible here.
#   2. An `if __name__ == "__main__":` entry point that logs a start banner,
#      creates a training env with gym.make(id='BipedalWalker-v2'), wraps a
#      second such env in Monitor (directory=DDPG_CFG.eval_monitor_dir,
#      video_callable always returning False, resume=True), builds a
#      UO_Process noise process from four-element mu/theta/sigma/x0 arrays
#      with dt=1e-2, and calls
#      train(train_env, eval_monitor, agent_action, noise_process).
#
# NOTE(review): the inline comment "Generate a Torcs environment" does not
# match the code, which creates BipedalWalker-v2 environments.
return bounded if __name__ == "__main__": tf.logging.info( "@@@ start ddpg training gym_bipedal_walker_v2 @@@ start time:{}". format(time.ctime())) # Generate a Torcs environment train_env = gym.make(id='BipedalWalker-v2') eval_monitor = Monitor(gym.make(id='BipedalWalker-v2'), directory=DDPG_CFG.eval_monitor_dir, video_callable=lambda x: False, resume=True) mu = np.array([0.0, 0.0, 0.0, 0.0]) # x0=np.array([0, 0.5, -0.1]) theta = np.array([0.15, 0.15, 0.15, 0.15]) sigma = np.array([0.3, 0.3, 0.3, 0.3]) # x0 = np.array([0.1, 0.3, 0.1]) # TODO greedy accel in the begining x0 = np.array([ -0.2, 0.2, 0.2, 0.2, ]) noise_process = UO_Process(mu=mu, x0=x0, theta=theta, sigma=sigma, dt=1e-2) train(train_env, eval_monitor, agent_action, noise_process)
# Evaluation entry script for the DDPG agent on the low-dimensional gym_torcs
# environment: overrides the output directories on the shared TF flags object
# and then runs train() in evaluation mode.
import time

import tensorflow as tf

from gym_torcs_train_low_dim import torcs_env_wrapper
from low_dim_train.train_agent_low_dim import train

# Short alias for the global TF flags object.
DDPG_CFG = tf.app.flags.FLAGS  # alias

# Send logs, checkpoints and monitor output to the gym_torcs_low_dim tree.
DDPG_CFG.log_dir = 'eval/gym_torcs_low_dim/tf_log/'
DDPG_CFG.checkpoint_dir = 'eval/gym_torcs_low_dim/chk_pnt/'
DDPG_CFG.eval_monitor_dir = 'eval/gym_torcs_low_dim/eval_monitor/'

tf.logging.set_verbosity(tf.logging.INFO)

if __name__ == "__main__":
    tf.logging.info(
        "@@@ start ddpg evaluation gym_torcs @@@ start time:{}".format(
            time.ctime()
        )
    )

    # Generate a Torcs environment
    env = torcs_env_wrapper(vision=True, throttle=True, gear_change=False)

    # The second positional argument is deliberately None for this run.
    train(env, None, eval_mode=True)