def run(): """Build networks, create environment and train agent.""" # Generate a Torcs environment env = TorcsEnv(vision=False, throttle=True, gear_change=False) with tf.Session() as sess: np.random.seed(args['seed']) tf.set_random_seed(args['seed']) # Actor and actor target n_params = 0 actor = ActorNetwork(sess=sess, scope='actor_net', state_size=args['state_size'], action_size=args['action_size'], batch_size=args['batch_size'], lr=args['actor_lr'], n_params=n_params) n_params += actor.get_num_params() actor_target = ActorNetwork(sess=sess, scope='actor_net_target', state_size=args['state_size'], action_size=args['action_size'], batch_size=args['batch_size'], lr=args['actor_lr'], n_params=n_params) # Critic and critic target n_params += actor_target.get_num_params() critic = CriticNetwork(sess=sess, scope='critic_net', state_size=args['state_size'], action_size=args['action_size'], lr=args['critic_lr'], n_params=n_params) n_params += critic.get_num_params() critic_target = CriticNetwork(sess=sess, scope='critic_net_target', state_size=args['state_size'], action_size=args['action_size'], lr=args['critic_lr'], n_params=n_params) # Restore network params saver = tf.train.Saver() saver.restore(sess, os.path.join(os.path.join(args['resources'], "network"), args['file']+'_model')) # Train DDPG on Torcs test(sess, env, actor, actor_target, critic, critic_target)
def start(env, **config):
    # Initialize the actor, critic and difference networks
    with tf.Graph().as_default() as ddpg:
        # setup random number generators for predictability
        print("Random seed ", config['seed'])
        random.seed(config['seed'])
        np.random.seed(random.randint(0, 10000))
        tf.set_random_seed(random.randint(0, 10000))
        env.seed(random.randint(0, 10000))

        obs_dim = env.observation_space.shape[-1]
        act_dim = env.action_space.shape[-1]

        actor = ActorNetwork(obs_dim, act_dim, 1, config)
        critic = CriticNetwork(obs_dim, act_dim, config,
                               actor.get_num_trainable_vars())

        if config["tensorboard"] == True:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            tf.summary.FileWriter(dir_path, ddpg)

    with tf.Graph().as_default() as balancing_graph:
        balancing_actor = ActorNetwork(obs_dim, act_dim, 1, config)
        balancing_critic = CriticNetwork(
            obs_dim, act_dim, config,
            balancing_actor.get_num_trainable_vars())

    print(actor.target_inputs.graph is tf.get_default_graph())
    print(balancing_actor.target_inputs.graph is tf.get_default_graph())
    print(balancing_actor.target_inputs.graph is actor.target_inputs.graph)

    train(env, ddpg, actor, critic, balancing_graph, balancing_actor, **config)
def __init__(self, env, sess): self.env = env self.sess = sess self.memory_buffer = ReplayMemory(BUFFER_SIZE, 20000, BATCH_SIZE, env.observation_space.shape, env.action_space.shape) self.learning_rate = LR self.tau = TAU self.buffer_size = BUFFER_SIZE self.batch_size = BATCH_SIZE self.discount = 0.99 self.states_ph = tf.placeholder( tf.float32, shape=((None, ) + self.env.observation_space.shape)) self.actions_ph = tf.placeholder(tf.float32, shape=((None, ) + self.env.action_space.shape)) self.is_training_ph = tf.placeholder_with_default(True, shape=None) self.Actor = ActorNetwork(env=self.env, states=self.states_ph, LR=self.learning_rate, TAU=self.tau, discount=self.discount, scope="actor_main", batch_size=self.batch_size, is_training=self.is_training_ph) self.Critic = CriticNetwork(env=self.env, states=self.states_ph, actions=self.actions_ph, LR=self.learning_rate, TAU=self.tau, discount=self.discount, scope="critic_main", batch_size=self.batch_size, is_training=self.is_training_ph) self.Actor_target = ActorNetwork(env=self.env, states=self.states_ph, LR=0.001, TAU=self.tau, discount=self.discount, scope="actor_target", batch_size=self.batch_size, is_training=self.is_training_ph) self.Critic_target = CriticNetwork(env=self.env, states=self.states_ph, actions=self.actions_ph, LR=self.learning_rate, TAU=self.tau, discount=self.discount, scope="critic_target", batch_size=self.batch_size, is_training=self.is_training_ph)
def start(env, pt=None, cl_mode=None, norm_complexity=0, **config):
    # block warnings from tf.saver if needed
    if config['mp_debug']:
        tf.logging.set_verbosity(tf.logging.ERROR)

    # Initialize the actor, critic and difference networks
    with tf.Graph().as_default() as ddpg:
        # setup random number generators for predictability
        print("Random seed ", config['seed'])
        random.seed(config['seed'])
        np.random.seed(random.randint(0, 1000000))
        tf.set_random_seed(random.randint(0, 1000000))
        env.seed(random.randint(0, 1000000))
        print("Random seed verification (numpy) ", np.random.randint(10000))

        obs_dim = env.observation_space.shape[-1]
        act_dim = env.action_space.shape[-1]

        actor = ActorNetwork(obs_dim, act_dim, 1, config)
        critic = CriticNetwork(obs_dim, act_dim, config,
                               actor.get_num_trainable_vars())

        if config["tensorboard"] == True:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            tf.summary.FileWriter(dir_path, ddpg)

        # create curriculum switching network
        if not config["cl_structure"] or not config["cl_stages"]:
            cl_nn = None
        else:
            cl_nn = CurriculumNetwork(pt.get_v_size(), config, cl_mode)

    if config["compare_with"]:
        with tf.Graph().as_default() as compare_with_graph:
            compare_with_actor = ActorNetwork(obs_dim, act_dim, 1, config)
            CriticNetwork(obs_dim, act_dim, config,
                          compare_with_actor.get_num_trainable_vars())

        print(actor.target_inputs.graph is tf.get_default_graph())
        print(compare_with_actor.target_inputs.graph is tf.get_default_graph())
        print(compare_with_actor.target_inputs.graph is actor.target_inputs.graph)

        return compare(env, ddpg, actor, critic, compare_with_graph,
                       compare_with_actor, cl_nn, pt, cl_mode, **config)

    return train(env, ddpg, actor, critic, cl_nn, pt, cl_mode,
                 norm_complexity, **config)
def main(_): with tf.Session() as sess: env = gym.make(ENV_NAME) # np.random.seed(RANDOM_SEED) tf.set_random_seed(RANDOM_SEED) env.seed(RANDOM_SEED) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] action_bound = env.action_space.high # Ensure action bound is symmetric assert (env.action_space.high == -env.action_space.low) actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU) critic = CriticNetwork(sess, state_dim, action_dim, action_bound, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars()) if GYM_MONITOR_EN: if not RENDER_ENV: env = Monitor(env, MONITOR_DIR, video_callable=False, force=True) else: env = Monitor(env, MONITOR_DIR, force=True) train(sess, env, actor, critic)
def main(_): with tf.compat.v1.Session() as sess: env = StageWorld(LASER_BEAM, map_type) np.random.seed(RANDOM_SEED) tf.compat.v1.set_random_seed(RANDOM_SEED) state_dim = LASER_BEAM * LASER_HIST + SPEED + TARGET action_dim = ACTION #action_bound = [0.25, np.pi/6] #bounded acceleration action_bound = [0.5, np.pi / 3] #bounded velocity switch_dim = SWITCH discrete = False print('Continuous Action Space') actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU) critic = CriticNetwork(sess, state_dim, action_dim, switch_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars()) noise = Noise(DELTA, SIGMA, OU_A, OU_MU) reward = Reward(REWARD_FACTOR, GAMMA) try: train(sess, env, actor, critic, noise, reward, discrete, action_bound) except KeyboardInterrupt: pass
def main(_): with tf.Session() as sess: env = gym.make(ENV_NAME) np.random.seed(RANDOM_SEED) tf.set_random_seed(RANDOM_SEED) env.seed(RANDOM_SEED) print(env.observation_space) print(env.action_space) state_dim = env.observation_space.shape[0] try: action_dim = env.action_space.shape[0] action_bound = env.action_space.high # Ensure action bound is symmetric assert (env.action_space.high == -env.action_space.low) discrete = False print('Continuous Action Space') except AttributeError: action_dim = env.action_space.n action_bound = 1 discrete = True print('Discrete Action Space') actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU) critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars()) noise = Noise(DELTA, SIGMA, OU_A, OU_MU) reward = Reward(REWARD_FACTOR, GAMMA)
def start(args, counter=None):
    # Initialize the actor, critic and difference networks
    conf = open_config_file(args)

    if "learning_rate" in conf['experiment']:
        critic_learning_rate = conf["experiment"]["learning_rate"]
        actor_learning_rate = critic_learning_rate / 10

    # read the difference_model flag only when it is present in the config
    if "difference_model" in conf['experiment']:
        counter = conf["experiment"]["difference_model"]

    print(actor_learning_rate, critic_learning_rate)

    with tf.Graph().as_default() as ddpg:
        actor = ActorNetwork(OBSERVATION_DIMS, ACTION_DIMS, 1,
                             actor_learning_rate, TAU)
        critic = CriticNetwork(OBSERVATION_DIMS, ACTION_DIMS,
                               critic_learning_rate, TAU,
                               actor.get_num_trainable_vars())

    if counter:
        with tf.Graph().as_default() as diff_model:
            model = DifferenceModel(STATE_DIMS + ACTION_DIMS, STATE_DIMS)
        train(args, ddpg, actor, critic, counter=counter,
              diff_model=diff_model, model=model)
    else:
        train(args, ddpg, actor, critic)
def __init__(self, env, args): self.env = env self.memory_buffer = ReplayBuffer(args.buffer_size) self.learning_rate_actor = args.lr_actor self.learning_rate_critic = args.lr_critic self.tau = args.TAU self.batch_size = args.batch_size self.discount = args.discount self.states_ph = tf.placeholder(tf.float32, shape=(None, 1)) self.actions_ph = tf.placeholder(tf.float32, shape=((None, ) + self.env.action_space.shape)) self.is_training_ph = tf.placeholder_with_default(True, shape=None) self.Actor = ActorNetwork(env=self.env, states=self.states_ph, LR=self.learning_rate_actor, TAU=self.tau, discount=self.discount, scope="actor_main", batch_size=self.batch_size, is_training=self.is_training_ph) self.Critic = CriticNetwork(env=self.env, states=self.states_ph, actions=self.actions_ph, LR=self.learning_rate_critic, TAU=self.tau, discount=self.discount, scope="critic_main", batch_size=self.batch_size, is_training=self.is_training_ph) self.Actor_target = ActorNetwork(env=self.env, states=self.states_ph, LR=self.learning_rate_actor, TAU=self.tau, discount=self.discount, scope="actor_target", batch_size=self.batch_size, is_training=self.is_training_ph) self.Critic_target = CriticNetwork(env=self.env, states=self.states_ph, actions=self.actions_ph, LR=self.learning_rate_critic, TAU=self.tau, discount=self.discount, scope="critic_target", batch_size=self.batch_size, is_training=self.is_training_ph)
def __init__(self, state_size, batch_size, is_eval=False): self.state_size = state_size # normalized previous days self.action_size = 3 # sit, buy, sell self.memory_size = 1000000 # Replay memory size self.batch_size = batch_size self.replay_memory = ExperienceReplay(self.memory_size, self.batch_size) self.inventory = [] self.is_eval = is_eval # Whether or not Training is ongoing self.gamma = 0.99 # Discount factor in Bellman equation # Actor Policy model mapping states to actions self.actor = ActorNetwork( self.state_size, self.action_size) # Instantiates the Actor networks # Critic(Value) Model that maps state action pairs to Q values. self.critic = CriticNetwork( self.state_size, self.action_size) # Instantiate the critic model
def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]

        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure action bound is symmetric
            assert (env.action_space.high == -env.action_space.low)
            discrete = False
            print('Continuous Action Space')
        except:  # the original exception type did not cover what is raised here, so catch everything
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print('Discrete Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                gym.wrappers.Monitor(env, MONITOR_DIR, video_callable=False,
                                     force=True)  # switched to the newer gym.wrappers API
                # env.monitor.start(MONITOR_DIR, video_callable=False, force=True)
            else:
                gym.wrappers.Monitor(env, force=True)  # switched to the newer gym.wrappers API
                # env.monitor.start(MONITOR_DIR, force=True)

        try:
            train(sess, env, actor, critic, noise, reward, discrete)
        except KeyboardInterrupt:
            pass

        if GYM_MONITOR_EN:
            env.monitor.close()
def __init__(self, sess, is_train, dim_state, dim_action, num_paths, actor_learn_rate, critic_learn_rate, tau, buffer_size, mini_batch, ep_begin, epsilon_end, gamma, max_epoch, seed=66): self.__is_train = is_train self.__dim_state = dim_state self.__dim_action = dim_action self.__mini_batch = mini_batch self.__ep_begin = ep_begin self.__gamma = gamma self.__max_epoch = max_epoch self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0, actor_learn_rate, tau, num_paths) self.__critic = CriticNetwork(sess, dim_state, dim_action, critic_learn_rate, tau) self.__replay = ReplayBuffer(buffer_size, seed) self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch, dim_action, num_paths, seed) self.__state_curt = np.zeros(dim_state) self.__action_curt = self.__explorer.convert_action( np.ones(dim_action)) self.__episode = 0 self.__step = 0
def __init__(self, task, sess):
    self.sess = sess
    self.env = task

    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    self.actor_lr = 0.0001
    self.tau = 0.001
    self.minibatch_size = 64
    self.critic_lr = 0.001
    self.gamma = 0.99
    self.buffer_size = 1000000
    self.random_seed = 1234
    self.summary_dir = "/"
    #self.max_episode = 100
    #self.max_episode_len = 100
    self.mu = 0

    self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                              self.action_low, self.action_high,
                              self.actor_lr, self.tau, self.minibatch_size)
    self.critic = CriticNetwork(self.sess, self.state_size, self.action_size,
                                self.critic_lr, self.tau, self.gamma,
                                self.actor.get_num_trainable_vars())

    # Initialize replay memory
    self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)

    self.sess.run(tf.global_variables_initializer())
    self.actor.update_target_network()
    self.critic.update_target_network()

    self.noise = OUNoise(self.action_size, self.mu)
    # running tf.global_variables_initializer() again at this point would
    # re-randomize the freshly synced target networks, so it is not repeated
def main(_): with tf.Session() as sess: env = gym.make(ENV_NAME) np.random.seed(RANDOM_SEED) tf.set_random_seed(RANDOM_SEED) env.seed(RANDOM_SEED) print env.observation_space print env.action_space state_dim = env.observation_space.shape[0] try: action_dim = env.action_space.shape[0] action_bound = env.action_space.high # Ensure action bound is symmetric assert (env.action_space.high == -env.action_space.low) discrete = False print "Continuous Action Space" except AttributeError: action_dim = env.action_space.n action_bound = 1 discrete = True print "Discrete Action Space" actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU) critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars()) noise = Noise(DELTA, SIGMA, OU_A, OU_MU) reward = Reward(REWARD_FACTOR, GAMMA) if GYM_MONITOR_EN: if not RENDER_ENV: env = wrappers.Monitor(env, MONITOR_DIR, force=True) else: env = wrappers.Monitor(env, MONITOR_DIR, force=True) try: train(sess, env, actor, critic, noise, reward, discrete) except KeyboardInterrupt: pass #if GYM_MONITOR_EN: #env.monitor.close() env.close() gym.upload(MONITOR_DIR, api_key="sk_JObiOSHpRjw48FpWvI1GA")
def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.Session() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) self.sess.run(tf.global_variables_initializer()) #target_param <- eval_param self.actor_network.update_target() self.critic_network.update_target()
def main(args): now = datetime.utcnow().strftime("%b_%d_%H_%M_%S") monitor_dir = os.path.join('videos', args['env'], "no-of-update_" + args["no_of_updates"], "random_seed" + str(args["random_seed"])) logger = Logger(logname=args['env'], args=args, now=now) with tf.Session() as sess: env = gym.make(args['env']) monitor_env = gym.make(args['env']) np.random.seed(int(args['random_seed'])) tf.set_random_seed(int(args['random_seed'])) env.seed(int(args['random_seed'])) monitor_env.seed(int(args['random_seed'])) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] action_bound = env.action_space.high print("****** state dimension", state_dim) print("****** actions dimension", action_dim) print("****** actions high bound", action_bound) # Ensure action bound is symmetric assert (np.array_equal(env.action_space.high, -env.action_space.low)) actor = ActorNetwork(sess, state_dim, action_dim, action_bound, float(args['actor_lr']), float(args['tau']), int(args['minibatch_size'])) critic = CriticNetwork(sess, state_dim, action_dim, float(args['critic_lr']), float(args['tau']), float(args['gamma']), actor.get_num_trainable_vars()) actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim)) if args['use_gym_monitor']: monitor_env = wrappers.Monitor(monitor_env, monitor_dir, force=True) train(sess, env, args, actor, critic, actor_noise, logger, monitor_env) logger.close() if args['use_gym_monitor']: env.monitor.close() monitor_env.monitor.close()
def main():
    tf.reset_default_graph()
    with tf.Session() as sess:
        env = Env(players=5)
        np.random.seed(1)
        tf.set_random_seed(1)

        state_dim = env.observation_space
        action_dim = env.action_space

        # Ensure action bound is symmetric
        crlr = 0.001
        aclr = 0.001
        tau = 0.001

        actor = ActorNetwork(sess, state_dim, action_dim, 15, 32, tau, aclr)
        critic = CriticNetwork(sess, state_dim, action_dim, 15, 32, tau, crlr)
        actor_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros((env.players, action_dim)))

        train(sess, env, actor, critic, actor_noise)
def main(_): t1 = time.time() # Training the model with tf.Session() as sess: env = PowerSystem() # System Info state_dim = 20 # We only consider the Current of all line as state at this moment action_dim = 4 # The number of generators action_bound = np.array([[-1, 1], [-0.675, 0.675]]) actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU) critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars()) saver = tf.train.Saver() noise = Noise(DELTA, SIGMA, OU_A, OU_MU) # Training the model train(sess, env, actor, critic, noise, action_bound) # # save the variables save_path = saver.save(sess, model_path) # print("[+] Model saved in file: %s" % save_path) # # Testing the model # with tf.Session() as sess: # # env = PowerSystem() # # System Info # state_dim = 11 # We only consider the Current of all line as state at this moment # action_dim = 2 # The number of generators # action_bound = np.array([[-1, 1], [-0.675, 0.675]]) # # actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU) # saver = tf.train.Saver() # load_path = saver.restore(sess, model_path) # test(env, actor) print('Running time: {} minutes.'.format((time.time() - t1) / 60))
def start(args, counter=None):
    # Initialize the actor, critic and difference networks
    with tf.Graph().as_default() as ddpg:
        actor = ActorNetwork(OBSERVATION_DIMS, ACTION_DIMS, 1,
                             global_params.actor_learning_rate, TAU)
        critic = CriticNetwork(OBSERVATION_DIMS, ACTION_DIMS,
                               global_params.critic_learning_rate, TAU,
                               actor.get_num_trainable_vars())

    if counter:
        with tf.Graph().as_default() as diff_model:
            model = DifferenceModel(STATE_DIMS + ACTION_DIMS, STATE_DIMS)
        train(args, ddpg, actor, critic, counter=counter,
              diff_model=diff_model, model=model)
    else:
        train(args, ddpg, actor, critic)
def main(_): # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3) # with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: with tf.Session() as sess: env = StageWorld(LASER_BEAM) np.random.seed(RANDOM_SEED) tf.set_random_seed(RANDOM_SEED) state_dim = LASER_BEAM * LASER_HIST + SPEED + TARGET action_dim = ACTION action_bound = [0.5, np.pi / 3] switch_dim = SWITCH discrete = False print('Continuous Action Space') with tf.name_scope("Actor"): actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU) with tf.name_scope("Critic"): critic = CriticNetwork(sess, state_dim, action_dim, switch_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars(), baseline_rate=10., control_variance_flag=CONTROL_VARIANCE) noise = Noise(DELTA, SIGMA, OU_A, OU_MU) reward = Reward(REWARD_FACTOR, GAMMA) try: train(sess, env, actor, critic, noise, reward, discrete, action_bound) except KeyboardInterrupt: pass
def actor_critic(epochs=1000, GAMMA=0.99, load_file=False, render=False, temp=False, verbose=False): with tf.Session() as sess: # define objects # the gym environment is wrapped in a class. this way of working allows portability with other robots in the lab & makes the main very clear #robot = gym_pendulum(render, temp) robot = gym_mountaincar(render, temp) actor = ActorNetwork(sess, robot.state_dim, robot.action_dim, ACTOR_LEARNING_RATE, ACTION_BOUND, device=DEVICE) critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE, actor.get_num_trainable_vars(), device=DEVICE) # starting tensorflow sess.run(tf.global_variables_initializer()) if load_file: actor.recover_actor() critic.recover_critic() for i in range(epochs): # Reset the environment state, done, step = robot.reset() ep_reward = 0 while (not done): # Choose and take action, and observe reward action, mu, sigma = actor.predict( np.reshape(state, (1, robot.state_dim))) new_action = action + 0.2 * (np.random.rand(1)[0]) action_noise = np.clip(new_action, -ACTION_BOUND, ACTION_BOUND) # print(round(action,3), round(new_action,3), round(action_noise,3), round(mu,3), round(sigma,3)) next_state, reward, done, step = robot.update(action_noise) # Train V_minib = critic.predict( np.reshape(state, (1, robot.state_dim))) V_minib_next = critic.predict( np.reshape(next_state, (1, robot.state_dim))) if done: td_target = reward td_error = reward - V_minib # not - V_minib[k] ? else: td_target = reward + GAMMA * V_minib_next td_error = reward + GAMMA * V_minib_next - V_minib #critic.train(np.reshape(state, (1, robot.state_dim)), np.reshape(td_target, (1, 1))) critic.train(np.reshape(state, (1, robot.state_dim)), np.reshape(td_target, (1, 1))) actor.train(np.reshape(state, (1, robot.state_dim)), np.reshape(action, (1, 1)), np.reshape(td_error, (1, 1))) state = next_state ep_reward = ep_reward + reward # this print is usefull for debuggin if verbose: print(step, 'action', round(action, 3), 'state', round(robot.state[0], 3), round(robot.state[1], 3), 'r', round(reward, 3)) print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward, 'goal achieved:', robot.goal, 'Efficiency', round(100. * ((robot.goal) / (i + 1.)), 0), '%') #time.sleep(1) print('*************************') print('now we save the model') critic.save_critic() actor.save_actor() print('model saved succesfuly') print('*************************')
def trainer(epochs=1000, MINIBATCH_SIZE=40, GAMMA = 0.99, epsilon=1.0, min_epsilon=0.01, BUFFER_SIZE=10000, train_indicator=True, render=False): with tf.Session() as sess: # configuring environment env = gym.make(ENV_NAME) # configuring the random processes np.random.seed(RANDOM_SEED) tf.set_random_seed(RANDOM_SEED) env.seed(RANDOM_SEED) # info of the environment to pass to the agent state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] action_bound = np.float64(10) # I choose this number since the mountain continuos does not have a boundary # Creating agent ruido = OUNoise(action_dim, mu = 0.4) # this is the Ornstein-Uhlenbeck Noise actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU, DEVICE) critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars(), DEVICE) sess.run(tf.global_variables_initializer()) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) goal = 0 max_state = -1. try: critic.recover_critic() actor.recover_actor() print('********************************') print('models restored succesfully') print('********************************') except: pass # print('********************************') # print('Failed to restore models') # print('********************************') for i in range(epochs): state = env.reset() state = np.hstack(state) ep_reward = 0 ep_ave_max_q = 0 done = False step = 0 max_state_episode = -1 epsilon -= (epsilon/EXPLORE) epsilon = np.maximum(min_epsilon,epsilon) while (not done): if render: env.render() #print('step', step) # 1. get action with actor, and add noise action_original = actor.predict(np.reshape(state,(1,state_dim))) # + (10. / (10. + i))* np.random.randn(1) action = action_original + max(epsilon,0)*ruido.noise() # remove comment if you want to see a step by step update # print(step,'a',action_original, action,'s', state[0], 'max state', max_state_episode) # 2. take action, see next state and reward : next_state, reward, done, info = env.step(action) if train_indicator: # 3. Save in replay buffer: replay_buffer.add(np.reshape(state, (actor.s_dim,)), np.reshape(action, (actor.a_dim,)), reward, done, np.reshape(next_state, (actor.s_dim,))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: # 4. sample random minibatch of transitions: s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE) # Calculate targets # 5. Train critic Network (states,actions, R + gamma* V(s', a')): # 5.1 Get critic prediction = V(s', a') # the a' is obtained using the actor prediction! or in other words : a' = actor(s') target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch)) # 5.2 get y_t where: y_i = [] for k in range(MINIBATCH_SIZE): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + GAMMA * target_q[k]) # 5.3 Train Critic! predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) ep_ave_max_q += np.amax(predicted_q_value) # 6 Compute Critic gradient (depends on states and actions) # 6.1 therefore I first need to calculate the actions the current actor would take. 
a_outs = actor.predict(s_batch) # 6.2 I calculate the gradients grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() state = next_state if next_state[0] > max_state_episode: max_state_episode = next_state[0] ep_reward = ep_reward + reward step +=1 if done: ruido.reset() if state[0] > 0.45: #print('****************************************') #print('got it!') #print('****************************************') goal += 1 if max_state_episode > max_state: max_state = max_state_episode print('th',i+1,'n steps', step,'R:', round(ep_reward,3),'Pos', round(epsilon,3),'Efficiency', round(100.*((goal)/(i+1.)),3) ) # print('Efficiency', 100.*((goal)/(i+1.))) print('*************************') print('now we save the model') critic.save_critic() actor.save_actor() print('model saved succesfuly') print('*************************')
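# --- Hedged sketch (not part of the original snippets) ---
# The trainer above relies on a ReplayBuffer with add(s, a, r, t, s2),
# size() and sample_batch(batch_size) -> (s, a, r, t, s2) arrays, but its
# implementation is never shown.  A minimal version matching those calls
# could look like the following; the deque-based storage and uniform
# sampling are assumptions, not the original class.
import random
from collections import deque

import numpy as np


class ReplayBuffer(object):
    def __init__(self, buffer_size, random_seed=123):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)  # oldest transitions are dropped when full
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        # store one transition (state, action, reward, terminal flag, next state)
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # uniform random minibatch; callers above check size() > batch_size first
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        s_batch = np.array([e[0] for e in batch])
        a_batch = np.array([e[1] for e in batch])
        r_batch = np.array([e[2] for e in batch])
        t_batch = np.array([e[3] for e in batch])
        s2_batch = np.array([e[4] for e in batch])
        return s_batch, a_batch, r_batch, t_batch, s2_batch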
class DDPG: def __init__(self): # Make sure all the directories exist if not tf.gfile.Exists(TFLOG_PATH): tf.gfile.MakeDirs(TFLOG_PATH) if not tf.gfile.Exists(EXPERIENCE_PATH): tf.gfile.MakeDirs(EXPERIENCE_PATH) if not tf.gfile.Exists(NET_SAVE_PATH): tf.gfile.MakeDirs(NET_SAVE_PATH) # Initialize our session self.session = tf.Session() self.graph = self.session.graph with self.graph.as_default(): # View the state batches self.visualize_input = VISUALIZE_BUFFER if self.visualize_input: self.viewer = CostmapVisualizer() # Hardcode input size and action size self.height = 86 self.width = self.height self.depth = 4 self.action_dim = 2 # Initialize the current action and the old action and old state for setting experiences self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8') self.old_action = np.ones(2, dtype='float') self.network_action = np.zeros(2, dtype='float') self.noise_action = np.zeros(2, dtype='float') self.action = np.zeros(2, dtype='float') # Initialize the grad inverter object to keep the action bounds self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session) # Make sure the directory for the data files exists if not tf.gfile.Exists(DATA_PATH): tf.gfile.MakeDirs(DATA_PATH) # Initialize summary writers to plot variables during training self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH) # Initialize actor and critic networks self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) # Initialize the saver to save the network params self.saver = tf.train.Saver() # initialize the experience data manger self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session) # Uncomment if collecting a buffer for the autoencoder # self.buffer = deque() # Should we load the pre-trained params? # If so: Load the full pre-trained net # Else: Initialize all variables the overwrite the conv layers with the pretrained filters if PRE_TRAINED_NETS: self.saver.restore(self.session, NET_LOAD_PATH) else: self.session.run(tf.initialize_all_variables()) tf.train.start_queue_runners(sess=self.session) time.sleep(1) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA) self.noise_flag = True # Initialize time step self.training_step = 0 # Flag: don't learn the first experience self.first_experience = True # After the graph has been filled add it to the summary writer self.summary_writer.add_graph(self.graph) def train(self): # Check if the buffer is big enough to start training if self.data_manager.enough_data(): # get the next random batch from the data manger state_batch, \ action_batch, \ reward_batch, \ next_state_batch, \ is_episode_finished_batch = self.data_manager.get_next_batch() state_batch = np.divide(state_batch, 100.0) next_state_batch = np.divide(next_state_batch, 100.0) # Are we visualizing the first state batch for debugging? 
# If so: We have to scale up the values for grey scale before plotting if self.visualize_input: state_batch_np = np.asarray(state_batch) state_batch_np = np.multiply(state_batch_np, -100.0) state_batch_np = np.add(state_batch_np, 100.0) self.viewer.set_data(state_batch_np) self.viewer.run() self.visualize_input = False # Calculate y for the td_error of the critic y_batch = [] next_action_batch = self.actor_network.target_evaluate(next_state_batch) q_value_batch = self.critic_network.target_evaluate(next_state_batch, next_action_batch) for i in range(0, BATCH_SIZE): if is_episode_finished_batch[i]: y_batch.append([reward_batch[i]]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Now that we have the y batch lets train the critic self.critic_network.train(y_batch, state_batch, action_batch) # Get the action batch so we can calculate the action gradient with it # Then get the action gradient batch and adapt the gradient with the gradient inverting method action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.get_action_gradient(state_batch, action_batch_for_gradients) q_gradient_batch = self.grad_inv.invert(q_gradient_batch, action_batch_for_gradients) # Now we can train the actor self.actor_network.train(q_gradient_batch, state_batch) # Save model if necessary if self.training_step > 0 and self.training_step % SAVE_STEP == 0: self.saver.save(self.session, NET_SAVE_PATH, global_step=self.training_step) # Update time step self.training_step += 1 self.data_manager.check_for_enqueue() def get_action(self, state): # normalize the state state = state.astype(float) state = np.divide(state, 100.0) # Get the action self.action = self.actor_network.get_action(state) # Are we using noise? if self.noise_flag: # scale noise down to 0 at training step 3000000 if self.training_step < MAX_NOISE_STEP: self.action += (MAX_NOISE_STEP - self.training_step) / MAX_NOISE_STEP * self.exploration_noise.noise() # if action value lies outside of action bounds, rescale the action vector if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]: self.action *= np.fabs(A0_BOUNDS[0]/self.action[0]) if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]: self.action *= np.fabs(A1_BOUNDS[0]/self.action[1]) # Life q value output for this action and state self.print_q_value(state, self.action) return self.action def set_experience(self, state, reward, is_episode_finished): # Make sure we're saving a new old_state for the first experience of every episode if self.first_experience: self.first_experience = False else: self.data_manager.store_experience_to_file(self.old_state, self.old_action, reward, state, is_episode_finished) # Uncomment if collecting data for the auto_encoder # experience = (self.old_state, self.old_action, reward, state, is_episode_finished) # self.buffer.append(experience) if is_episode_finished: self.first_experience = True self.exploration_noise.reset() # Safe old state and old action for next experience self.old_state = state self.old_action = self.action def print_q_value(self, state, action): string = "-" q_value = self.critic_network.evaluate([state], [action]) stroke_pos = 30 * q_value[0][0] + 30 if stroke_pos < 0: stroke_pos = 0 elif stroke_pos > 60: stroke_pos = 60 print '[' + stroke_pos * string + '|' + (60-stroke_pos) * string + ']', "Q: ", q_value[0][0], \ "\tt: ", self.training_step
def __init__(self): # Make sure all the directories exist if not tf.gfile.Exists(TFLOG_PATH): tf.gfile.MakeDirs(TFLOG_PATH) if not tf.gfile.Exists(EXPERIENCE_PATH): tf.gfile.MakeDirs(EXPERIENCE_PATH) if not tf.gfile.Exists(NET_SAVE_PATH): tf.gfile.MakeDirs(NET_SAVE_PATH) # Initialize our session self.session = tf.Session() self.graph = self.session.graph with self.graph.as_default(): # View the state batches self.visualize_input = VISUALIZE_BUFFER if self.visualize_input: self.viewer = CostmapVisualizer() # Hardcode input size and action size self.height = 86 self.width = self.height self.depth = 4 self.action_dim = 2 # Initialize the current action and the old action and old state for setting experiences self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8') self.old_action = np.ones(2, dtype='float') self.network_action = np.zeros(2, dtype='float') self.noise_action = np.zeros(2, dtype='float') self.action = np.zeros(2, dtype='float') # Initialize the grad inverter object to keep the action bounds self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session) # Make sure the directory for the data files exists if not tf.gfile.Exists(DATA_PATH): tf.gfile.MakeDirs(DATA_PATH) # Initialize summary writers to plot variables during training self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH) # Initialize actor and critic networks self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) # Initialize the saver to save the network params self.saver = tf.train.Saver() # initialize the experience data manger self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session) # Uncomment if collecting a buffer for the autoencoder # self.buffer = deque() # Should we load the pre-trained params? # If so: Load the full pre-trained net # Else: Initialize all variables the overwrite the conv layers with the pretrained filters if PRE_TRAINED_NETS: self.saver.restore(self.session, NET_LOAD_PATH) else: self.session.run(tf.initialize_all_variables()) tf.train.start_queue_runners(sess=self.session) time.sleep(1) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA) self.noise_flag = True # Initialize time step self.training_step = 0 # Flag: don't learn the first experience self.first_experience = True # After the graph has been filled add it to the summary writer self.summary_writer.add_graph(self.graph)
class DrlAgent: def __init__(self, sess, is_train, dim_state, dim_action, num_paths, actor_learn_rate, critic_learn_rate, tau, buffer_size, mini_batch, ep_begin, epsilon_end, gamma, max_epoch, seed=66): self.__is_train = is_train self.__dim_state = dim_state self.__dim_action = dim_action self.__mini_batch = mini_batch self.__ep_begin = ep_begin self.__gamma = gamma self.__max_epoch = max_epoch self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0, actor_learn_rate, tau, num_paths) self.__critic = CriticNetwork(sess, dim_state, dim_action, critic_learn_rate, tau) self.__replay = ReplayBuffer(buffer_size, seed) self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch, dim_action, num_paths, seed) self.__state_curt = np.zeros(dim_state) self.__action_curt = self.__explorer.convert_action( np.ones(dim_action)) self.__episode = 0 self.__step = 0 def target_paras_init(self): self.__actor.update_target_paras() self.__critic.update_target_paras() def predict(self, state, reward): action_original = self.__actor.predict([state])[0] if not self.__is_train: return action_original action = self.__explorer.get_act(action_original) self.__replay.add(self.__state_curt, self.__action_curt, reward, state) self.__state_curt = state self.__action_curt = action if len(self.__replay) > self.__mini_batch: self.train() self.__step += 1 if self.__step >= self.__max_epoch: self.__step = 0 self.__episode += 1 self.__explorer.reset_ep(self.__ep_begin) return action def train(self): batch_state, batch_action, batch_reward, batch_state_next = self.__replay.sample_batch( self.__mini_batch) weights = [1.0] * self.__mini_batch weights = np.expand_dims(weights, axis=1) target_q = self.__critic.predict_target( batch_state_next, self.__actor.predict_target(batch_state_next)) value_q = self.__critic.predict(batch_state, batch_action) batch_y = [] batch_error = [] for k in range(len(batch_reward)): target_y = batch_reward[k] + self.__gamma * target_q[k] batch_error.append(abs(target_y - value_q[k])) batch_y.append(target_y) predicted_q, _ = self.__critic.train(batch_state, batch_action, batch_y, weights) a_outs = self.__actor.predict(batch_state) grads = self.__critic.calculate_gradients(batch_state, a_outs) weighted_grads = weights * grads[0] self.__actor.train(batch_state, weighted_grads) self.__actor.update_target_paras() self.__critic.update_target_paras()
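# --- Hedged sketch (not part of the original snippets) ---
# DrlAgent calls update_target_paras() on its actor and critic (other
# agents above call update_target_network()), but the op itself is not
# shown.  In DDPG this is usually a "soft" Polyak update,
# target_w <- tau * online_w + (1 - tau) * target_w.  A minimal TF1-style
# construction is sketched below; the variable lists and attribute names
# in the usage comment are assumptions, not the networks' real internals.
import tensorflow as tf


def build_soft_update_ops(online_vars, target_vars, tau):
    """Return one grouped op nudging each target variable towards its online twin."""
    update_ops = [
        t_var.assign(tau * o_var + (1.0 - tau) * t_var)
        for o_var, t_var in zip(online_vars, target_vars)
    ]
    return tf.group(*update_ops)


# Hypothetical usage inside a network class (TF1 graph mode):
#   self.update_target_op = build_soft_update_ops(
#       self.network_params, self.target_network_params, tau)
#
#   def update_target_paras(self):
#       self.sess.run(self.update_target_op)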
signal.signal(signal.SIGINT, signal_handler) #env = gym.make('Pendulum-v0') ob_space = env.observation_space ac_space = env.action_space print("Observation space: ", ob_space, ob_space.dtype) print("Action space: ", ac_space, ac_space.n) s_size = ob_space.shape[0] a_size = ac_space.n print('size: ' + str(s_size) + '/' + str(a_size)) actor = ActorNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=ACTOR_LEARNING_RATE, n_h1=N_H1, n_h2=N_H2, tau=TAU) critic = CriticNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=CRITIC_LEARNING_RATE, n_h1=N_H1, n_h2=N_H2, tau=TAU) noise = OUProcess(ACTION_SIZE) exprep = ExpReplay(mem_size=MEM_SIZE, start_mem=START_MEM, state_size=[STATE_SIZE], kth=-1, batch_size=BATCH_SIZE) sess = tf.Session() with tf.device('/{}:0'.format('CPU')): agent = DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=ACTION_RANGE) sess.run(tf.initialize_all_variables()) for i in range(NUM_EPISODES): cur_state = env.reset() cum_reward = 0 # tensorboard summary summary_writer = tf.summary.FileWriter('/tmp/pendulum-log-0'+'/train', graph=tf.get_default_graph()) if (i % EVALUATE_EVERY) == 0:
def __init__(self): # Initialize our session self.session = tf.Session() self.graph = self.session.graph with self.graph.as_default(): # View the state batches self.visualize_input = VISUALIZE_BUFFER if self.visualize_input: self.viewer = CostmapVisualizer() # Hardcode input size and action size self.height = 86 self.width = self.height self.depth = 4 self.action_dim = 2 # Initialize the current action and the old action and old state for setting experiences self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8') self.old_action = np.ones(2, dtype='float') self.network_action = np.zeros(2, dtype='float') self.noise_action = np.zeros(2, dtype='float') self.action = np.zeros(2, dtype='float') # Initialize the grad inverter object to keep the action bounds self.action_bounds = [[0.3, 0.3], [-0.3, -0.3]] self.grad_inv = GradInverter(self.action_bounds) # Initialize summary writers to plot variables during training self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(os.path.expanduser('~')+'/tensorboard_data') # Initialize actor and critic networks self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) # Initialize the saver to save the network params self.saver = tf.train.Saver() # initialize the experience data manger self.data_manager = DataManager(self.session.graph, self.session, BATCH_SIZE) # Should we load the pre-trained params? # If so: Load the full pre-trained net # Else: Initialize all variables the overwrite the conv layers with the pretrained filters if PRE_TRAINED_NETS: self.saver.restore(self.session, NET_LOAD_PATH) else: self.session.run(tf.initialize_all_variables()) self.critic_network.restore_pretrained_weights(FILTER_LOAD_PATH) self.actor_network.restore_pretrained_weights(FILTER_LOAD_PATH) threads = tf.train.start_queue_runners(sess=self.session) time.sleep(1) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA) self.noise_flag = True # Initialize time step self.training_step = 0 # Flag: don't learn the first experience self.first_experience = True # After the graph has been filled add it to the summary writer self.summary_writer.add_graph(self.graph)
def get_params(game): game_action_size = 0 if game == 'CartPole-v1': game_action_size = 2 elif game == 'Acrobot-v1': game_action_size = 3 elif game == 'MountainCarContinuous-v0': game_action_size = 1 input_dict = { 'env': gym.make(game), 'network_state_size': 6, 'network_action_size': 3, 'game_action_size': game_action_size, 'max_episodes': 5000, 'max_steps': 500, 'discount_factor': 0.99, 'learning_rate_actor': 0.0004, 'learning_rate_value': 0.0004, 'render': False } # Initialize the actor network if game == 'CartPole-v1': input_dict['solve_threshold'] = 475 input_dict['actor'] = ActorNetworkSoftmax( input_dict['network_state_size'], input_dict['network_action_size'], input_dict['learning_rate_actor'], input_dict['game_action_size'], input_dict['env']) input_dict['critic'] = CriticNetwork(input_dict['network_state_size'], input_dict['learning_rate_value'], input_dict['env']) elif game == 'Acrobot-v1': input_dict['solve_threshold'] = -90 input_dict['actor'] = ActorNetworkSoftmax( input_dict['network_state_size'], input_dict['network_action_size'], input_dict['learning_rate_actor'], input_dict['game_action_size'], input_dict['env']) input_dict['critic'] = CriticNetwork(input_dict['network_state_size'], input_dict['learning_rate_value'], input_dict['env']) elif game == 'MountainCarContinuous-v0': input_dict['solve_threshold'] = 90 input_dict['max_steps'] = 5000 input_dict['learning_rate_actor'] = 0.00004 input_dict['actor'] = ActorNetworkRegressor( input_dict['network_state_size'], input_dict['network_action_size'], input_dict['learning_rate_actor'], input_dict['game_action_size'], input_dict['env'], is_scale=True) input_dict['critic'] = CriticNetwork(input_dict['network_state_size'], input_dict['learning_rate_value'], input_dict['env'], is_scale=True) return input_dict
with tf.Session() as sess:
    env = gym.make('LunarLanderContinuous-v2')
    env.seed(0)
    np.random.seed(0)
    tf.set_random_seed(0)

    ep = 2000
    tau = 0.001
    gamma = 0.99
    min_batch = 64
    actor_lr = 0.00005
    critic_lr = 0.0005
    buffer_size = 1000000

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high

    actor_noise = OUNoise(mu=np.zeros(action_dim))
    actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                         actor_lr, tau, min_batch)
    critic = CriticNetwork(sess, state_dim, action_dim, critic_lr, tau,
                           gamma, actor.get_num_trainable_vars())

    scores = train(sess, env, actor, critic, actor_noise, buffer_size,
                   min_batch, ep)

    plt.plot([i + 1 for i in range(0, len(scores), 4)], scores[::4])
    plt.show()
class DDPG(): def __init__(self, task, sess): self.sess = sess self.env = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.actor_lr = 0.0001 self.tau = 0.001 self.minibatch_size = 64 self.critic_lr = 0.001 self.gamma = 0.99 self.buffer_size = 1000000 self.random_seed = 1234 self.summary_dir = "/" #self.max_episode = 100 #self.max_episode_len = 100 self.mu = 0 self.actor = ActorNetwork(self.sess, self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.tau, self.minibatch_size) self.critic = CriticNetwork(self.sess, self.state_size, self.action_size, self.critic_lr, self.tau, self.gamma, self.actor.get_num_trainable_vars()) # Initialize replay memory self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed) self.sess.run(tf.global_variables_initializer()) self.actor.update_target_network() self.critic.update_target_network() self.noise = OUNoise(self.action_size, self.mu) self.sess.run(tf.global_variables_initializer()) def reset_episode(self): #self.actor_noise.reset() state = self.env.reset() self.last_state = state self.ep_ave_max_q = 0 self.ep_reward = 0 return state def step(self, s, a, r, terminal, s2): # Save experience / reward #self.memory.add(self.last_state, action, reward, next_state, done) #summary_ops, summary_vars = self.build_summaries() self.replay_buffer.add(np.reshape(s, (self.actor.s_dim, )), np.reshape(a, (self.actor.a_dim, )), r, terminal, np.reshape(s2, (self.actor.s_dim, ))) # Learn, if enough samples are available in memory if self.replay_buffer.size() > self.minibatch_size: s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay_buffer.sample_batch( self.minibatch_size) #self.train(s_batch, a_batch, r_batch, t_batch, s2_batch) target_q = self.critic.predict_target( s2_batch, self.actor.predict_target(s2_batch)) y_i = [] for k in range(self.minibatch_size): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + self.critic.gamma * target_q[k]) # Update the critic given the targets predicted_q_value, _ = self.critic.train( s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1))) #self.ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = self.actor.predict(s_batch) grads = self.critic.action_gradients(s_batch, a_outs) self.actor.train(s_batch, grads[0]) # Update target networks self.actor.update_target_network() self.critic.update_target_network() # Roll over last state and action self.last_state = s2 ''' self.ep_reward +=r if terminal: summary_str = self.sess.run( , feed_dict={summary_vars[0]: self.ep_reward, summary_vars[1]: self.ep_ave_max_q / float(j)}) writer.add_summary(summary_str, i) #writer.flush() print('| Reward: {:d} |Qmax: {:.4f}'.format(int(self.ep_reward), \ (self.ep_ave_max_q / float(j)))) ''' def act(self, states): """Returns actions for given state(s) as per current policy.""" states = np.reshape(states, [-1, self.state_size]) actions = self.actor.predict(states)[0] #actornoises = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size)) #print(actions) return actions + self.noise.sample() # add some noise for exploration def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch): target_q = self.critic.predict_target( s2_batch, self.actor.predict_target(s2_batch)) y_i = [] for k in range(self.minibatch_size): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + self.critic.gamma * target_q[k]) # Update the critic 
given the targets predicted_q_value, _ = self.critic.train( s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1))) #self.ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = self.actor.predict(s_batch) grads = self.critic.action_gradients(s_batch, a_outs) self.actor.train(s_batch, grads[0]) # Update target networks self.actor.update_target_network() self.critic.update_target_network() def build_summaries(self): episode_reward = tf.Variable(0.) tf.summary.scalar("Reward", episode_reward) episode_ave_max_q = tf.Variable(0.) tf.summary.scalar("Qmax Value", episode_ave_max_q) summary_vars = [episode_reward, episode_ave_max_q] summary_ops = tf.summary.merge_all() return summary_ops, summary_vars
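# --- Hedged sketch (not part of the original snippets) ---
# build_summaries() above creates two tf.Variables, attaches scalar
# summaries and returns tf.summary.merge_all(), but the write side only
# survives as a commented-out block in step().  A self-contained TF1
# sketch of that merge / run-with-feed_dict / write pattern follows; the
# log directory and the dummy episode loop are assumptions.
import tensorflow as tf

episode_reward = tf.Variable(0., name="episode_reward")
tf.summary.scalar("Reward", episode_reward)
episode_ave_max_q = tf.Variable(0., name="episode_ave_max_q")
tf.summary.scalar("Qmax Value", episode_ave_max_q)
summary_ops = tf.summary.merge_all()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter("./logs", sess.graph)
    for i in range(3):  # stand-in for the training episodes
        summary_str = sess.run(summary_ops,
                               feed_dict={episode_reward: 10.0 * i,
                                          episode_ave_max_q: 1.0 * i})
        writer.add_summary(summary_str, i)  # one point per episode
    writer.flush()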
class DDPG: def __init__(self, pretrain=False): # Make sure all the directories exist if not tf.gfile.Exists(TFLOG_PATH): tf.gfile.MakeDirs(TFLOG_PATH) if not tf.gfile.Exists(EXPERIENCE_PATH): tf.gfile.MakeDirs(EXPERIENCE_PATH) if not tf.gfile.Exists(NET_SAVE_PATH): tf.gfile.MakeDirs(NET_SAVE_PATH) # Initialize our session config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.allow_growth = True self.session = tf.Session(config=config) # self.session = tf.Session() self.graph = self.session.graph with self.graph.as_default(): # View the state batches # self.visualize_input = VISUALIZE_BUFFER # if self.visualize_input: # self.viewer = CostmapVisualizer() # Hardcode input size and action size self.height = 662 self.width = 1 self.depth = 4 self.action_dim = 2 # Initialize the current action and the old action and old state for setting experiences self.old_state = np.zeros((self.width, self.height, self.depth), dtype='float32') self.old_action = np.ones(2, dtype='float32') self.network_action = np.zeros(2, dtype='float32') self.noise_action = np.zeros(2, dtype='float32') self.action = np.zeros(2, dtype='float32') # Initialize the grad inverter object to keep the action bounds self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session) # Make sure the directory for the data files exists if not tf.gfile.Exists(DATA_PATH): tf.gfile.MakeDirs(DATA_PATH) # Initialize summary writers to plot variables during training self.summary_op = tf.summary.merge_all() self.summary_writer = tf.summary.FileWriter(TFLOG_PATH) # Initialize actor and critic networks self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) # Initialize the saver to save the network params self.saver = tf.train.Saver() # initialize the experience data manger self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session) # Uncomment if collecting a buffer for the autoencoder # self.buffer = deque() # Should we load the pre-trained params? # If so: Load the full pre-trained net # Else: Initialize all variables the overwrite the conv layers with the pretrained filters if PRE_TRAINED_NETS: self.saver.restore(self.session, NET_LOAD_PATH) else: self.session.run(tf.initialize_all_variables()) tf.train.start_queue_runners(sess=self.session) time.sleep(1) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA) self.noise_flag = True # Initialize time step self.training_step = 0 # Flag: don't learn the first experience self.first_experience = True # After the graph has been filled add it to the summary writer self.summary_writer.add_graph(self.graph) def train(self): # Check if the buffer is big enough to start training if self.data_manager.enough_data(): # start_ = time.time() # get the next random batch from the data manger state_batch, \ action_batch, \ reward_batch, \ next_state_batch, \ is_episode_finished_batch = self.data_manager.get_next_batch() state_batch = np.divide(state_batch, 10.0) next_state_batch = np.divide(next_state_batch, 10.0) # Are we visualizing the first state batch for debugging? 
# If so: We have to scale up the values for grey scale before plotting # if self.visualize_input: # state_batch_np = np.asarray(state_batch) # state_batch_np = np.multiply(state_batch_np, -100.0) # state_batch_np = np.add(state_batch_np, 100.0) # self.viewer.set_data(state_batch_np) # self.viewer.run() # self.visualize_input = False # Calculate y for the td_error of the critic # start = time.time() y_batch = [] next_action_batch = self.actor_network.target_evaluate( next_state_batch, action_batch) q_value_batch = self.critic_network.target_evaluate( next_state_batch, next_action_batch) # done = time.time() # elapsed = done - start # print "forward actor and critic time is: ", elapsed for i in range(0, BATCH_SIZE): if is_episode_finished_batch[i]: y_batch.append([reward_batch[i]]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Now that we have the y batch lets train the critic # start = time.time() self.critic_network.train(y_batch, state_batch, action_batch) # done = time.time() # elapsed = done - start # print "train critic time is: ", elapsed # self.critic_network.train(y_batch, state_batch, action_batch) # Get the action batch so we can calculate the action gradient with it # Then get the action gradient batch and adapt the gradient with the gradient inverting method # start = time.time() action_batch_for_gradients = self.actor_network.evaluate( state_batch, action_batch) # done = time.time() # elapsed = done - start # print "forward action after critic training time is: ", elapsed q_gradient_batch = self.critic_network.get_action_gradient( state_batch, action_batch_for_gradients) q_gradient_batch = self.grad_inv.invert( q_gradient_batch, action_batch_for_gradients) # Now we can train the actor # start = time.time() self.actor_network.train(q_gradient_batch, state_batch, action_batch) # done = time.time() # elapsed = done - start # print "train actor time is: ", elapsed # done = time.time() # elapsed = done - start_ # print "====== total time is: ", elapsed # Save model if necessary if self.training_step > 0 and self.training_step % SAVE_STEP == 0: self.saver.save(self.session, NET_SAVE_PATH, global_step=self.training_step) # Update time step self.training_step += 1 if self.training_step % 400 == 0: print "iter: ", self.training_step # start_ = time.time() self.data_manager.check_for_enqueue() # done = time.time() # elapsed = done - start_ # print "############ check enqueue time is: ", elapsed def get_action(self, state, old_action): # normalize the state state = state.astype(float) state = np.divide(state, 10.0) # Get the action self.action = self.actor_network.get_action(state, old_action) self.action = self.action.reshape((2, )) # Are we using noise? 
if self.noise_flag: # scale noise down to 0 at training step 3000000 self.action = 0.8 * self.exploration_noise.noise() # if self.training_step < MAX_NOISE_STEP: # self.action += (MAX_NOISE_STEP - self.training_step) / \ # MAX_NOISE_STEP * self.exploration_noise.noise() # if action value lies outside of action bounds, rescale the action vector # if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]: # self.action *= np.fabs(A0_BOUNDS[0] / self.action[0]) # if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]: # self.action *= np.fabs(A1_BOUNDS[0] / self.action[1]) # Life q value output for this action and state self.print_q_value(state, self.action) return self.action def set_experience(self, state, reward, is_episode_finished): # Make sure we're saving a new old_state for the first experience of every episode if self.first_experience: self.first_experience = False else: state.astype('float32') self.old_action.astype('float32') self.old_action.astype('float32') self.data_manager.store_experience_to_file(self.old_state, self.old_action, reward, state, is_episode_finished) # Uncomment if collecting data for the auto_encoder # experience = (self.old_state, self.old_action, reward, state, is_episode_finished) # self.buffer.append(experience) if is_episode_finished: self.first_experience = True self.exploration_noise.reset() # Safe old state and old action for next experience self.old_state = state self.old_action = self.action def print_q_value(self, state, action): string = "-" q_value = self.critic_network.evaluate([state], [action]) stroke_pos = 30 * q_value[0][0] + 30 if stroke_pos < 0: stroke_pos = 0 elif stroke_pos > 60: stroke_pos = 60
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.Session() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) self.sess.run(tf.global_variables_initializer()) #target_param <- eval_param self.actor_network.update_target() self.critic_network.update_target() def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.sample(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.size > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
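# --- Hedged sketch (not part of the original snippets) ---
# The DDPG class above explores with OUNoise(self.action_dim) through
# noise_action() and resets it in perceive(), but the process itself is
# never shown.  A minimal Ornstein-Uhlenbeck process matching that
# noise()/reset() interface is sketched here; the default theta and sigma
# values are assumptions, and sample() is only an alias for the agents
# above that call it under that name.
import numpy as np


class OUNoise(object):
    """Temporally correlated exploration noise for continuous actions."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # start each episode from the long-run mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), then x <- x + dx
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

    def sample(self):
        return self.noise()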
def __init__(self, pretrain=False): # Make sure all the directories exist if not tf.gfile.Exists(TFLOG_PATH): tf.gfile.MakeDirs(TFLOG_PATH) if not tf.gfile.Exists(EXPERIENCE_PATH): tf.gfile.MakeDirs(EXPERIENCE_PATH) if not tf.gfile.Exists(NET_SAVE_PATH): tf.gfile.MakeDirs(NET_SAVE_PATH) # Initialize our session config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.allow_growth = True self.session = tf.Session(config=config) # self.session = tf.Session() self.graph = self.session.graph with self.graph.as_default(): # View the state batches # self.visualize_input = VISUALIZE_BUFFER # if self.visualize_input: # self.viewer = CostmapVisualizer() # Hardcode input size and action size self.height = 662 self.width = 1 self.depth = 4 self.action_dim = 2 # Initialize the current action and the old action and old state for setting experiences self.old_state = np.zeros((self.width, self.height, self.depth), dtype='float32') self.old_action = np.ones(2, dtype='float32') self.network_action = np.zeros(2, dtype='float32') self.noise_action = np.zeros(2, dtype='float32') self.action = np.zeros(2, dtype='float32') # Initialize the grad inverter object to keep the action bounds self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session) # Make sure the directory for the data files exists if not tf.gfile.Exists(DATA_PATH): tf.gfile.MakeDirs(DATA_PATH) # Initialize summary writers to plot variables during training self.summary_op = tf.summary.merge_all() self.summary_writer = tf.summary.FileWriter(TFLOG_PATH) # Initialize actor and critic networks self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) # Initialize the saver to save the network params self.saver = tf.train.Saver() # initialize the experience data manger self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session) # Uncomment if collecting a buffer for the autoencoder # self.buffer = deque() # Should we load the pre-trained params? # If so: Load the full pre-trained net # Else: Initialize all variables the overwrite the conv layers with the pretrained filters if PRE_TRAINED_NETS: self.saver.restore(self.session, NET_LOAD_PATH) else: self.session.run(tf.initialize_all_variables()) tf.train.start_queue_runners(sess=self.session) time.sleep(1) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA) self.noise_flag = True # Initialize time step self.training_step = 0 # Flag: don't learn the first experience self.first_experience = True # After the graph has been filled add it to the summary writer self.summary_writer.add_graph(self.graph)