def __init__(self, input_dim, output_dim, lr, gamma, seed_num=None):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.actions = range(output_dim)
    self.lr = lr
    self.gamma = gamma
    self.tau = 0.1
    self.seed_num = seed_num
    # For experience replay
    self.memory = []
    self.memory_size = 10000
    self.batchsize = 32
    # Actor & critic
    self.actor = Actor(input_dim, output_dim, self.lr)
    self.critic = Critic(input_dim, output_dim, self.lr, self.gamma)
    if seed_num is not None:  # None (not False) as sentinel, so seed 0 still seeds
        set_random_seed(seed_num)  # seed TensorFlow
        seed(seed_num)             # seed numpy
def __init__(self, task):
    self.task = task  # quadcopter task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters

    # Score tracker and learning parameters
    self.best_score = -np.inf
    self.best_w_actor = None
    self.best_w_critic = None
    self.score = 0
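This snippet (and several below) relies on an OUNoise class that is not shown. A minimal numpy sketch, assuming the (size, mu, theta, sigma) constructor order used above; the reset/sample method names are assumptions:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Drift toward the mean, plus Gaussian diffusion."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state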
class Agent:
    def __init__(self, state_size, batch_size, is_eval=False):
        self.state_size = state_size
        self.action_size = 3  # restored: it was commented out but is used below
        self.buffer_size = 1000000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.inventory = []
        self.is_eval = is_eval
        self.gamma = 0.99
        self.tau = 0.001
        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    def act(self, state):
        options = self.actor_local.model.predict(state)
        self.last_state = state
        if not self.is_eval:
            return choice(range(self.action_size), p=options[0])
        return np.argmax(options[0])

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
        self.last_state = next_state

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.state_size)
        actions = np.vstack([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.state_size)

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update both target networks (the critic update was missing,
        # leaving critic_target frozen at its initial weights)
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
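learn() iterates namedtuple-style experiences (e.state, e.action, ...), so the ReplayBuffer this agent assumes looks roughly like the sketch below; the field names and method signatures are inferred from the call sites, not taken from the source:

import random
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (inferred interface)."""

    Experience = namedtuple("Experience",
                            ["state", "action", "reward", "next_state", "done"])

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest entries evicted first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        return random.sample(self.memory, batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)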
def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
             mem_size, actor_l1_size, actor_l2_size,
             critic_l1_size, critic_l2_size, batch_size):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(mem_size, n_states, n_actions)
    self.batch_size = batch_size

    self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size, actor_l2_size)
    self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size, critic_l2_size)
    self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size, actor_l2_size)
    self.target_critic = Critic(lr_critic, n_states, n_actions, critic_l1_size, critic_l2_size)

    self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

    # tau=1 performs a hard copy so the targets start identical to the online networks
    self.update_network_parameters(tau=1)
def __init__(self, state_space_dim, action_space_dim, min_action_val, max_action_val,
             hidden_layer_size=512, gamma=0.99, tau=0.0001, path_to_load=None):
    self.gamma = gamma
    self.tau = tau
    self.min_action_val = min_action_val
    self.max_action_val = max_action_val
    self.buffer = Buffer(state_space_dim, action_space_dim)
    self.noise_generator = GaussianNoise(0., 0.2, action_space_dim)

    self.actor = Actor(state_space_dim, action_space_dim, max_action_val, hidden_layer_size)
    self.critic = Critic(state_space_dim, action_space_dim, hidden_layer_size)

    if path_to_load is not None:
        if os.path.exists(path_to_load + "_actor.h5") and \
                os.path.exists(path_to_load + "_critic.h5"):
            self.load(path_to_load)

    # targets are created after a potential load so they copy the loaded weights
    self.target_actor = Actor(state_space_dim, action_space_dim, max_action_val, hidden_layer_size)
    self.target_critic = Critic(state_space_dim, action_space_dim, hidden_layer_size)
    self.target_actor.model.set_weights(self.actor.model.get_weights())
    self.target_critic.model.set_weights(self.critic.model.get_weights())

    critic_lr = 0.002
    actor_lr = 0.001
    self.critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
    self.actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
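GaussianNoise is not defined in this snippet; a minimal sketch assuming the (mean, std, dim) constructor order used above, with a hypothetical __call__ interface:

import numpy as np

class GaussianNoise:
    """Uncorrelated Gaussian exploration noise (interface assumed)."""

    def __init__(self, mean, std, dim):
        self.mean = mean
        self.std = std
        self.dim = dim

    def __call__(self):
        # one noise vector per call, matching the action dimension
        return np.random.normal(self.mean, self.std, self.dim)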
def __init__(self, input_dim, output_dim, lr, gamma):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.actions = range(output_dim)
    self.lr = lr
    self.gamma = gamma
    self.route = []
    self.state = 'searching'
    self.idle_time = 0
    self.active_time = 0
    # These will store the samples from which the agent will learn
    self.states = []
    self.action_samples = []
    self.rewards = []
    # Make actor and critic
    self.actor = Actor(input_dim, output_dim, self.lr)
    self.critic = Critic(input_dim, output_dim, self.lr)
    self.train_actor = self.actor.optimizer()
    self.train_critic = self.critic.optimizer()
def __init__(self, gamma=0.99):
    self.gamma = gamma
    # self.a_opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # self.c_opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
    self.a_opt = tf.keras.optimizers.Adam(learning_rate=7e-3)
    self.c_opt = tf.keras.optimizers.Adam(learning_rate=7e-3)
    self.f1 = tf.keras.layers.Dense(22, activation='relu')
    self.f2 = tf.keras.layers.Dense(22, activation='relu')
    self.sigma = tf.keras.layers.Dense(1, activation=None)
    self.mu = tf.keras.layers.Dense(1, activation=None)
    self.actor = Actor()
    self.critic = Critic()
    self.clip_pram = 0.2
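The mu/sigma heads above suggest a Gaussian policy over a one-dimensional continuous action, and clip_pram points at a PPO-style objective. A hedged sketch of how an action could be sampled from those heads, assuming they form the policy network; the act helper and the softplus floor are illustrative, not from the source:

import tensorflow as tf

def act(agent, state):
    """Sample an action from the Gaussian parameterized by the mu/sigma heads."""
    x = agent.f2(agent.f1(state))
    mu = agent.mu(x)
    sigma = tf.nn.softplus(agent.sigma(x)) + 1e-5  # keep the std strictly positive
    # reparameterized sample: mu + sigma * eps, with eps ~ N(0, I)
    return mu + sigma * tf.random.normal(tf.shape(mu))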
def __init__(self, state_size, batch_size, is_eval=False):
    self.state_size = state_size
    self.action_size = 3
    self.buffer_size = 1000000
    self.batch_size = batch_size
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
    self.inventory = []
    self.is_eval = is_eval
    self.gamma = 0.99
    self.tau = 0.001
    self.actor_local = Actor(self.state_size, self.action_size)
    self.actor_target = Actor(self.state_size, self.action_size)
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())
def __init__(self, input_dim, output_dim, lr, gamma, tau, clipnorm, verbose):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.actions = range(output_dim)
    self.lr = lr
    self.gamma = gamma
    self.tau = tau
    # Buffer for experience replay: parallel lists of states, actions,
    # rewards, next states, and done flags
    self.S = []
    self.A = []
    self.R = []
    self.S1 = []
    self.D = []
    self.memory_size = 10**3
    # Make actor and critic
    self.actor = Actor(input_dim, output_dim, lr, gamma, tau, clipnorm, verbose)
    self.critic = Critic(input_dim, output_dim, lr, gamma, tau, clipnorm, verbose)
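How this parallel-list buffer is filled and sampled is not shown; a hedged sketch, assuming transitions are dropped FIFO once memory_size is exceeded (the remember/sample helper names are illustrative):

import random

def remember(agent, s, a, r, s1, done):
    """Append one transition, evicting the oldest once the buffer is full."""
    for buf, item in zip((agent.S, agent.A, agent.R, agent.S1, agent.D),
                         (s, a, r, s1, done)):
        buf.append(item)
        if len(buf) > agent.memory_size:
            buf.pop(0)

def sample(agent, batch_size):
    """Draw one random minibatch, keeping the five lists index-aligned."""
    idx = random.sample(range(len(agent.S)), batch_size)
    return ([agent.S[i] for i in idx], [agent.A[i] for i in idx],
            [agent.R[i] for i in idx], [agent.S1[i] for i in idx],
            [agent.D[i] for i in idx])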
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0.01
    self.exploration_theta = 0.2
    self.exploration_sigma = 0.15
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 10000000
    self.batch_size = 128
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.95  # discount factor
    self.tau = 0.1     # for soft update of target parameters
def __init__(self, input_dim, output_dim, tau=0.001, gamma=0.99,
             train_batch_size=640):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.tau = tau
    self.gamma = gamma
    self.train_batch_size = train_batch_size

    self.main_critic = Critic(input_dim, output_dim, tau, gamma)
    self.target_critic = Critic(input_dim, output_dim, tau, gamma)
    self.main_actor = Actor(input_dim, output_dim, tau, gamma)
    self.target_actor = Actor(input_dim, output_dim, tau, gamma)

    self.target_critic.model.set_weights(self.main_critic.model.get_weights())
    self.target_actor.model.set_weights(self.main_actor.model.get_weights())

    self.memory = ReplayBuffer(batch_size=train_batch_size)
def train(show_baseline=False, continue_train=False,
          model_save_path='best_model', learn_freq=5, memory_size=20000,
          memory_warmup_size=2000, batch_size=32, learning_rate=0.001,
          gamma=0.9, alpha=0.9, max_episode=1000):
    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2,
                      use_rnn=False, use_gru=True, use_lstm=False)
    rpm = ReplayMemory(memory_size)  # experience replay pool for the DQN
    critic = Critic(obs_dim=obs_dim, action_dim=action_dim, encoder=encoder)
    agent = Agent(critic=critic, obs_dim=obs_dim, action_dim=action_dim,
                  lr=learning_rate, gamma=gamma, alpha=alpha)
    if continue_train:
        agent.load(model_save_path)

    # Pre-fill the replay pool so the first training batches are drawn
    # from a reasonably diverse set of samples
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size)

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; evaluation runs are not counted
        # train part
        for i in range(0, 100):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # for parameter in critic.parameters():
        #     print(parameter)
        #     break
        # test part
        # print(critic.parameters())
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{} Test reward:{}'.format(episode, eval_reward))
        agent.save(model_save_path)
def _build_graph(self):
    self.actor = Actor()
    self.critic = Critic()

    if self.mode == tf.estimator.ModeKeys.TRAIN:
        ave_ep_reward = tf.placeholder(tf.float32, name='ave_ep_reward')
        tf.summary.scalar('ave_ep_reward', ave_ep_reward)
        self.loss = ave_ep_reward
        global_step = tf.train.get_global_step()
        self.train_op = tf.assign_add(global_step, 1)
        self.training_hooks = [TrainingHook(self)]
    else:
        self.loss = tf.constant(1)
        self.evaluation_hooks = [EvalHook(self)]
def __init__(self, state_space, action_space, max_action, device):
    self.state_size = state_space.shape[0]
    self.action_size = action_space.shape[0]
    self.max_action = max_action
    self.device = device

    self.actor_local = Actor(state_space.shape, action_space.high.size, max_action)
    self.actor_target = Actor(state_space.shape, action_space.high.size, max_action)
    self.actor_optimizer = optimizers.Adam(LR_ACTOR)
    # let target be equal to local
    self.actor_target.set_weights(self.actor_local.get_weights())

    self.critic_local = Critic(state_space.shape, action_space.high.size)
    self.critic_target = Critic(state_space.shape, action_space.high.size)
    self.critic_optimizer = optimizers.Adam(LR_CRITIC)
    # let target be equal to local
    self.critic_target.set_weights(self.critic_local.get_weights())

    self.noise = OUNoise(self.action_size)
    self.memory = ReplayBuffer(BUFFER_SIZE)
    self.current_steps = 0
def __init__(self, act_dim, env_dim, act_range, k, buffer_size=10000,
             gamma=0.99, lr=0.001, tau=0.001):
    """ Initialization """
    # Environment and A2C parameters
    self.act_dim = act_dim
    self.act_range = act_range
    self.env_dim = (1, ) + (13, )  # note: hard-coded 13-dim observation; the env_dim argument is unused
    self.gamma = gamma
    # Create actor and critic networks
    self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
    self.critic = Critic(self.env_dim, act_dim, lr, tau)
    # self.buffer = MemoryBuffer(buffer_size)
    self.buffer = deque(maxlen=buffer_size)
    self.count = 0
    self.buffer_size = buffer_size
def __init__(self, state_size, action_size, config, seed):
    """Initialize a DDPG agent

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        config (config): configuration of DDPG
        seed (int): random seed
    """
    self.gamma = config.gamma
    self.tau = config.tau
    self.seed = np.random.seed(seed)

    # actor networks
    self.actor_local = Actor(state_size, action_size, config.units_actor, seed).to(device)
    self.actor_target = Actor(state_size, action_size, config.units_actor, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), config.lr_actor)

    # critic networks
    self.critic_local = Critic(state_size, action_size, config.units_critic, seed).to(device)
    self.critic_target = Critic(state_size, action_size, config.units_critic, seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), config.lr_critic)

    # Noise process
    self.noise = OUNoise(action_size, seed, config.mu, config.theta, config.sigma)

    # Replay Buffer
    self.memory = ReplayBuffer(config.buffer_size, config.batch_size, seed)
def __init__(self, num_agents=8, env_name='LunarLanderContinuous-v2',
             network='mlp', num_steps=32):
    # set up environment, observation memory
    self.num_agents = num_agents
    self.num_steps = num_steps
    self.network = network
    temp_env = gym.make(env_name)
    self.obs_space_size = temp_env.observation_space.shape[0]
    self.memory = None

    # Initialize model, loss and optimizer
    self.actor = Actor(temp_env, network)
    self.critic = Critic()
    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    self.mse = tf.keras.losses.MeanSquaredError()
    self.actor_loss = None
    self.critic_loss = None

    # instantiate variables to store recurrent states of the agents
    self.agents_recurrent_state = None
    self.update_recurrent_state = None
    # store action distribution during update
    self.action_dist = None

    # Set up checkpoint paths
    self.checkpoint_directory_a = f"./training_checkpoints/{self.network}/actor"
    self.checkpoint_directory_c = f"./training_checkpoints/{self.network}/critic"

    # instantiate multiple agents (ray actors) and set the first one as chief
    self.agent_list = [
        A2CAgent.remote(self.num_steps, env_name) for _ in range(num_agents)
    ]
    self.agent_list[0].set_chief.remote()

    # Prepare TensorBoard
    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    self.train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    self.step = 0
def test_get_gradient(self):
    image_shape = (256, 1, 28, 28)
    device = 'cuda'
    z_dim = 64
    gen = Generator(z_dim).to(device)
    crit = Critic().to(device)
    real = torch.randn(*image_shape, device=device) + 1
    fake = torch.randn(*image_shape, device=device) - 1
    # one epsilon per image, broadcast over channels and pixels
    epsilon_shape = [1 for _ in image_shape]
    epsilon_shape[0] = image_shape[0]
    epsilon = torch.rand(epsilon_shape, device=device).requires_grad_()
    gradient = get_gradient(crit, real, fake, epsilon)
    self.assertEqual(tuple(gradient.shape), image_shape)
    self.assertGreater(gradient.max(), 0)
    self.assertLess(gradient.min(), 0)
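The get_gradient under test is not shown; below is a sketch consistent with the test's shape and sign expectations, following the usual WGAN-GP interpolation. A plausible reconstruction, not necessarily the author's exact code:

import torch

def get_gradient(crit, real, fake, epsilon):
    """Gradient of the critic score w.r.t. images interpolated between real and fake."""
    mixed_images = real * epsilon + fake * (1 - epsilon)
    mixed_scores = crit(mixed_images)
    # differentiate the scalar scores back to the interpolated images
    gradient = torch.autograd.grad(
        inputs=mixed_images,
        outputs=mixed_scores,
        grad_outputs=torch.ones_like(mixed_scores),
        create_graph=True,
        retain_graph=True,
    )[0]
    return gradient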
def __init__(self, state_dim, action_dim, final_activation=tf.nn.tanh,
             action_bound=0.4, training_batch_size=32, GAMMA=0.95,
             lr=0.001, replay_buffer_size=1024):
    self.ID = random_string(10)
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.final_activation = final_activation
    self.action_bound = action_bound
    self.GAMMA = GAMMA
    self.lr = lr
    self.replay_buffer_size = replay_buffer_size
    self.replay_buffer = ReplayBuffer(replay_buffer_size)
    self.training_batch_size = training_batch_size
    # scope the networks under a random ID so multiple agents can coexist in one graph
    with tf.variable_scope(self.ID) as scope:
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           self.lr, self.final_activation)
        self.critic = Critic(self.state_dim, self.action_dim, self.lr)
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, state_size, action_size, max_action, minibatch_size,
             a_lr, c_lr, gamma, tau):
    self.state_size = state_size
    self.action_size = action_size
    self.max_action = max_action
    self.critic_lr = c_lr
    self.actor_lr = a_lr

    self.actor_network = Actor(self.state_size, self.action_size,
                               self.max_action, self.actor_lr)
    self.actor_target_network = Actor(self.state_size, self.action_size,
                                      self.max_action, self.actor_lr)
    self.critic_network = Critic(self.state_size, self.action_size, self.critic_lr)
    self.critic_target_network = Critic(self.state_size, self.action_size, self.critic_lr)

    self.actor_target_network.set_weights(self.actor_network.get_weights())
    self.critic_target_network.set_weights(self.critic_network.get_weights())

    self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
    self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

    self.replay_buffer = ReplayBuffer(100000)
    self.MINIBATCH_SIZE = minibatch_size
    self.GAMMA = tf.cast(gamma, dtype=tf.float64)
    self.TAU = tau
    self.noise = OUNoise(self.action_size)
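Several of these agents store a tau for Polyak averaging of the target networks but don't show the update itself. A minimal sketch for Keras-style models (the function name is illustrative):

def soft_update(local_model, target_model, tau):
    """theta_target <- tau * theta_local + (1 - tau) * theta_target."""
    new_weights = [tau * lw + (1.0 - tau) * tw
                   for lw, tw in zip(local_model.get_weights(),
                                     target_model.get_weights())]
    target_model.set_weights(new_weights)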
def main():
    with tf.Session() as sess:
        env = gym.make('Humanoid-v1')
        np.random.seed(123)
        env.seed(123)
        tf.set_random_seed(123)

        state_size = env.observation_space.shape[0]
        action_size = env.action_space.shape[0]
        action_bounds = env.action_space.high

        actor = Actor(sess, state_size, action_size, action_bounds,
                      actor_learning_rate, temperature)
        critic = Critic(sess, state_size, action_size, critic_learning_rate,
                        temperature, actor.get_trainable_vars())
        train(sess, env, actor, critic)
def train(show_baseline=False, continue_train=False,
          model_save_path='best_model', learn_freq=5, memory_size=20000,
          memory_warmup_size=2000, batch_size=32, learning_rate=0.001,
          gamma=0.9, alpha=0.9, max_episode=1000):
    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 45
    rpm = ReplayMemory(memory_size)  # experience replay pool for the DQN
    critic = Critic(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(critic=critic, obs_dim=obs_dim, action_dim=action_dim,
                  lr=learning_rate, gamma=gamma, alpha=alpha)
    if continue_train:
        agent.load(model_save_path)

    # Pre-fill the replay pool so the first training batches are drawn
    # from a reasonably diverse set of samples
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size)

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; evaluation runs are not counted
        # train part
        for i in range(0, 10):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # for name, param in critic.state_dict().items():
        #     # name: str
        #     # param: Tensor
        #     print(param)
        # test part
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{} Test reward:{}'.format(episode, eval_reward))
        agent.save(model_save_path)
def main():
    with tf.Session() as sess:
        env = gym.make('Pendulum-v0')
        np.random.seed(random_seed)
        tf.set_random_seed(random_seed)
        env.seed(random_seed)

        state_size = env.observation_space.shape[0]
        action_size = env.action_space.shape[0]
        action_bound_high = env.action_space.high
        action_bound_low = env.action_space.low
        # This setup assumes a symmetric action space; fail loudly otherwise.
        # (The original only defined action_bounds inside the if-branch, which
        # would raise a NameError later whenever the bounds were asymmetric.)
        assert np.all(action_bound_high == -action_bound_low), \
            "expected a symmetric action space"
        action_bounds = action_bound_high

        actor = Actor(sess, state_size, action_size, action_bounds,
                      actor_learning_rate, temperature)
        critic = Critic(sess, state_size, action_size, critic_learning_rate,
                        temperature, actor.get_trainable_vars())
        train(sess, env, actor, critic)
def __init__(self, params, tasks, layers, icm_layers, build_graph=GRAPH_PATH):
    self.params = params
    self.curr_length = 1
    self.tasks = tasks
    actor_layers, critic_layers = layers
    self.actor_layers = actor_layers
    self.critic_layers = critic_layers
    self.icm_layers = icm_layers

    self.encoder = Encoder(params)
    self.icm = ICM(params, self.icm_layers, self.encoder)
    self.subpolicy = Subpolicy(params, self.actor_layers, self.encoder)
    self.critic = Critic(params, self.critic_layers, self.encoder)
    self.taskpolicy = Taskpolicy(self.subpolicy, self.critic, self.icm, params)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    tf.get_default_graph().finalize()
    if build_graph is not None:
        self.writer = tf.summary.FileWriter(build_graph, self.session.graph)

    # Dump this run's parameters to a text file for later reference
    filename = 'PARAMS/' + str(self.params['iteration']) + '.txt'
    if not os.path.exists(os.path.dirname(filename)):
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError as exc:  # guard against a race condition
            if exc.errno != errno.EEXIST:
                raise
    with open(filename, 'w') as file:
        for kw in self.params:
            arg = self.params[kw]
            # the original compared type(arg) to the string "function",
            # which is never true; test callability instead
            if callable(arg):
                file.write(str(kw) + ' ' + arg.__name__)
            else:
                file.write(str(kw) + ' ' + str(arg))
            file.write("\n")
class Agent:
    def __init__(self, state_size, action_size, buffer_size, minibatch_size,
                 action_high, action_low):
        sess = tf.Session()
        self.actor = Actor(sess, state_size, action_size, action_high, action_low)
        self.critic = Critic(sess, state_size, action_size)
        self.noiser = OUnoise(action_size, action_high, action_low)
        self.buffer = Buffer(buffer_size)
        self.minibatch_size = minibatch_size
        self.action_high = action_high
        self.action_low = action_low
        self.training = False
        sess.run(tf.global_variables_initializer())

    def choose_action(self, state):
        state = np.array([state])
        action = self.actor.action(state)[0]
        action = action + self.noiser._noise()
        # clip each action component to its bounds
        for i, (high, low) in enumerate(zip(self.action_high, self.action_low)):
            if action[i] > high:
                action[i] = high
            elif action[i] < low:
                action[i] = low
        return action

    def train(self, transition):
        self.buffer.store(transition)
        if not self.training and len(self.buffer.transitions) == self.minibatch_size:
            self.training = True
        if self.training:
            minibatch = np.array(
                random.sample(self.buffer.transitions, self.minibatch_size))
            state_batch = np.vstack(minibatch[:, 0])
            action_batch = np.vstack(minibatch[:, 1])
            next_state_batch = np.vstack(minibatch[:, 2])
            reward_batch = np.vstack(minibatch[:, 3])
            done_batch = np.vstack(minibatch[:, 4])

            next_action_batch = self.actor.next_action(next_state_batch)
            q_target = reward_batch + (1 - done_batch) * gamma * \
                self.critic.next_q_value(next_state_batch, next_action_batch)
            self.critic.train(state_batch, action_batch, q_target)

            action_grad_batch = self.critic.action_grad(
                state_batch, self.actor.action(state_batch))
            self.actor.train(state_batch, action_grad_batch)

            self.critic.update_target()
            self.actor.update_target()
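A hedged usage sketch for this Agent, assuming a gym-style continuous-control environment and the (state, action, next_state, reward, done) transition order its train method unpacks; the environment name and loop bounds are placeholders:

import gym

env = gym.make('Pendulum-v0')
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              buffer_size=100000, minibatch_size=64,
              action_high=env.action_space.high,
              action_low=env.action_space.low)

for episode in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        # transition order must match the column unpacking in Agent.train
        agent.train((state, action, next_state, reward, done))
        state = next_state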
def __init__(self, env_name, num_threads, gamma=0.99, actor_learning_rate=0.001,
             actor_batch_size=64, critic_learning_rate=0.01, entropy_beta=0.01,
             critic_batch_size=16, critic_epochs=100,
             max_episodes_per_thread=100, episode_to_train=4):
    self.envs = [gym.make(env_name).env for _ in range(num_threads)]
    if self.envs[0].observation_space.shape == ():
        input_shape = 1
    else:
        input_shape = self.envs[0].observation_space.shape[0]

    self.actor = Actor(actor_learning_rate, actor_batch_size, input_shape,
                       self.envs[0].action_space.n, entropy_beta)
    self.critic = Critic(critic_learning_rate, critic_batch_size,
                         critic_epochs, input_shape, 1)
    batch = Batch(self.actor, self.critic, batch_size=actor_batch_size)
    lock = Lock()
    self.threads = [
        Env_thread("thread" + str(i), lock, batch, self.envs[i], self.actor,
                   self.critic, gamma, max_episodes_per_thread, episode_to_train)
        for i in range(num_threads)
    ]
def _build_update_op(self):
    global_step = tf.train.get_global_step()
    tf.assign_add(global_step, 1, name='global_step_add')

    # with tf.variable_scope('eval_net'):
    #     self.eval_critic = Critic(self.eval_actor.actions)
    #     self.eval_actor.build_train_op(self.eval_critic.qa_value)

    with tf.variable_scope('target_net'):
        self.target_actor = Actor()
        self.target_critic = Critic(self.target_actor.actions)

    # Polyak soft updates: target <- (1 - tau) * target + tau * train
    actor_update_op = [
        tf.assign(target_param,
                  target_param * (1 - Config.train.TAU) + train_param * Config.train.TAU)
        for train_param, target_param in zip(self.eval_actor.params,
                                             self.target_actor.params)
    ]
    critic_update_op = [
        tf.assign(target_param,
                  target_param * (1 - Config.train.TAU) + train_param * Config.train.TAU)
        for train_param, target_param in zip(self.eval_critic.params,
                                             self.target_critic.params)
    ]
    # Hard copies, used once at initialization
    actor_init_op = [
        tf.assign(target_param, train_param)
        for train_param, target_param in zip(self.eval_actor.params,
                                             self.target_actor.params)
    ]
    critic_init_op = [
        tf.assign(target_param, train_param)
        for train_param, target_param in zip(self.eval_critic.params,
                                             self.target_critic.params)
    ]
    self.update_target_op = tf.group(actor_update_op + critic_update_op)
    self.init_target_op = tf.group(actor_init_op + critic_init_op)
def play_game(self):
    sess = tf.Session()

    # init states, adding one dimension per neighbor;
    # the order of entries in each state vector is important
    states = collections.defaultdict(list)
    for t in self.generator.Ts:
        for t_customer in t.customers_C:
            states[t.id].append(100)
        for t_customer in t.customers_CP:
            states[t.id].append(100)
        for t_customer in t.customers_M:
            states[t.id].append(100)
        for t_peer in t.peers_T:
            states[t.id].append(100)
        # reachable end-to-end throughput (all advertised destinations are considered here)
        for destination in t.table:
            states[t.id].append(0)
        for destination in t.table:
            states[t.id].append(0)

    # create the AC model and define the action set for each node
    for i in self.Ns:  # node i
        n_features = len(states[i.id])
        actor = Actor(sess, n_features, i.n_actions, i.id)
        critic = Critic(sess, n_features, i.id)
        i.set_rl_setting(actor, critic)
    sess.run(tf.global_variables_initializer())

    # loop over time epochs
    # TODO: generate TF; handle flows that are not reachable,
    # or assume all destinations are reachable
    for t in range(self.MAX):  # original iterated self.MAX directly, which fails if MAX is an int
        # every node takes its action
        actions = []
        for i in self.Ns:  # node i
            s = np.array(states[i.id])
            actions.append(i.actor.choose_action(s))
def __init__(self, input_dim, output_dim, lr, gamma, loss_clipping, c1):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.actions = range(output_dim)
    self.lr = lr
    self.gamma = gamma
    self.loss_clipping = loss_clipping  # for the actor's clipped loss function
    self.c1 = c1  # weight of the entropy term in the actor loss
    self.num_epochs = 10
    self.batchsize = 10
    # These will store the samples from which the agent will learn
    # (renamed from self.actions, which clobbered the action range above)
    self.states = []
    self.action_samples = []
    self.pi_vecs = []
    self.rewards = []
    # Make actor and critic
    self.actor = Actor(input_dim, output_dim, lr, gamma, loss_clipping, c1)
    self.critic = Critic(input_dim, output_dim, self.lr)
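loss_clipping points at PPO's clipped surrogate objective. A numpy sketch of that objective, under the assumption that the Actor builds an equivalent loss internally; the function name and the 1e-10 floor are illustrative:

import numpy as np

def clipped_surrogate_loss(pi_new, pi_old, advantages, loss_clipping=0.2):
    """PPO objective: take the minimum of the raw and clipped ratio terms."""
    ratio = pi_new / (pi_old + 1e-10)  # probability ratio of new to old policy
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1 - loss_clipping, 1 + loss_clipping) * advantages
    return -np.mean(np.minimum(unclipped, clipped))  # negated for minimization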