def __init__(self, state_size, action_size, max_action, minibatch_size,
             a_lr, c_lr, gamma, tau):
    self.state_size = state_size
    self.action_size = action_size
    self.max_action = max_action
    self.critic_lr = c_lr
    self.actor_lr = a_lr

    self.actor_network = Actor(self.state_size, self.action_size,
                               self.max_action, self.actor_lr)
    self.actor_target_network = Actor(self.state_size, self.action_size,
                                      self.max_action, self.actor_lr)
    self.critic_network = Critic(self.state_size, self.action_size,
                                 self.critic_lr)
    self.critic_target_network = Critic(self.state_size, self.action_size,
                                        self.critic_lr)

    self.actor_target_network.set_weights(self.actor_network.get_weights())
    self.critic_target_network.set_weights(self.critic_network.get_weights())

    self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
    self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

    self.replay_buffer = ReplayBuffer(1e6)
    self.MINIBATCH_SIZE = minibatch_size
    self.GAMMA = tf.cast(gamma, dtype=tf.float64)
    self.TAU = tau
    self.noise = OUNoise(self.action_size)
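# None of the snippets in this section define OUNoise itself. Below is a
# minimal sketch of an Ornstein-Uhlenbeck noise process consistent with the
# (size, mu, theta, sigma) constructors used here; the default parameter
# values are assumptions, not taken from any one of these codebases.
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # One Euler step of dx = theta * (mu - x) + sigma * N(0, 1).
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state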
def __init__(self, agent_id, state_size, action_size, rand_seed, meta_agent):
    """Creates a new DDPG agent."""
    self.agent_id = agent_id
    self.action_size = action_size

    # Define the actor networks
    self.actor_local = Actor(state_size, action_size, rand_seed).to(device)
    self.actor_target = Actor(state_size, action_size, rand_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Define the critic networks
    self.critic_local = Critic(state_size, action_size,
                               meta_agent.agents_qty, rand_seed).to(device)
    self.critic_target = Critic(state_size, action_size,
                                meta_agent.agents_qty, rand_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC)  # , weight_decay=WEIGHT_DECAY

    self.noise = OUNoise(action_size, rand_seed)

    # Refer to the multi-agent (meta-agent) shared replay memory
    self.memory = meta_agent.memory
    self.t_step = 0
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.1     # for soft update of target parameters
def test(env, trained_model):
    actor_net = NCS_nn.NCS_net(48, 4, 0.8).to(device)
    model = torch.load(trained_model)
    actor_net.load_state_dict(model)
    actor_net.eval()

    # IF YOU WANT TO START AT A RANDOM INTERMEDIATE STATE:
    # file_name = open("data_cube_5_10_07_19_1612.pkl", "rb")
    # data = pickle.load(file_name)
    # states = np.array(data["states"])
    # random_states_index = np.random.randint(0, len(states), size=len(states))

    noise = OUNoise(4)
    expl_noise = OUNoise(4, sigma=0.001)

    for _ in range(10):  # inference
        obs, done = env.reset(), False
        # obs = env.env.intermediate_state_reset(states[np.random.choice(random_states_index, 1)[0]])
        print("start")
        # while not done:
        for _ in range(150):
            obs = torch.FloatTensor(np.array(obs).reshape(1, -1)).to(device)  # + expl_noise.noise()
            action = actor_net(obs).cpu().data.numpy().flatten()
            print(action)
            obs, reward, done, _ = env.step(action)
def __init__(self, state_size, action_size, num_agents):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state for each agent
        action_size (int): dimension of each action for each agent
        num_agents (int): number of agents in the environment
    """
    self.state_size = state_size
    self.action_size = action_size

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size).to(DEVICE)
    self.actor_target = Actor(state_size, action_size).to(DEVICE)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=LR_ACTOR,
                                      weight_decay=WEIGHT_DECAY_actor)

    # Critic Network (w/ Target Network); the critic sees the joint
    # states and actions of all agents
    self.critic_local = Critic(num_agents * state_size,
                               num_agents * action_size).to(DEVICE)
    self.critic_target = Critic(num_agents * state_size,
                                num_agents * action_size).to(DEVICE)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY_critic)

    # Noise process
    self.noise = OUNoise(action_size)  # single agent only
    self.noise_scale = NOISE_START

    # Make sure the targets are initialized with the same weights as the
    # source networks (this makes a big difference)
    self.hard_update(self.actor_target, self.actor_local)
    self.hard_update(self.critic_target, self.critic_local)
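# hard_update (and its soft_update counterpart) is called throughout these
# snippets but never defined. A minimal PyTorch sketch, assuming the
# (target, source) argument order used in the calls above; in the snippets
# these are methods on the agent, shown here as free functions for brevity.
def hard_update(target, source):
    # Copy every source parameter into the target verbatim.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)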
def __init__(self, C, b, x, action_output_num, actor_size,
             replay_size=1000000, ou_noise=True, param_noise=True,
             noise_scale=0.3, final_noise_scale=0.3):
    self.C = C
    self.b = b
    self.x = x
    self.hd = action_output_num
    self.actor_size = actor_size
    self.memory = ReplayMemory(replay_size)
    self.new_b = None
    self.env = None
    self.agent = None
    self.ou_noise = ou_noise
    self.noise_scale = noise_scale
    self.final_noise_scale = final_noise_scale
    self.ounoise = OUNoise(action_output_num) if ou_noise else None
    self.param_noise = AdaptiveParamNoiseSpec(
        initial_stddev=0.05,
        desired_action_stddev=noise_scale,
        adaptation_coefficient=1.05) if param_noise else None
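# AdaptiveParamNoiseSpec is used here and in the script at the end of this
# section but never defined. This sketch is modeled on the OpenAI Baselines
# version; treat the defaults and the adapt() rule as assumptions.
class AdaptiveParamNoiseSpec:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1,
                 adaptation_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # Shrink the parameter perturbation when it moves the actions more
        # than desired, grow it when it moves them less.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient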
def __init__(self, agent_id, model, action_size=2, seed=0):
    """Initialize an Agent object."""
    self.seed = random.seed(seed)
    self.id = agent_id
    self.action_size = action_size

    # Actor Network
    self.actor_local = model.actor_local
    self.actor_target = model.actor_target
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network
    self.critic_local = model.critic_local
    self.critic_target = model.critic_target
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Initialize the target actor and the target critic with the same
    # weights as their local counterparts
    self.hard_copy_weights(self.actor_target, self.actor_local)
    self.hard_copy_weights(self.critic_target, self.critic_local)

    # Noise process
    self.noise = OUNoise(action_size, seed)
def __init__(self, task):
    # Hyperparameters
    self.learning_rate_actor = 1e-4
    self.learning_rate_critic = 1e-3
    self.gamma = 0.99
    self.tau = 0.001

    # Define networks
    self.sess = tf.Session()
    self.task = task
    self.actor = ActorNet(self.sess, self.task.state_size,
                          self.task.action_size, self.learning_rate_actor,
                          self.task.action_low, self.task.action_high,
                          self.tau)
    self.critic = CriticNet(self.sess, self.task.state_size,
                            self.task.action_size,
                            self.learning_rate_critic, self.tau)

    # Define noise
    self.mu = 0
    self.theta = 0.15
    self.sigma = 0.20
    self.noise = OUNoise(self.task.action_size, self.mu, self.theta, self.sigma)

    # Define replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = Replay(self.buffer_size, self.batch_size)

    # Score
    self.best_score = -np.inf
    self.best_reward = -np.inf
def __init__(self, config, state_size, action_size):
    super(DDPGAgent, self).__init__()
    l1 = config['network']['hidden']
    l2 = int(config['network']['hidden'] / 2)

    self.actor = Actor(state_size, action_size, config['seed']['agent'],
                       l1, l2).to(device)
    self.critic = Critic(state_size, action_size, config['seed']['agent'],
                         l1, l2).to(device)
    self.target_actor = Actor(state_size, action_size,
                              config['seed']['agent'], l1, l2).to(device)
    self.target_critic = Critic(state_size, action_size,
                                config['seed']['agent'], l1, l2).to(device)

    self.noise = OUNoise(action_size,
                         mu=config['noise']['mu'],
                         sigma=config['noise']['sigma'],
                         theta=config['noise']['theta'])

    # initialize targets same as original networks
    self.hard_update(self.target_actor, self.actor)
    self.hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=config['LR_ACTOR'])
    self.critic_optimizer = Adam(self.critic.parameters(), lr=config['LR_CRITIC'])
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
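# ReplayBuffer is referenced above with a (action_size, buffer_size,
# batch_size, seed) signature but not defined in these snippets. A minimal
# PyTorch-style sketch under that assumed signature:
import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)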
def __init__(
    self,
    num_agents,
    state_size,
    action_size,
    buffer_size=int(1e5),
    batch_size=128,
    gamma=0.99,
    tau=1e-3,
    lr_actor=1e-4,
    lr_critic=1e-3,
    weight_decay=0,
    random_seed=2,
):
    """Initialize an Agent object.

    Params
    ======
        num_agents (int): number of agents
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=lr_critic, weight_decay=weight_decay)

    # Noise process (one row of noise per agent)
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay memory
    self.memory = ReplayBuffer(
        action_size=action_size,
        buffer_size=buffer_size,
        batch_size=batch_size,
        seed=random_seed,
    )
def __init__(self, state_size, action_size, num_agents, device,
             gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
             random_seed=0):
    """Initialize an Agent object.

    :param state_size: size of state
    :param action_size: size of action
    :param num_agents: number of agents
    :param gamma: discount factor
    :param tau: factor for soft update of target parameters
    :param lr_actor: learning rate of actor
    :param lr_critic: learning rate of critic
    :param random_seed: random seed
    :param device: cuda or cpu
    """
    self.device = device
    self.gamma = gamma
    self.tau = tau

    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.full_state_size = state_size * num_agents
    self.full_action_size = action_size * num_agents
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, device, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, device, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(self.full_state_size, self.full_action_size,
                               device=device, random_seed=random_seed).to(device)
    self.critic_target = Critic(self.full_state_size, self.full_action_size,
                                device=device, random_seed=random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=lr_critic, weight_decay=0)

    self.noise = OUNoise(action_size, random_seed)
def __init__(self, device, state_size, n_agents, action_size, random_seed,
             buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic,
             weight_decay, learn_interval, learn_num, ou_sigma, ou_theta,
             checkpoint_folder='./'):
    # Set computational device
    self.DEVICE = device

    # Init state, action and agent dimensions
    self.state_size = state_size
    self.n_agents = n_agents
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.l_step = 0
    self.log_interval = 200

    # Init hyperparameters
    self.BUFFER_SIZE = buffer_size
    self.BATCH_SIZE = batch_size
    self.GAMMA = gamma
    self.TAU = TAU
    self.LR_ACTOR = lr_actor
    self.LR_CRITIC = lr_critic
    self.WEIGHT_DECAY = weight_decay
    self.LEARN_INTERVAL = learn_interval
    self.LEARN_NUM = learn_num

    # Init Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    # Init Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=lr_critic, weight_decay=weight_decay)

    # Init Noise Process
    self.noise = OUNoise((n_agents, action_size), random_seed,
                         mu=0., theta=ou_theta, sigma=ou_sigma)

    # Init Replay Memory
    self.memory = ReplayBuffer(device, action_size, buffer_size,
                               batch_size, random_seed)
def test_random_action():
    env = gym.make('gym_kinova_gripper:kinovagripper-v0')
    obs, done = env.reset(), False
    noise = OUNoise(3)
    max_action = float(env.action_space.high[0])
    correct = 0
    noise.reset()
    cum_reward = 0.0
    for i in range(100):
        finger_actions = noise.noise().clip(-max_action, max_action)
        # actions = np.array([0.0, finger_actions[0], finger_actions[1], finger_actions[2]])
        actions = np.array([0.4, 0.5, 0.5, 0.5])
        obs, reward, done, _ = env.step(actions)
        inputs = torch.FloatTensor(np.array(obs)).to(device)
def __init__(self, state_size, action_size, num_agents, random_seed=0,
             params=params):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.params = params

    # Actor (Policy) Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size,
                             random_seed).to(self.params['DEVICE'])
    self.actor_target = Actor(state_size, action_size,
                              random_seed).to(self.params['DEVICE'])
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=self.params['LR_ACTOR'])

    # Critic (Value) Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size,
                               random_seed).to(self.params['DEVICE'])
    self.critic_target = Critic(state_size, action_size,
                                random_seed).to(self.params['DEVICE'])
    self.critic_optimizer = optim.Adam(
        self.critic_local.parameters(),
        lr=self.params['LR_CRITIC'],
        weight_decay=self.params['WEIGHT_DECAY'])

    # Initialize target and local to same weights
    self.hard_update(self.actor_local, self.actor_target)
    self.hard_update(self.critic_local, self.critic_target)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                               self.params['BATCH_SIZE'], random_seed)
def main():
    ddpg = DDPG(GAMMA, TAU, torch.cuda.is_available())
    memory = ReplayMemory(REPLAY_SIZE)
    env.init_state()
    if os.path.exists('models/ddpg_actor_'):
        ddpg.load_model()
    updates = 0
    for i_episode in range(NUM_EPISODES):
        while True:
            # Linearly decay the exploration noise scale over episodes.
            # (The original used integer division, NOISE_SCALE // NUM_EPISODES,
            # which truncates the per-episode decay step to zero.)
            ounoise = OUNoise(1, scale=NOISE_SCALE - NOISE_SCALE / NUM_EPISODES * i_episode)
            action = ddpg.select_action(env.state, ounoise)
            transition = env.step(action)
            memory.push(transition)
            if len(memory) > BATCH_SIZE:
                for _ in range(UPDATES_PER_STEP):
                    transitions = memory.sample(BATCH_SIZE)
                    random.shuffle(transitions)
                    batch = Transition(*zip(*transitions))
                    value_loss, policy_loss = ddpg.update_parameters(batch)
                    print("Episode: {}, Updates: {}, Value Loss: {}, Policy Loss: {}"
                          .format(i_episode, updates, value_loss, policy_loss))
                    updates += 1
            break
        if (i_episode + 1) % 100 == 0:
            ddpg.save_model()
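# The noise schedule above is a plain linear decay; factored out as a helper
# it reads as follows (names are illustrative, not from the original script):
def decayed_noise_scale(initial_scale, episode, num_episodes):
    # Linear schedule from initial_scale at episode 0 toward 0 at the end.
    return initial_scale * (1.0 - episode / num_episodes)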
def __init__(self, env):
    self.env = env
    self.state_size = self.env.observation_space.shape[0]
    self.action_size = self.env.action_space.shape[0]
    self.action_low = self.env.action_space.low[0]
    self.action_high = self.env.action_space.high[0]

    # Learning rates
    self.actor_learning_rate = 1e-4
    self.critic_learning_rate = 1e-3

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high,
                             self.actor_learning_rate)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high,
                              self.actor_learning_rate)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size,
                               self.critic_learning_rate)
    self.critic_target = Critic(self.state_size, self.action_size,
                                self.critic_learning_rate)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.1
    self.exploration_sigma = 0.1
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters
def __init__(self, task):
    self.task = task
    self.session = K.get_session()
    init = tf.global_variables_initializer()
    self.session.run(init)

    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high
    self.score = -math.inf
    self.best_score = -math.inf
    self.last_loss = math.inf

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)
    self.noise_scale = (self.exploration_mu, self.exploration_theta,
                        self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 16
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_high = task.action_high
    self.action_low = task.action_low

    # actor (policy) model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_high, self.action_low)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_high, self.action_low)

    # critic (value) model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # initialize target model parameters with local model parameters
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())

    # noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.25
    self.exploration_sigma = 0.3
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # replay buffer
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # algorithm parameters
    self.gamma = 0.9  # discount rate
    self.tau = 0.1    # soft update parameter

    self.total_reward = 0
    self.count = 0
    self.score = 0
    self.best_score = -np.inf

    self.reset_episode()
def __init__(self, state_size, action_size, random_seed):
    """Creates a new DDPG agent, initializing the networks."""
    self.state_size = state_size
    self.action_size = action_size
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    self.critic = Critic(state_size, action_size, 17).to(device)
    self.critic_target = Critic(state_size, action_size, 17).to(device)
    self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    self.actor = Actor(state_size, action_size, 17).to(device)
    self.actor_target = Actor(state_size, action_size, 17).to(device)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)

    self.seed = random.seed(random_seed)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)
def main():
    my_env = env()
    agent = NAF_CNN(0.99, 0.001, 128,
                    my_env.observation_space.shape[0],
                    my_env.action_space)

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--noise_scale', type=float, default=0.3, metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end', type=int, default=100, metavar='N',
                        help='number of episodes with noise (default: 100)')
    args = parser.parse_args()

    ounoise = OUNoise(my_env.action_space.shape[0])
    ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
        0, args.exploration_end - 1) / args.exploration_end + args.final_noise_scale
    ounoise.reset()

    state = my_env.reset()
    i = 10
    while i > 0:
        action = agent.select_action(state, ounoise)
        print("action: {}".format(action))
        next_state, reward, done = my_env.step(action)
        if done:
            break
        print(reward)
        i = i - 1
def __init__(self, args, env):
    self.args = args
    self.env = env

    # Get the observation and action dimensions
    num_inputs = self.env.observation_space.shape[0]
    num_actions = self.env.action_space.shape[0]
    self.action_scale = self.env.action_space.high[0]

    # Build up the networks
    self.actor_net = Actor(num_inputs, num_actions)
    self.critic_net = Critic(num_inputs, num_actions)

    # Build the target networks
    self.actor_target_net = Actor(num_inputs, num_actions)
    self.critic_target_net = Critic(num_inputs, num_actions)

    if self.args.cuda:
        self.actor_net.cuda()
        self.critic_net.cuda()
        self.actor_target_net.cuda()
        self.critic_target_net.cuda()

    # Copy the parameters into the target networks
    self.actor_target_net.load_state_dict(self.actor_net.state_dict())
    self.critic_target_net.load_state_dict(self.critic_net.state_dict())

    # Set up the optimizers
    self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                            lr=self.args.actor_lr)
    self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(),
                                             lr=self.args.critic_lr,
                                             weight_decay=self.args.critic_l2_reg)

    # Set up the noise process
    self.ou_noise = OUNoise(num_actions)

    # Make sure the save directories exist
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    self.model_path = self.args.save_dir + self.args.env_name + '/'
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
def __init__(self, fd, cfg, memory, explore=True):
    threading.Thread.__init__(self)
    self.fd = fd
    self.cfg = cfg
    self.memory = memory
    self.explore = explore
    self.agent = torch.load(cfg.get('nafcnn', 'agent'))
    self.ounoise = OUNoise(action_dimension=1)
    mpsched.persist_state(fd)
    self.env = Env(fd=self.fd,
                   time=self.cfg.getfloat('env', 'time'),
                   k=self.cfg.getint('env', 'k'),
                   alpha=self.cfg.getfloat('env', 'alpha'),
                   b=self.cfg.getfloat('env', 'b'),
                   c=self.cfg.getfloat('env', 'c'))
def __init__(self, env=gym.make('Pendulum-v0'), s_dim=2, a_dim=1,
             gamma=0.99, episodes=100, tau=0.001, buffer_size=1e06,
             minibatch_size=64, actor_lr=0.001, critic_lr=0.001,
             save_name='final_weights', render=False):
    self.save_name = save_name
    self.render = render
    self.env = env
    self.upper_bound = env.action_space.high[0]
    self.lower_bound = env.action_space.low[0]
    self.EPISODES = episodes
    self.MAX_TIME_STEPS = 200
    self.s_dim = s_dim
    self.a_dim = a_dim
    self.GAMMA = gamma
    self.TAU = tau
    self.buffer_size = buffer_size
    self.minibatch_size = minibatch_size
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    self.ou_noise = OUNoise(mean=np.zeros(1))

    self.actor = Actor(self.s_dim, self.a_dim).model()
    self.target_actor = Actor(self.s_dim, self.a_dim).model()
    self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
    self.target_actor.set_weights(self.actor.get_weights())

    self.critic = Critic(self.s_dim, self.a_dim).model()
    self.critic_opt = tf.keras.optimizers.Adam(learning_rate=self.critic_lr)
    self.target_critic = Critic(self.s_dim, self.a_dim).model()
    self.target_critic.set_weights(self.critic.get_weights())

    self.replay_buffer = ReplayBuffer(self.buffer_size)
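# The constructor above hard-copies the weights into the target networks;
# during training the targets are typically moved by Polyak averaging. A
# minimal Keras-style sketch of that soft update (the helper name and its
# placement are assumptions, not from the original code):
def soft_update(target_model, source_model, tau):
    # target <- tau * source + (1 - tau) * target, weight tensor by tensor.
    new_weights = [tau * s + (1.0 - tau) * t
                   for s, t in zip(source_model.get_weights(),
                                   target_model.get_weights())]
    target_model.set_weights(new_weights)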
class DDPG():
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_lr = 0.0001
        self.tau = 0.001
        self.minibatch_size = 64
        self.critic_lr = 0.001
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        # self.max_episode = 100
        # self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)
        self.critic = CriticNetwork(self.sess, self.state_size, self.action_size,
                                    self.critic_lr, self.tau, self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)

        # (The original ran tf.global_variables_initializer() twice;
        # once is enough.)
        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()
        self.noise = OUNoise(self.action_size, self.mu)

    def reset_episode(self):
        # self.actor_noise.reset()
        state = self.env.reset()
        self.last_state = state
        self.ep_ave_max_q = 0
        self.ep_reward = 0
        return state

    def step(self, s, a, r, terminal, s2):
        # Save experience / reward
        # self.memory.add(self.last_state, action, reward, next_state, done)
        # summary_ops, summary_vars = self.build_summaries()
        self.replay_buffer.add(np.reshape(s, (self.actor.s_dim,)),
                               np.reshape(a, (self.actor.a_dim,)),
                               r, terminal,
                               np.reshape(s2, (self.actor.s_dim,)))

        # Learn, if enough samples are available in memory
        if self.replay_buffer.size() > self.minibatch_size:
            s_batch, a_batch, r_batch, t_batch, s2_batch = \
                self.replay_buffer.sample_batch(self.minibatch_size)
            # self.train(s_batch, a_batch, r_batch, t_batch, s2_batch)

            # Compute the TD targets from the target networks
            target_q = self.critic.predict_target(
                s2_batch, self.actor.predict_target(s2_batch))
            y_i = []
            for k in range(self.minibatch_size):
                if t_batch[k]:
                    y_i.append(r_batch[k])
                else:
                    y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

            # Update the critic given the targets
            predicted_q_value, _ = self.critic.train(
                s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))
            # self.ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = self.actor.predict(s_batch)
            grads = self.critic.action_gradients(s_batch, a_outs)
            self.actor.train(s_batch, grads[0])

            # Update target networks
            self.actor.update_target_network()
            self.critic.update_target_network()

        # Roll over last state and action
        self.last_state = s2
        '''
        self.ep_reward += r
        if terminal:
            summary_str = self.sess.run(
                summary_ops,
                feed_dict={summary_vars[0]: self.ep_reward,
                           summary_vars[1]: self.ep_ave_max_q / float(j)})
            writer.add_summary(summary_str, i)
            # writer.flush()
            print('| Reward: {:d} | Qmax: {:.4f}'.format(
                int(self.ep_reward), self.ep_ave_max_q / float(j)))
        '''

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor.predict(states)[0]
        # actornoises = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size))
        # print(actions)
        return actions + self.noise.sample()  # add some noise for exploration

    def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch):
        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))
        y_i = []
        for k in range(self.minibatch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))
        # self.ep_ave_max_q += np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    def build_summaries(self):
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("Reward", episode_reward)
        episode_ave_max_q = tf.Variable(0.)
        tf.summary.scalar("Qmax Value", episode_ave_max_q)
        summary_vars = [episode_reward, episode_ave_max_q]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars
'''
DEFINE THE ACTOR RL AGENT
'''
if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)
    print("Initialized NAF")
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                 env.observation_space.shape[0], env.action_space)
    print("Initialized DDPG actor")

'''
DEFINE REPLAY BUFFER AND NOISE
'''
memory = ReplayMemory(args.replay_size)
ounoise = OUNoise(env.action_space.shape[0])

'''
#############################
Initialize the Evolution Part
#############################
'''
evo = Evo(10)
evo.initialize_fitness()

# TODO: move the training code below into its respective functions
rewards = []  # during training
rewards_test_ERL = []  # during testing ERL policy
rewards_test_DDPG = []

print("Number of hidden units = " + str(args.hidden_size))
print("Batch size = " + str(args.batch_size))
class Agent(object):
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device

        self.actor_local = Actor(state_space.shape, action_space.high.size, max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size, max_action)
        self.actor_optimizer = optimizers.Adam(LR_ACTOR)
        # Let the target start equal to the local network
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(LR_CRITIC)
        # Let the target start equal to the local network
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.current_steps = 0

    def step(self, state, action, reward, done, next_state, train=True) -> None:
        self.memory.store(state, action, reward, done, next_state)
        if train and self.memory.count > BATCH_SIZE and self.memory.count > MIN_MEM_SIZE:
            if self.current_steps % UPDATE_STEPS == 0:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)
            self.current_steps += 1

    @tf.function
    def critic_train(self, states, actions, rewards, dones, next_states):
        with tf.device(self.device):
            # Compute the TD targets y_i from the target networks
            u_t = self.actor_target(next_states)
            q_t = self.critic_target([next_states, u_t])
            yi = tf.cast(rewards, dtype=tf.float64) + \
                tf.cast(GAMMA, dtype=tf.float64) * \
                tf.cast((1 - tf.cast(dones, dtype=tf.int64)), dtype=tf.float64) * \
                tf.cast(q_t, dtype=tf.float64)

            # Compute the MSE between the local Q-values and the targets
            with tf.GradientTape() as tape:
                q_l = tf.cast(self.critic_local([states, actions]), dtype=tf.float64)
                loss = (q_l - yi) * (q_l - yi)
                loss = tf.reduce_mean(loss)

            # Update the critic by minimizing the loss
            dloss_dql = tape.gradient(loss, self.critic_local.trainable_weights)
            self.critic_optimizer.apply_gradients(
                zip(dloss_dql, self.critic_local.trainable_weights))
        return

    @tf.function
    def actor_train(self, states):
        with tf.device(self.device):
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(self.actor_local.trainable_variables)
                u_l = self.actor_local(states)
                q_l = -tf.reduce_mean(self.critic_local([states, u_l]))
            j = tape.gradient(q_l, self.actor_local.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(j, self.actor_local.trainable_variables))
        return

    def learn(self, experiences, gamma) -> None:
        states, actions, rewards, dones, next_states = experiences
        states = np.array(states).reshape(BATCH_SIZE, self.state_size)
        states = tf.convert_to_tensor(states)
        actions = np.array(actions).reshape(BATCH_SIZE, self.action_size)
        actions = tf.convert_to_tensor(actions)
        rewards = np.array(rewards).reshape(BATCH_SIZE, 1)
        next_states = np.array(next_states).reshape(BATCH_SIZE, self.state_size)
        dones = np.array(dones).reshape(BATCH_SIZE, 1)
        self.critic_train(states, actions, rewards, dones, next_states)
        self.actor_train(states)
        self.update_local()
        return

    def update_local(self):
        def soft_updates(local_model: tf.keras.Model,
                         target_model: tf.keras.Model) -> np.ndarray:
            # Polyak averaging: new_target = TAU * local + (1 - TAU) * target
            local_weights = np.array(local_model.get_weights())
            target_weights = np.array(target_model.get_weights())
            assert len(local_weights) == len(target_weights)
            new_weights = TAU * local_weights + (1 - TAU) * target_weights
            return new_weights

        self.actor_target.set_weights(soft_updates(self.actor_local, self.actor_target))
        self.critic_target.set_weights(soft_updates(self.critic_local, self.critic_target))

    def store_weights(self, episode: int) -> None:
        self.actor_target.save_weights(join(CKPTS_PATH, ACTOR_CKPTS, f'cp-{episode}'))
        self.critic_target.save_weights(join(CKPTS_PATH, CRITIC_CKPTS, f'cp-{episode}'))
        return

    def act(self, state, add_noise=True) -> (float, float):
        state = np.array(state).reshape(1, self.state_size)
        pure_action = self.actor_local.predict(state)[0]
        action = self.noise.get_action(pure_action)
        return action, pure_action

    def reset(self):
        self.noise.reset()
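# A hypothetical driver loop for the Agent class above, assuming a Gym-style
# continuous-control environment; the environment name, device string and
# episode count are placeholders, not from the original code.
import gym

env = gym.make('Pendulum-v0')
agent = Agent(env.observation_space, env.action_space,
              max_action=float(env.action_space.high[0]), device='/CPU:0')

for episode in range(10):
    state, done = env.reset(), False
    agent.reset()
    while not done:
        action, _ = agent.act(state)  # noisy action plus the pure action
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, done, next_state)
        state = next_state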
    'done_comparison_data': done_comparison_data,
    'scores': scores
})

# Actions generation
exploration_mu = 0
exploration_theta = 0.15
exploration_sigma = 0.2
action_size = 3
action_low = np.array([1, 0, 1])
action_high = np.array([10, 359, 2000])
action_range = action_high - action_low

# Start with random action
action = np.array([np.random.uniform() for _ in action_low])
noise = OUNoise(action.shape[0], exploration_mu, exploration_theta,
                exploration_sigma)

time_limit = 10
for i in range(10):
    start_time = time.time()
    env.reset()
    done = False
    j = 0
    while not done:
        j += 1
        ns = noise.sample()
        action = action + ns
        v_size, angle, speed = np.array(
            transform_action(action, action_range, action_low), dtype='uint8')
writer = SummaryWriter()

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                 env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(
    initial_stddev=0.05,
    desired_action_stddev=args.noise_scale,
    adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise:
        # Anneal the OU noise scale linearly from noise_scale down to
        # final_noise_scale over the first exploration_end episodes.
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * \
            max(0, args.exploration_end - i_episode) / args.exploration_end + \
            args.final_noise_scale
        ounoise.reset()