def __init__(self,
             in_actor,
             out_actor,
             in_critic,  # e.g. in_critic = n_agent * (state_size + action_size)
             lr_actor=1e-4,
             lr_critic=1e-3,  # the critic should learn faster than the actor
             random_seed=2):
    self.state_size = in_actor
    self.action_size = out_actor
    self.seed = random.seed(random_seed)
    self.params = {"lr_actor": lr_actor,
                   "lr_critic": lr_critic,
                   "optimizer": "adam"}

    self.local_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
    self.target_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
    self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=lr_actor)

    # For a single agent, the critic takes the global observation as input and
    # outputs the action-value Q, e.g. global_state = all_states + all_actions.
    self.local_critic = Critic(in_shape=in_critic).to(device)
    self.target_critic = Critic(in_shape=in_critic).to(device)
    self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=lr_critic)

    # Q: should the local and target networks start with the same weights?
    # A: yes -- hard-copy the local weights into the targets at initialization.
    hard_update_A_from_B(self.target_actor, self.local_actor)
    hard_update_A_from_B(self.target_critic, self.local_critic)

    # Noise process
    self.noise = OUNoise(out_actor, scale=1.0)
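# A minimal sketch of the hard_update_A_from_B helper called above; the name and
# argument order (target first, source second) come from the call site, while the
# body is an assumption: the standard parameter-by-parameter hard copy.
def hard_update_A_from_B(A, B):
    """Copy every parameter of network B into network A."""
    for a_param, b_param in zip(A.parameters(), B.parameters()):
        a_param.data.copy_(b_param.data)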
def __init__(self, actor_size, action_size, critic_size):
    super().__init__()

    if torch.cuda.is_available():
        print('GPU/CUDA works! Happy fast training :)')
        torch.cuda.current_device()
        torch.cuda.empty_cache()
        self.device = torch.device("cuda")
    else:
        print('Training on CPU...')
        self.device = torch.device("cpu")

    self.actor = Actor(actor_size, action_size).to(self.device)
    self.actor_target = Actor(actor_size, action_size).to(self.device)
    self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)

    self.critic = Critic(critic_size).to(self.device)
    self.critic_target = Critic(critic_size).to(self.device)
    self.critic_optim = optim.Adam(self.critic.parameters(), lr=0.001, weight_decay=0)

    self.gamma = 0.95  # 0.99
    self.tau = 0.001
    self.noise = OUNoise(action_size, 2)

    # Hard-copy the online weights into the target networks (tau=1.0).
    self.target_network_update(self.actor_target, self.actor, 1.0)
    self.target_network_update(self.critic_target, self.critic, 1.0)
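# Hedged sketch of the OUNoise process these agents sample exploration noise
# from. Constructor signatures differ across the snippets, so this version and
# its mu/theta/sigma defaults are assumptions, not the original implementation.
import numpy as np

class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: temporally correlated, mean-reverting noise."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # Restart the process at its long-run mean at the start of each episode.
        self.state = self.mu.copy()

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state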
def __init__(self, sess, dimo, dimu, u_bound, critic_lr, actor_lr, critic_l2,
             clip_norm, tau, layer_norm, noisy_layer, gamma, memory_size,
             exploration, batch_size, env_dt):
    self._sess = sess
    self._dimo = dimo
    self._dimu = dimu
    self._critic_l2 = critic_l2
    self._actor_lr = actor_lr
    self._critic_lr = critic_lr
    self._clip_norm = clip_norm
    self._noisy = noisy_layer
    self._gamma = gamma
    self._tau = tau
    self._batch_size = batch_size
    self._u_bound = u_bound
    self._global_step = tf.train.get_or_create_global_step()
    self.ou_noise = OUNoise(dim=dimu, n_step_annealing=exploration, dt=env_dt)
    self._memory = ReplayMemory(memory_size)

    with tf.variable_scope('inputs'):
        self._obs = tf.placeholder(tf.float32, [None, self._dimo], name='state')
        self._u = tf.placeholder(tf.float32, [None, self._dimu], name='action')
        self._t_obs = tf.placeholder(tf.float32, [None, self._dimo], name='target_state')

    with tf.variable_scope('actor'):
        self._actor = Actor('main', self._obs, dimu, layer_norm, noisy_layer)
        self._target_actor = Actor('target', self._t_obs, dimu, layer_norm, noisy_layer)

    with tf.variable_scope('critic'):
        self._critic = Critic('main', self._obs, self._u, layer_norm, noisy_layer)
        self._critic_pi = Critic('main', self._obs,
                                 U.scaling(self._actor.pi, -1.0, 1.0,
                                           self._u_bound['low'], self._u_bound['high']),
                                 layer_norm, noisy_layer, reuse=True)
        self._target_critic = Critic('target', self._t_obs,
                                     U.scaling(self._target_actor.pi, -1.0, 1.0,
                                               self._u_bound['low'], self._u_bound['high']),
                                     layer_norm, noisy_layer)

    self._build_train_method()
    self._update_target_op = self._update_target_networks()
def __init__(self, state_size: int, action_size: int, num_agents: int,
             epsilon, random_seed: int):
    """
    Initialize a DDPG Agent Object

    :param state_size: dimension of state (input)
    :param action_size: dimension of action (output)
    :param num_agents: number of concurrent agents in the environment
    :param epsilon: initial value of epsilon for exploration
    :param random_seed: random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.t_step = 0

    # Hyperparameters
    self.buffer_size = 1000000
    self.batch_size = 128
    self.update_every = 10
    self.num_updates = 10
    self.gamma = 0.99
    self.tau = 0.001
    self.lr_actor = 0.0001
    self.lr_critic = 0.001
    self.weight_decay = 0
    self.epsilon = epsilon
    self.epsilon_decay = 0.97
    self.epsilon_min = 0.005

    # Networks (Actor: State -> Action, Critic: (State, Action) -> Value)
    self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(self.device)
    self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

    self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(self.device)
    self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.lr_critic,
                                       weight_decay=self.weight_decay)

    # Initialize the target networks with the same parameters as the local ones
    self.soft_update(self.actor_local, self.actor_target, tau=1)
    self.soft_update(self.critic_local, self.critic_target, tau=1)

    # Noise Setup
    self.noise = OUNoise(self.action_size, random_seed)

    # Replay Buffer Setup
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
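# A minimal sketch of the soft_update method used above (an assumption: the
# standard Polyak average). With tau=1, as called in __init__, it degenerates
# to a hard copy of the local weights into the target network.
def soft_update(self, local_model, target_model, tau):
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)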
def __init__(self, state_dim, action_dim, action_lim, update_type='soft',
             lr_actor=1e-4, lr_critic=1e-3, tau=1e-3, mem_size=1e6,
             batch_size=256, gamma=0.99, other_cars=False, ego_dim=None):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    self.joint_model = False
    if len(state_dim) == 3:
        self.model = ActorCriticCNN(state_dim, action_dim, action_lim)
        self.model_optim = optim.Adam(self.model.parameters(), lr=lr_actor)
        self.target_model = ActorCriticCNN(state_dim, action_dim, action_lim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model.to(self.device)
        self.target_model.to(self.device)
        self.joint_model = True
    else:
        self.actor = Actor(state_dim, action_dim, action_lim,
                           other_cars=other_cars, ego_dim=ego_dim)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.target_actor = Actor(state_dim, action_dim, action_lim,
                                  other_cars=other_cars, ego_dim=ego_dim)
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_actor.eval()

        self.critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=lr_critic,
                                       weight_decay=1e-2)
        self.target_critic = Critic(state_dim, action_dim, other_cars=other_cars,
                                    ego_dim=ego_dim)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.target_critic.eval()

        self.actor.to(self.device)
        self.target_actor.to(self.device)
        self.critic.to(self.device)
        self.target_critic.to(self.device)

    self.action_lim = action_lim
    self.tau = tau  # hard update if tau is None
    self.update_type = update_type
    self.batch_size = batch_size
    self.gamma = gamma

    if self.joint_model:
        mem_size = mem_size // 100
    self.memory = Memory(int(mem_size), action_dim, state_dim)

    mu = np.zeros(action_dim)
    sigma = np.array([0.5, 0.05])
    self.noise = OrnsteinUhlenbeckActionNoise(mu, sigma)
    self.target_noise = OrnsteinUhlenbeckActionNoise(mu, sigma)

    self.initialised = True
    self.training = False
def __init__(self,
             state_size,
             action_size,
             sample_batch_size,
             memory_size=int(1e5),  # replay buffer size
             batch_size=128,        # minibatch size
             gamma=0.99,            # discount factor
             tau=1e-3,              # for soft update of target parameters
             update_every=10,
             lr_actor=1e-4,
             lr_critic=1e-3,
             random_seed=2):
    self.sample_batch_size = sample_batch_size
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.params = {
        "lr_actor": lr_actor,
        "lr_critic": lr_critic,
        "gamma": gamma,
        "tau": tau,
        "memory_size": memory_size,
        "batch_size": batch_size,
        "optimizer": "adam"
    }

    self.actor_local = Actor(state_size, action_size, seed=random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed=random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    self.critic_local = Critic(state_size, action_size, seed=random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed=random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

    self.memory = ReplayBuffer(action_size, memory_size, batch_size, random_seed)

    # Noise process
    self.noise = OUNoise([sample_batch_size, action_size], random_seed)

    self.learn_steps = 0
    self.update_every = update_every
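# Hedged sketch of the ReplayBuffer interface the agents above depend on; the
# original class is not shown, so the field names and sample() return type here
# are assumptions.
import random
from collections import deque, namedtuple

class ReplayBufferSketch:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences evicted first
        self.batch_size = batch_size
        self.experience = namedtuple('Experience',
                                     ['state', 'action', 'reward', 'next_state', 'done'])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniform sampling without prioritization.
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)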
def __init__(self, env_name: str, threads: int, episodes: int,
             entropy_weight: float,
             learning_rate: Union[float, tf.keras.optimizers.schedules.LearningRateSchedule],
             discount_factor: float):
    self.env_name = env_name
    env = gym.make(env_name)
    self.save_dir = os.path.expanduser('~/keras-a3c/models/')
    self.threads = threads
    self.EPISODES = episodes
    self.entropy_weight = entropy_weight
    self.learning_rate = learning_rate
    self.discount_factor = discount_factor

    actor = Actor(action_space_size=env.action_space.n)
    critic = Critic()
    self.global_model = ActorCriticModel(actor, critic)
    self.actor_loss = ActorLoss(entropy_weight)
    self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)

    # Build the model by running a dummy forward pass.
    self.global_model(
        tf.convert_to_tensor(np.random.random((1, env.observation_space.shape[0]))))
def __init__(self, observation_space, action_space, lr_actor, lr_critic,
             gamma, device='cpu', discrete=False, project_dim=8):
    """
    Parameters
    ----------
    observation_space: int
        Number of flattened entries of the state
    action_space: int
        Number of (discrete) possible actions to take
    """
    self.gamma = gamma
    self.n_actions = action_space
    self.discrete = discrete
    if self.discrete:
        self.actor = DiscreteActor(observation_space, action_space, project_dim)
        self.critic = DiscreteCritic(observation_space, project_dim)
    else:
        self.actor = Actor(observation_space, action_space)
        self.critic = Critic(observation_space)
    self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)
    self.device = device
def __init__(self, image_size, input_channels, hidden_channels, output_channels,
             latent_dimension, lr, device, clamp=0.01, gp_weight=10):
    self.image_size = image_size
    self.input_channels = input_channels
    self.hidden_channels = hidden_channels
    self.output_channels = output_channels
    self.latent_dimension = latent_dimension
    self.device = device
    self.clamp = clamp          # weight-clipping bound (vanilla WGAN)
    self.gp_weight = gp_weight  # gradient-penalty weight (WGAN-GP)

    self.critic = Critic(image_size, hidden_channels, input_channels).to(device)
    self.generator = Generator(image_size, latent_dimension, hidden_channels,
                               output_channels).to(device)
    self.critic.apply(self.weights_init)
    self.generator.apply(self.weights_init)

    # Adam with betas=(0, 0.9), as recommended for WGAN-GP; a weight-clipped
    # WGAN would use RMSprop over the same parameters instead.
    self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr,
                                             betas=(0, 0.9))
    self.optimizer_gen = torch.optim.Adam(self.generator.parameters(), lr,
                                          betas=(0, 0.9))

    self.critic_losses = []
    self.gen_losses = []
    self.losses = []
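# Hedged sketch of the WGAN-GP gradient penalty implied by gp_weight above;
# this follows the standard Gulrajani et al. formulation and is not necessarily
# the author's exact code.
import torch

def gradient_penalty(critic, real, fake, device):
    # Score the critic on random interpolations between real and fake images.
    alpha = torch.rand(real.size(0), 1, 1, 1, device=device)
    interp = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
    scores = critic(interp)
    grads, = torch.autograd.grad(outputs=scores, inputs=interp,
                                 grad_outputs=torch.ones_like(scores),
                                 create_graph=True)
    grads = grads.view(grads.size(0), -1)
    # Penalize deviation of the gradient norm from 1 (1-Lipschitz constraint).
    return ((grads.norm(2, dim=1) - 1) ** 2).mean()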
def get_models(latent_dim, model_dim, device, output_dim, channels, init=True):
    generator = Generator(latent_dim, model_dim, channels).to(device)
    critic = Critic(model_dim, output_dim, channels).to(device)
    if init:
        generator.apply(__weights_init_normal)
        critic.apply(__weights_init_normal)
    return generator, critic
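# Sketch of the __weights_init_normal initializer applied above; the body is an
# assumption (the usual DCGAN-style init: N(0, 0.02) conv weights, N(1, 0.02)
# batch-norm scales, zero batch-norm biases).
import torch.nn as nn

def __weights_init_normal(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0.0)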
def __init__(self, state_dim, action_dim, num_shared, device):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.device = device
    self.actor = Actor(state_dim, action_dim, num_shared).to(device)
    self.critic = Critic(state_dim, num_shared).to(device)
def __init__(self, policy: str, action_dim: int, max_action: float, lr: float,
             discount: float, noise_clip: float, policy_noise: float,
             policy_freq: int, actor_rng: jnp.ndarray, critic_rng: jnp.ndarray,
             sample_state: np.ndarray):
    self.discount = discount
    self.noise_clip = noise_clip
    self.policy_noise = policy_noise
    self.policy_freq = policy_freq
    self.max_action = max_action
    self.td3_update = policy == 'TD3'

    self.actor = hk.transform(lambda x: Actor(action_dim, max_action)(x))
    actor_opt_init, self.actor_opt_update = optix.adam(lr)

    self.critic = hk.transform(lambda x: Critic()(x))
    critic_opt_init, self.critic_opt_update = optix.adam(lr)

    # Target networks start as exact copies of the online parameters.
    self.actor_params = self.target_actor_params = self.actor.init(actor_rng, sample_state)
    self.actor_opt_state = actor_opt_init(self.actor_params)

    action = self.actor.apply(self.actor_params, sample_state)
    self.critic_params = self.target_critic_params = self.critic.init(
        critic_rng, jnp.concatenate((sample_state, action), 0))
    self.critic_opt_state = critic_opt_init(self.critic_params)

    self.updates = 0
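# Hedged sketch of the Polyak target update that typically pairs with the
# TD3/DDPG setup above; tau is an assumed hyperparameter, and tree_map mirrors
# the actor/critic parameter pytrees created by hk.transform.
import jax

def polyak_update(target_params, online_params, tau=0.005):
    return jax.tree_map(lambda t, o: (1.0 - tau) * t + tau * o,
                        target_params, online_params)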
def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process (one per agent)
    self.noise = [OUNoise(action_size, random_seed, sigma=0.1)
                  for i in range(self.num_agents)]

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    # Make sure the targets start with the same weights as the source networks
    self.hard_update(self.actor_target, self.actor_local)
    self.hard_update(self.critic_target, self.critic_local)

    self.t_step = 0
def __init__(self, state_dim, action_dim, device):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.device = device

    self.actor = Actor(state_dim, action_dim).to(device)
    self.critic = Critic(state_dim).to(device)
    # One optimizer over the joint actor and critic parameters.
    self.optimizer = torch.optim.Adam(
        itertools.chain(self.actor.parameters(), self.critic.parameters()), LR)

    # A rotating pool of auxiliary critics ("philosophers").
    self.philosophers = list()
    for i in range(P_COUNT):
        self.philosophers.append(Critic(state_dim).to(device))
    self.p_optimizers = [torch.optim.Adam(p.parameters(), lr=P_LR)
                         for p in self.philosophers]
    self.update_cnt = 0
def __init__(self, num_agents, state_size, action_size, random_seed=2018):
    """Initialize an Agent object.

    Params
    ======
        num_agents (int): number of agents
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed,
                               self.device)
def __init__(self):
    self.max_action = 1
    self.policy_freq = 2
    self.policy_freq_it = 0
    self.batch_size = 512
    self.discount = 0.99
    self.device = 'cuda'
    self.state_dim = 24
    self.action_dim = 2
    self.policy_noise = 0.1
    self.agents = 1
    self.random_period = 1e4
    self.tau = 5e-3

    buffer_size = int(1e5)
    self.replay_buffer = ReplayBuffer(buffer_size)

    self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
    self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
    # self.actor.load_state_dict(torch.load('actor2.pth'))
    # self.actor_target.load_state_dict(torch.load('actor2.pth'))

    self.noise = OUNoise(2, 32)

    # Critic input is 48-dimensional (twice the 24-dim state).
    self.critic = Critic(48, self.action_dim).to(self.device)
    self.critic_target = Critic(48, self.action_dim).to(self.device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)
def __init__(self, env, hidden_size=256, actor_lr=1e-4, critic_lr=1e-3,
             gamma=0.99, tau=1e-3, max_memory=int(1e6)):
    obs = env.reset()
    # Goal-conditioned input: concatenate desired_goal with the observation.
    self.num_states = obs['desired_goal'].shape[0] + obs['observation'].shape[0]
    self.num_actions = env.action_space.shape[0]
    self.gamma = gamma
    self.tau = tau
    self.action_max = env.action_space.high[0]

    self.actor = Actor(self.num_states, hidden_size, self.num_actions)
    self.critic = Critic(self.num_states + self.num_actions, hidden_size, 1)
    self.target_actor = Actor(self.num_states, hidden_size, self.num_actions)
    self.target_critic = Critic(self.num_states + self.num_actions, hidden_size, 1)

    # Hard-copy the online weights into the target networks.
    for target_param, param in zip(self.target_actor.parameters(),
                                   self.actor.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.target_critic.parameters(),
                                   self.critic.parameters()):
        target_param.data.copy_(param.data)

    self.experience_replay = ExperienceReplay(max_memory)
    self.critic_loss_func = nn.MSELoss()
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
def __init__(self, device, state_size, action_size, folder, config):
    self.folder = folder
    self.config = config
    self.device = device
    self.memory = ReplayMemory(self.config["MEMORY_CAPACITY"])

    self.state_size = state_size
    self.action_size = action_size

    self.critic = Critic(self.state_size, self.action_size, self.device, self.config)
    self.actor = Actor(self.state_size, self.action_size, self.device, self.config)
def __init__(self, device, memory, state_size, action_size, low_bound,
             high_bound, folder, config):
    self.folder = folder
    self.config = config
    self.device = device
    self.memory = memory

    self.state_size = state_size
    self.action_size = action_size
    self.low_bound = low_bound
    self.high_bound = high_bound

    self.critic = Critic(state_size, action_size, device, self.config)
    self.actor = Actor(state_size, action_size, low_bound, high_bound, device,
                       self.config)
def __init__(self, state_size, action_size, seed):
    """
    Initialize a DDPG Agent Object

    :param state_size: dimension of state (input) for this decentralized actor
    :param action_size: dimension of action (output) for this decentralized actor
    :param seed: random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = seed
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Hyperparameters
    self.buffer_size = 100000
    self.batch_size = 256
    self.gamma = 0.99
    self.tau = 0.01
    self.lr_actor = 0.0001
    self.lr_critic = 0.001

    # Setup Networks (Actor: State -> Action,
    # Critic: (States for all agents, Actions for all agents) -> Value)
    self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(self.device)
    self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

    self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(self.device)
    self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic)

    # Initialize local and target networks with the same parameters
    self.soft_update(self.actor_local, self.actor_target, tau=1)
    self.soft_update(self.critic_local, self.critic_target, tau=1)

    # Noise Setup
    self.noise = OUNoise(self.action_size, self.seed)

    # Replay Buffer Setup
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
def __init__(self, state_size, action_size, fc1_units, fc2_units):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        fc1_units (int): size of the first hidden layer
        fc2_units (int): size of the second hidden layer
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = torch.manual_seed(SEED)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, fc1_units, fc2_units).to(device)
    self.actor_target = Actor(state_size, action_size, fc1_units, fc2_units).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, fc1_units, fc2_units).to(device)
    self.critic_target = Critic(state_size, action_size, fc1_units, fc2_units).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OrnsteinUhlenbeck(action_size, SEED)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, SEED, device)
def __init__(self, env, log_dir, gamma=0.99, batch_size=64, sigma=0.2,
             batch_norm=True, merge_layer=2, buffer_size=int(1e6),
             buffer_min=int(1e4), tau=1e-3, Q_wd=1e-2, num_episodes=1000):
    self.s_dim = env.reset().shape[0]
    # self.a_dim = env.action_space.shape[0]
    self.a_dim = env.action_space2.shape[0]
    # self.a_dim = 1
    self.env = env

    # self.mu = Actor(self.s_dim, self.a_dim, env.action_space, batch_norm=batch_norm)
    self.mu = Actor(self.s_dim, self.a_dim, env.action_space2, batch_norm=batch_norm)
    self.Q = Critic(self.s_dim, self.a_dim, batch_norm=batch_norm, merge_layer=merge_layer)
    self.targ_mu = copy.deepcopy(self.mu).eval()
    self.targ_Q = copy.deepcopy(self.Q).eval()

    self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim),
                                   sigma=sigma * torch.ones(self.a_dim))
    self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim)
    self.buffer_min = buffer_min

    self.mse_fn = torch.nn.MSELoss()
    self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4)
    self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=1e-3,
                                        weight_decay=Q_wd)

    self.gamma = gamma
    self.batch_size = batch_size
    self.num_episodes = num_episodes
    self.tau = tau
    self.log_dir = log_dir

    self.fill_buffer()
def __init__(self, args):
    self.args = args
    self.critic = Critic(args.dim_s, args.dim_a, args.dim_h, args.device)
    self.optim = torch.optim.Adam(self.critic.parameters(), lr=0.001)
    self.scheduler_lr = torch.optim.lr_scheduler.StepLR(self.optim,
                                                        step_size=1000,
                                                        gamma=0.9,
                                                        last_epoch=-1)
    self.predicate = get_predicate()
    f = get_formula()
    # Attach an initial weight of 0 to every formula.
    self.formula = add_w_to_formula(f, [0 for i in f])
    self.database = []
    self.data, self.mln = self.model_config(self.predicate, self.formula,
                                            self.database, 'TicTacToe.mln',
                                            'TicTacToe.db')
    self.state_list = []
    self.step = 0
    self.action_list = {0: {0: 'Place(0,0)', 1: 'Place(0,1)', 2: 'Place(0,2)'},
                        1: {0: 'Place(1,0)', 1: 'Place(1,1)', 2: 'Place(1,2)'},
                        2: {0: 'Place(2,0)', 1: 'Place(2,1)', 2: 'Place(2,2)'}}
    self.EPSILON = 1
def __init__(self, args, env):
    self.learning_rate = args.learning_rate
    self.gamma = args.gamma
    self.lamb = args.lamb
    self.batch_size = args.batch_size
    self.step = 0
    self.epochs = args.epochs

    self.actor = Actor()
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.learning_rate)
    self.critic = Critic()
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.learning_rate)

    self.env = env
    self.num_actions = env.num_actions
    self.num_states = env.num_states
    self.data = {'step': [], 'reward': [], 'losses': []}
def __init__(self, env_name: str, save_dir: str, entropy_weight: float,
             discount_factor: float, id: int):
    import tensorflow as tf

    self.env_name = env_name
    self.env = gym.make(env_name)
    self.id = id
    self.save_dir = save_dir
    self.discount_factor = discount_factor

    actor = Actor(action_space_size=self.env.action_space.n)
    critic = Critic()
    self.local_model = ActorCriticModel(actor, critic)
    self.actor_loss = ActorLoss(entropy_weight)
    self.local_model(
        tf.convert_to_tensor(np.random.random((1, self.env.observation_space.shape[0]))))
def update(self, trajectories):
    trajectories = map(self._compute_lambda_returns_and_gae, trajectories)
    # Turn a list of trajectories into a flat list of transitions.
    transitions = sum(trajectories, [])

    state, action, old_log_prob, target_value, advantage = zip(*transitions)
    state = torch.from_numpy(np.array(state)).float().to(self.device)
    action = torch.from_numpy(np.array(action)).float().to(self.device)
    old_log_prob = torch.from_numpy(np.array(old_log_prob)).float().to(self.device)
    target_value = torch.from_numpy(np.array(target_value)).float().to(self.device)
    advantage = torch.from_numpy(np.array(advantage)).float().to(self.device)

    for _ in range(BATCHES_PER_UPDATE):
        idx = np.random.randint(0, len(transitions), BATCH_SIZE)
        loss = self._calc_loss(state[idx], action[idx], old_log_prob[idx],
                               target_value[idx], advantage[idx])

        self.optimizer.zero_grad()
        for p_optimizer in self.p_optimizers:
            p_optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        for p_optimizer in self.p_optimizers:
            p_optimizer.step()

    self.update_cnt += 1
    if self.update_cnt % P_DELAY == 0:
        # Promote the oldest philosopher to active critic and append a freshly
        # initialized one at the back of the queue.
        self.critic = self.philosophers[0]
        self.optimizer = self.p_optimizers[0]
        self.philosophers.pop(0)
        self.philosophers.append(Critic(self.state_dim).to(self.device))
        self.p_optimizers.pop(0)
        self.p_optimizers.append(
            torch.optim.Adam(self.philosophers[-1].parameters(), lr=P_LR))
# Load the datasets
X_set, Y_set = load_data()

# Wrap the datasets in infinite iterators
X_data = get_infinite_X_data(X_set)
Y_data = get_infinite_Y_data(Y_set)

########################################################
# Define device, neural nets, losses, optimizers, etc. #
########################################################

# Automatic GPU/CPU device placement
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Networks
C_X = Critic().to(device)  # Criticizes X data
C_Y = Critic().to(device)  # Criticizes Y data
G = Generator(upsample=UPSAMPLE).to(device)  # Translates X -> Y
F = Generator(upsample=UPSAMPLE).to(device)  # Translates Y -> X

# Losses
l1_loss = nn.L1Loss()

# Optimizers
C_X_optim = optim.Adam(C_X.parameters(), lr=LR, betas=(BETA1, BETA2))
C_Y_optim = optim.Adam(C_Y.parameters(), lr=LR, betas=(BETA1, BETA2))
G_optim = optim.Adam(G.parameters(), lr=LR, betas=(BETA1, BETA2))
F_optim = optim.Adam(F.parameters(), lr=LR, betas=(BETA1, BETA2))

############
# Training #
############
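# Sketch of how the l1_loss above is typically combined with G and F for
# CycleGAN's cycle-consistency term; lambda_cyc is an assumed weight not
# defined in the original snippet.
def cycle_consistency_loss(x, y, lambda_cyc=10.0):
    # X -> Y -> X and Y -> X -> Y reconstructions should match the inputs.
    return lambda_cyc * (l1_loss(F(G(x)), x) + l1_loss(G(F(y)), y))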
    critic = torch.load(model_path + '_critic.pt')
    print('...done')
    run(actor, env, min_rate=0.05, writer=writer, render=True)

else:  # TRAIN MODE
    # Non-linearity is an argument
    non_linear = None
    if args.non_linear == 'relu':
        non_linear = torch.nn.ReLU()
    elif args.non_linear == 'elu':
        non_linear = torch.nn.ELU()

    # New actor and critic policies
    actor = Actor(use_gpu=use_gpu, non_linear=non_linear, batch_norm=args.batch_norm)
    critic = Critic(use_gpu=use_gpu, non_linear=non_linear, batch_norm=args.batch_norm)

    for i in range(args.num_train_cycles):
        print('Training cycle %s of %s' % (i, args.num_train_cycles))
        act(actor, env, task, B,
            num_trajectories=args.num_trajectories,
            task_period=30, writer=writer)
        learn(actor, critic, task, B,
              num_learning_iterations=args.num_learning_iterations,
              episode_batch_size=args.episode_batch_size,
              lr=0.0002, writer=writer, loss=args.loss)
        run(actor, env, min_rate=0.05, writer=writer)

        # Remove early trajectories when the buffer gets too large
        B = B[-args.buffer_size:]

    # Save the model to local directory
def run_seed(seed_params):
    '''
    Runs a learning simulation for the seed in question.

    Parameters
    ----------
    seed_params : dict

    Returns
    -------
    Seed_rewards : np.array
        Reward history for the seed.
    Seed_entropies : np.array
        Entropy history for the seed.
    Loss_history : list
        History of the losses of the critic's value function.
    '''
    env_params = seed_params['Env']
    sim_params = seed_params['Sim']
    hyper_params = seed_params['Hyp']
    seed = seed_params['seed']

    env = Environment(env_params, n_cyclists=sim_params['n_cyclists'])
    action_space = env.action_space
    np.random.seed(seed)
    state_normaliser = State_normaliser(env_params)
    nA = len(env.action_space)

    # Init weights
    model = []
    for i in range(sim_params['n_cyclists']):
        model.append({
            'actor': Actor(env, hyper_params, seed),
            'critic': Critic(env, hyper_params, seed)
        })

    # Keep stats for final print of graph
    episode_rewards = []
    episode_entropies = []

    # Main loop
    for episode in range(sim_params['n_episodes']):
        cyclists_done = []
        # Keep track of game score to print
        states = env.reset()
        states = [state_normaliser.normalise_state(state) for state in states]
        step_rewards = [[] for i in range(sim_params['n_cyclists'])]
        step_entropies = [[] for i in range(sim_params['n_cyclists'])]
        scores = [0] * sim_params['n_cyclists']
        step = 0
        while True:
            actions = []
            for state in states:
                cyclist_number = state['number']
                # Sample from policy and take action in environment
                if cyclist_number in cyclists_done:
                    model[cyclist_number]['critic'].store_dead(
                        state['state'], model[state['number']]['actor'])
                    actions.append(0)
                else:
                    try:
                        probs, action, entropy = model[cyclist_number]['actor'].choose_action(state['state'])
                    except ValueError:
                        break
                    step_entropies[cyclist_number].append(entropy)
                    actions.append(action)
                    model[cyclist_number]['critic'].store_transition_1(
                        state['state'], action, entropy, probs)

            next_states, rewards, done, cyclists_done = env.step(actions)
            next_states = [state_normaliser.normalise_state(state)
                           for state in next_states]
            for next_state, reward in zip(next_states, rewards):
                cyclist_number = next_state['number']
                model[next_state['number']]['critic'].store_transition_2(
                    reward, next_state['state'], model[cyclist_number]['actor'])

            states = next_states.copy()
            for state, reward in zip(states, rewards):
                scores[state['number']] += reward
                step_rewards[state['number']].append(reward)
            step += 1

            if done:
                # print(state['state'])
                [model[state['number']]['critic'].store_dead(
                    state['state'], model[state['number']]['actor'])
                 for state in states]
                break

        episode_entropies.append([np.mean(entrops) for entrops in step_entropies])
        # Append for logging and print
        episode_rewards.append(scores)
        if sim_params['print_rewards']:
            print(f'Seed: {seed}, EP: {episode}, Score: {scores}', flush=True)

    print()
    Seed_entropies = np.array(episode_entropies)
    Seed_rewards = np.array(episode_rewards)
    return Seed_rewards, Seed_entropies, [c['critic'].losses for c in model]
def __init__(self, obs_shape, action_shape, device, model_kind, kind='D',
             step_MVE=5, hidden_dim=256, discount=0.99, init_temperature=0.01,
             alpha_lr=1e-3, alpha_beta=0.9, actor_lr=1e-3, actor_beta=0.9,
             actor_log_std_min=-10, actor_log_std_max=2, critic_lr=1e-3,
             critic_beta=0.9, critic_tau=0.005, critic_target_update_freq=2,
             model_lr=1e-3, log_interval=100):
    self.device = device
    self.discount = discount
    self.critic_tau = critic_tau
    self.critic_target_update_freq = critic_target_update_freq
    self.log_interval = log_interval
    self.step_MVE = step_MVE
    self.model_kind = model_kind

    self.actor = Actor(obs_shape, action_shape, hidden_dim,
                       actor_log_std_min, actor_log_std_max).to(device)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                            lr=actor_lr, betas=(actor_beta, 0.999))

    self.critic = Critic(obs_shape, action_shape, hidden_dim).to(device)
    self.critic_target = Critic(obs_shape, action_shape, hidden_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                             lr=critic_lr, betas=(critic_beta, 0.999))

    self.log_alpha = torch.tensor(np.log(init_temperature)).to(device)
    self.log_alpha.requires_grad = True
    self.target_entropy = -np.prod(action_shape)  # set target entropy to -|A|
    self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr,
                                                betas=(alpha_beta, 0.999))

    if self.model_kind == 'dynode_model':
        self.model = DyNODE(obs_shape, action_shape,
                            hidden_dim_p=200, hidden_dim_r=200).to(device)
    elif self.model_kind == 'nn_model':
        self.model = NN_Model(obs_shape, action_shape, hidden_dim_p=200,
                              hidden_dim_r=200, kind=kind).to(device)
    else:
        raise ValueError('model is not supported')
    self.model_optimizer = torch.optim.Adam(self.model.parameters(), lr=model_lr)

    self.train()
    self.critic_target.train()