def __init__(self,
             action_size: int,
             state_dim: int,
             action_dim: int,
             gamma: float,
             sess: tf.Session,
             optimizer: tf.train.Optimizer = tf.train.AdamOptimizer(learning_rate=0.001),
             max_tf_checkpoints_to_keep: int = 3,
             experience_size: int = 1000,
             per: bool = False,
             batch_size: int = 64,
             start_steps: int = 2000):
    self.optimizer = optimizer
    self.sess = sess
    self.gamma = gamma
    self.action_dim = action_dim
    self.state_dim = state_dim
    self.action_size = action_size
    self.per = per
    self.actor = ActorNetwork(action_size=action_size,
                              state_dim=state_dim,
                              action_dim=action_dim,
                              sess=sess,
                              optimizer=optimizer)
    self.critic = CriticNetwork(action_size=action_size,
                                state_dim=state_dim,
                                action_dim=action_dim,
                                sess=sess,
                                optimizer=optimizer,
                                gamma=gamma)
    self.eval_mode = False
    self.t = 0
    self.start_steps = start_steps
    self.training_steps = 0
    self.epsilon = 1
    self.batch_size = batch_size
    self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep)
    if self.per:
        self._replay = PER(experience_size)
    else:
        self._replay = ReplayBuffer(experience_size)
    self._last_state = None
    self._last_items = None
    self._last_action = None
    self.td_losses = []
    self.qvalues = []
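A minimal usage sketch for the constructor above; the class name DDPGRecAgent, the session setup, and the argument values are illustrative assumptions, not from the source.

# Hypothetical usage sketch (class name and values assumed for illustration only)
sess = tf.Session()
agent = DDPGRecAgent(action_size=10, state_dim=32, action_dim=8,
                     gamma=0.99, sess=sess, per=True, batch_size=64)
sess.run(tf.global_variables_initializer())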
def __init__(self,
             sess,
             state_dim,
             action_dim,
             epsilon=0.4,
             action_size=4,
             logdir='./logs/',
             replay_size=1000,
             batch_size=64):
    self._state_dim = state_dim
    self._action_dim = action_dim
    self._action_size = action_size
    self._logdir = logdir
    self._sess = sess
    self.epsilon = epsilon
    self.gamma = 0.9
    self.lr = 1e-4
    self.optimizer = tf.train.AdadeltaOptimizer(self.lr)

    # Online network and its Q-values on the (state, action) input
    self.state, self.action, self.agent, self.weights = self._create_network('agent')
    self.qvalues = self.agent(tf.concat([self.state, self.action], axis=-1))

    # Target network and its Q-values
    self.target_state, self.target_action, self.target, self.target_weights = \
        self._create_network('target')
    self.target_qvalues = self.target(
        tf.concat([self.target_state, self.target_action], axis=-1))

    self.train_op, self.td_loss = self._create_train_op()
    self.target_update_op = self._create_target_update_op()

    # TensorBoard logging
    self.merged = tf.summary.merge_all()
    self.train_writer = tf.summary.FileWriter(self._logdir, self._sess.graph)
    self.summary = None

    self._replay = ReplayBuffer(replay_size)
    self.batch_size = batch_size
    self.td_losses = []
    self._last_state = None
    self._last_items = None
    self._last_action = None
    self.eval_mode = False
    self.training_steps = 0
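The constructor above calls _create_target_update_op, which is not shown. A minimal TF 1.x-style sketch of a hard target-update op, assuming self.weights and self.target_weights are parallel lists of variables, could look like this (a sketch, not the project's actual implementation):

def _create_target_update_op(self):
    # Sketch only: copy each online-network variable into its target counterpart.
    assigns = [tf.assign(w_target, w_agent)
               for w_agent, w_target in zip(self.weights, self.target_weights)]
    return tf.group(*assigns)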
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0.0        # 0
    self.exploration_theta = 0.125   # 0.14 | 0.1
    self.exploration_sigma = 0.0009  # 0.001 | 0.2 | 0.001
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.998  # 0.99 | 0.9 | discount factor
    self.tau = 0.099    # 0.001 | 0.01 | 0.1 | 0.05 | for soft update of target parameters

    # Score tracker
    self.best_score = -np.inf
    self.score = 0
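The OUNoise process referenced above is not shown; a standard Ornstein-Uhlenbeck sketch consistent with the (size, mu, theta, sigma) constructor used here would be as follows (the project's actual class may differ):

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process sketch: x += theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # Drift toward the mean plus Gaussian diffusion.
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state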
def __init__(
    self,
    state_size,
    network_id: str,
    buffer_size: int = int(20000),
    batch_size: int = 64,
    gamma: float = 0.99,
    lr: float = 1e-4,
    train_freq: int = 4,
    target_update_freq: int = 1000,
    min_epsilon: float = 0.05,
    epsilon_decay: float = 0.0005,
    scheduler_id: str = 'linear',
    **kwargs,
):
    super().__init__(**kwargs)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.batch_size = batch_size
    self.gamma = gamma
    self.lr = lr
    self.train_freq = train_freq
    self.epsilon_scheduler = scheduler_hub[scheduler_id](
        1, epsilon_decay, min_epsilon
    )
    self.t_step = 0
    self.memory = ReplayBuffer(buffer_size, batch_size, self.device)
    self.episode_logs = {}

    # self.future_network = FutureNetwork()
    # self.reward_network = RewardNetwork()
    # self.opt_future_reward = optim.RMSprop(self.future_network.parameters(), lr=self.lr)
    # self.keypoint_network = KeypointNetwork()
    # self.opt_keypoint = optim.RMSprop(self.future_network.parameters(), lr=self.lr)

    self.state_encoder = StateEncoder(state_size).to(self.device)
    self.state_decoder = StateDecoder(state_size).to(self.device)
    self.opt_ae = optim.RMSprop(
        list(self.state_encoder.parameters()) + list(self.state_decoder.parameters()),
        lr=self.lr,
    )
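The call scheduler_hub[scheduler_id](1, epsilon_decay, min_epsilon) implies a scheduler built from a start value, a per-step decay, and a floor. A minimal linear-decay sketch under that assumption follows; the class name, the step() method, and the scheduler_hub entry are hypothetical and may not match the real implementation.

class LinearScheduler:
    # Sketch only: assumed (start, decay, minimum) constructor signature.
    def __init__(self, start, decay, minimum):
        self.value = start
        self.decay = decay
        self.minimum = minimum

    def step(self):
        # Decrease epsilon linearly until it reaches the floor.
        self.value = max(self.minimum, self.value - self.decay)
        return self.value

scheduler_hub = {'linear': LinearScheduler}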
gif_req_m = mp.Value('i', -1)
data_proc_list = []
for _ in range(hp.N_ROLLOUT_PROCESSES):
    data_proc = mp.Process(target=data_func,
                           args=(pi, device, exp_queue, finish_event,
                                 gif_req_m, hp))
    data_proc.start()
    data_proc_list.append(data_proc)

# Training
tgt_Q = TargetCritic(Q)
pi_opt = optim.Adam(pi.parameters(), lr=hp.LEARNING_RATE)
Q_opt = optim.Adam(Q.parameters(), lr=hp.LEARNING_RATE)
alpha_optim = optim.Adam([log_alpha], lr=hp.LEARNING_RATE)
buffer = ReplayBuffer(buffer_size=hp.REPLAY_SIZE,
                      observation_space=hp.observation_space,
                      action_space=hp.action_space,
                      device=hp.DEVICE)
n_grads = 0
n_samples = 0
n_episodes = 0
best_reward = None
last_gif = None

try:
    while n_grads < hp.TOTAL_GRAD_STEPS:
        metrics = {}
        ep_infos = list()
        st_time = time.perf_counter()

        # Collect EXP_GRAD_RATIO samples for each grad step
        new_samples = 0
        while new_samples < hp.EXP_GRAD_RATIO:
def main(args):
    device = "cuda" if args.cuda else "cpu"
    mp.set_start_method('spawn')

    # Input experiment hyperparameters
    hp = SACHP(EXP_NAME=args.name,
               DEVICE=device,
               ENV_NAME=args.env,
               N_ROLLOUT_PROCESSES=3,
               LEARNING_RATE=0.0001,
               EXP_GRAD_RATIO=10,
               BATCH_SIZE=256,
               GAMMA=0.95,
               REWARD_STEPS=3,
               ALPHA=0.015,
               LOG_SIG_MAX=2,
               LOG_SIG_MIN=-20,
               EPSILON=1e-6,
               REPLAY_SIZE=100000,
               REPLAY_INITIAL=512,
               SAVE_FREQUENCY=100000,
               GIF_FREQUENCY=100000,
               TOTAL_GRAD_STEPS=1000000)
    wandb.init(project='RoboCIn-RL',
               name=hp.EXP_NAME,
               entity='robocin',
               config=hp.to_dict())
    current_time = datetime.datetime.now().strftime('%b-%d_%H-%M-%S')
    tb_path = os.path.join(
        'runs', current_time + '_' + hp.ENV_NAME + '_' + hp.EXP_NAME)

    # Training
    sac = SAC(hp)
    buffer = ReplayBuffer(buffer_size=hp.REPLAY_SIZE,
                          observation_space=hp.observation_space,
                          action_space=hp.action_space,
                          device=hp.DEVICE)

    # Playing
    sac.share_memory()
    exp_queue = mp.Queue(maxsize=hp.EXP_GRAD_RATIO)
    finish_event = mp.Event()
    gif_req_m = mp.Value('i', -1)
    data_proc = mp.Process(target=rollout,
                           args=(sac, device, exp_queue, finish_event,
                                 gif_req_m, hp))
    data_proc.start()

    n_grads = 0
    n_samples = 0
    n_episodes = 0
    best_reward = None
    last_gif = None
    try:
        while n_grads < hp.TOTAL_GRAD_STEPS:
            metrics = {}
            ep_infos = list()
            st_time = time.perf_counter()

            # Collect EXP_GRAD_RATIO samples for each grad step
            new_samples = 0
            while new_samples < hp.EXP_GRAD_RATIO:
                exp = exp_queue.get()
                if exp is None:
                    raise Exception  # got None value in queue
                safe_exp = copy.deepcopy(exp)
                del exp

                # A dict is returned with end-of-episode info
                if isinstance(safe_exp, dict):
                    logs = {
                        "ep_info/" + key: value
                        for key, value in safe_exp.items()
                        if 'truncated' not in key
                    }
                    ep_infos.append(logs)
                    n_episodes += 1
                else:
                    if safe_exp.last_state is not None:
                        last_state = safe_exp.last_state
                    else:
                        last_state = safe_exp.state
                    buffer.add(obs=safe_exp.state,
                               next_obs=last_state,
                               action=safe_exp.action,
                               reward=safe_exp.reward,
                               done=False if safe_exp.last_state is not None else True)
                    new_samples += 1
            n_samples += new_samples
            sample_time = time.perf_counter()

            # Only start training after the buffer is larger than the initial value
            if buffer.size() < hp.REPLAY_INITIAL:
                continue

            # Sample a batch and load it as a tensor on device
            batch = buffer.sample(hp.BATCH_SIZE)
            metrics["train/loss_pi"], metrics["train/loss_Q1"], \
                metrics["train/loss_Q2"], metrics["train/loss_alpha"], \
                metrics["train/alpha"] = sac.update(batch=batch, metrics=metrics)

            n_grads += 1
            grad_time = time.perf_counter()
            metrics['speed/samples'] = new_samples / (sample_time - st_time)
            metrics['speed/grad'] = 1 / (grad_time - sample_time)
            metrics['speed/total'] = 1 / (grad_time - st_time)
            metrics['counters/samples'] = n_samples
            metrics['counters/grads'] = n_grads
            metrics['counters/episodes'] = n_episodes
            metrics["counters/buffer_len"] = buffer.size()

            if ep_infos:
                for key in ep_infos[0].keys():
                    metrics[key] = np.mean([info[key] for info in ep_infos])

            # Log metrics
            wandb.log(metrics)
            if hp.SAVE_FREQUENCY and n_grads % hp.SAVE_FREQUENCY == 0:
                save_checkpoint(hp=hp,
                                metrics={
                                    'alpha': sac.alpha,
                                    'n_samples': n_samples,
                                    'n_grads': n_grads,
                                    'n_episodes': n_episodes
                                },
                                pi=sac.pi,
                                Q=sac.Q,
                                pi_opt=sac.pi_opt,
                                Q_opt=sac.Q_opt)
            if hp.GIF_FREQUENCY and n_grads % hp.GIF_FREQUENCY == 0:
                gif_req_m.value = n_grads

    except KeyboardInterrupt:
        print("...Finishing...")
        finish_event.set()

    finally:
        if exp_queue:
            while exp_queue.qsize() > 0:
                exp_queue.get()
            print('queue is empty')
        print("Waiting for threads to finish...")
        data_proc.terminate()
        data_proc.join()
        del exp_queue
        del sac
        finish_event.set()
def training_loop(agent_config, env_config, vaccination_schedule):
    action_dim = len(env_config["groups"]) * 2 * len(vaccination_schedule)
    state_dim = len(env_config["groups"]) + len(vaccination_schedule) * len(
        env_config["groups"]) + len(vaccination_schedule)
    batch_size = 100
    episodes = 150
    max_iterations = env_config["max_time_steps"]

    env = VaccinationEnvironment(env_config, vaccination_schedule)
    agent = TD3Agent([state_dim + action_dim, 256, 256, 1],
                     [state_dim, 256, 256, action_dim])
    replay_buffer = ReplayBuffer(state_dim, action_dim)

    # get starting state
    state = []
    for items in list(env.get_cases().values()):
        for value in items:
            state.append(value)
    state.append(env.get_vaccines()[0].num)

    # first sample enough data
    for i in range(2 * batch_size):
        available_vaccines = env.get_vaccines()[0].num
        action = agent.act(state)
        vaccination_plan = []
        for index in range(0, len(action), 2):
            group_vac_1 = int(action[index] * available_vaccines)
            group_vac_2 = int(action[index + 1] * available_vaccines)
            plan = Vaccination_Plan(JOHNSON, group_vac_1, group_vac_2)
            vaccination_plan.append([plan])
        info, done = env.step(vaccination_plan, False)
        reward = -sum([values[1] for values in info.values()])
        next_state = []
        for items in list(env.get_cases().values()):
            for value in items:
                next_state.append(value)
        next_state.append(available_vaccines)
        replay_buffer.add(state, action, next_state, reward, done)
        state = next_state

    # training
    rewards = []
    losses = []
    for i in tqdm(range(episodes)):
        episode_reward = []
        episode_loss = []
        env.reset()

        # get starting state
        state = []
        for items in list(env.get_cases().values()):
            for value in items:
                state.append(value)
        state.append(env.get_vaccines()[0].num)

        for j in range(max_iterations):
            available_vaccines = env.get_vaccines()[0].num
            action = agent.act(state)
            vaccination_plan = []
            for index in range(0, len(action), 2):
                group_vac_1 = int(action[index] * available_vaccines)
                group_vac_2 = int(action[index + 1] * available_vaccines)
                plan = Vaccination_Plan(JOHNSON, group_vac_1, group_vac_2)
                vaccination_plan.append([plan])
            info, done = env.step(vaccination_plan, True)
            reward = sum([values[1] for values in info.values()])
            episode_reward.append(reward)
            next_state = []
            for items in list(env.get_cases().values()):
                for value in items:
                    next_state.append(value)
            next_state.append(available_vaccines)
            replay_buffer.add(state, action, next_state, reward, done)
            state = next_state

            # train
            train_states, train_actions, train_next_states, train_reward, train_done = \
                replay_buffer.sample(batch_size)
            loss = agent.train(train_states, train_actions, train_next_states,
                               train_reward, train_done)
            episode_loss.append(loss.detach().numpy())

            if done:
                rewards.append(sum(episode_reward))
                losses.append(sum(episode_loss) / len(episode_loss))
                break

    # finally save data:
    data = pd.DataFrame(data={"rewards": rewards, "losses": losses})
    data.to_csv(
        "P:/Dokumente/3 Uni/WiSe2021/Hackathon/Hackathon_KU/exp/performance.csv",
        sep=",",
        index=False)
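The ReplayBuffer(state_dim, action_dim) used above is not shown; a minimal ring-buffer sketch matching the add(state, action, next_state, reward, done) and sample(batch_size) calls in this loop, with assumed tensor outputs and an assumed max_size, would be:

import numpy as np
import torch

class ReplayBuffer:
    # Sketch only: the actual buffer in the project may differ.
    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.done = np.zeros((max_size, 1))

    def add(self, state, action, next_state, reward, done):
        # Overwrite the oldest entry once the buffer is full.
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.done[self.ptr] = float(done)
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniformly sample a batch and return it as float tensors.
        idx = np.random.randint(0, self.size, size=batch_size)
        return (torch.FloatTensor(self.state[idx]),
                torch.FloatTensor(self.action[idx]),
                torch.FloatTensor(self.next_state[idx]),
                torch.FloatTensor(self.reward[idx]),
                torch.FloatTensor(self.done[idx]))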
        target=data_func,
        args=(pi, device, exp_queue, finish_event, sigma_m, gif_req_m, hp),
    )
    data_proc.start()
    data_proc_list.append(data_proc)

pi_opt = optim.Adam(pi.parameters(), lr=hp.LEARNING_RATE)
Q_opt = optim.Adam(Q.parameters(), lr=hp.LEARNING_RATE)

# Training
tgt_pi = TargetActor(pi)
tgt_Q = TargetCritic(Q)
tgt_Q_strat = TargetCritic(Q_strat)
Q_strat_opt = optim.Adam(Q_strat.parameters(), lr=hp.LEARNING_RATE)
buffer = ReplayBuffer(
    buffer_size=hp.REPLAY_SIZE,
    observation_space=hp.observation_space,
    action_space=hp.action_space,
    device=hp.DEVICE,
    n_rew=hp.N_REWARDS,
)
n_grads = 0
n_samples = 0
n_episodes = 0
best_reward = None
last_gif = None
try:
    alphas = torch.Tensor([0.6600, 0.3200, 0.0053, 0.0080]).to(device)
    while n_grads < hp.TOTAL_GRAD_STEPS:
        metrics = {}
        ep_infos = list()
        st_time = time.perf_counter()
def __init__(self,
             state_size,
             action_size,
             num_agents,
             device,
             seed=23520,
             GRADIENT_CLIP=1,
             ACTIVATION=F.relu,
             BOOTSTRAP_SIZE=5,
             GAMMA=0.99,
             TAU=1e-3,
             LR_CRITIC=5e-4,
             LR_ACTOR=5e-4,
             UPDATE_EVERY=1,
             TRANSFER_EVERY=2,
             UPDATE_LOOP=10,
             ADD_NOISE_EVERY=5,
             WEIGHT_DECAY=0,
             MEMORY_SIZE=5e4,
             BATCH_SIZE=64):
    """Initialize an Agent object.

    Params
    ======
        state_size : dimension of each state
        action_size : dimension of each action
        num_agents : number of running agents
        device : cpu or cuda:0 if available

        ----- These are hyperparameters -----
        BOOTSTRAP_SIZE : how far ahead to bootstrap
        GAMMA : discount factor
        TAU : parameter for performing soft updates of target parameters
        LR_CRITIC, LR_ACTOR : learning rates of the networks
        UPDATE_EVERY : how often to update the networks
        TRANSFER_EVERY : how often to transfer the weights from local to target
        UPDATE_LOOP : number of iterations per network update
        ADD_NOISE_EVERY : how often to add noise to favor exploration
        WEIGHT_DECAY : L2 weight decay for the critic optimizer
        GRADIENT_CLIP : limit at which gradients are clipped, to avoid exploding gradients
    """
    # Actor networks
    self.actor_local = Actor(state_size, action_size).to(device)
    self.actor_target = Actor(state_size, action_size).to(device)
    self.actor_optim = optim.Adam(self.actor_local.parameters(),
                                  lr=LR_ACTOR,
                                  weight_decay=WEIGHT_DECAY)
    hard_update(self.actor_local, self.actor_target)

    # Critic networks
    self.critic_local = Critic(state_size * 2, action_size).to(device)
    self.critic_target = Critic(state_size * 2, action_size).to(device)
    self.critic_optim = optim.Adam(self.critic_local.parameters(),
                                   lr=LR_CRITIC,
                                   weight_decay=WEIGHT_DECAY)
    hard_update(self.critic_local, self.critic_target)

    self.device = device
    self.num_agents = num_agents

    # Noise: using simple noise instead of OUNoise
    self.noise = [
        SimpleNoise(action_size, scale=1) for i in range(num_agents)
    ]

    # Replay memory
    self.memory = ReplayBuffer(action_size, device, int(MEMORY_SIZE),
                               BATCH_SIZE, seed)

    # Initialize time steps (for updating every UPDATE_EVERY steps)
    self.u_step = 0
    self.n_step = 0

    # Keep hyperparameters within the instance
    self.BOOTSTRAP_SIZE = BOOTSTRAP_SIZE
    self.GAMMA = GAMMA
    self.TAU = TAU
    self.LR_CRITIC = LR_CRITIC
    self.LR_ACTOR = LR_ACTOR
    self.UPDATE_EVERY = UPDATE_EVERY
    self.TRANSFER_EVERY = TRANSFER_EVERY
    self.UPDATE_LOOP = UPDATE_LOOP
    self.ADD_NOISE_EVERY = ADD_NOISE_EVERY
    self.GRADIENT_CLIP = GRADIENT_CLIP

    # Store the information from the n previous timesteps needed for BOOTSTRAP_SIZE-step bootstrapping
    self.rewards = deque(maxlen=BOOTSTRAP_SIZE)
    self.states = deque(maxlen=BOOTSTRAP_SIZE)
    self.actions = deque(maxlen=BOOTSTRAP_SIZE)
    self.gammas = np.array([[GAMMA**i for j in range(num_agents)]
                            for i in range(BOOTSTRAP_SIZE)])
    self.loss_function = torch.nn.SmoothL1Loss()
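hard_update is called above with the (local, target) argument order, and TAU is documented as the soft-update coefficient; a common PyTorch sketch of both helpers under those assumptions (the project's own helpers may differ) is:

def hard_update(local_model, target_model):
    # Copy local weights into the target network verbatim.
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(local_param.data)


def soft_update(local_model, target_model, tau):
    # Polyak averaging: target <- tau * local + (1 - tau) * target.
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)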