def sample_worker(self, pid, queue, min_batch_size):
    # perturb the RNG state so parallel workers do not draw identical samples
    torch.randn(pid)
    if hasattr(self.env, 'np_random'):
        self.env.np_random.rand(pid)
    memory = Memory()
    logger = LoggerRL()

    while logger.num_steps < min_batch_size:
        state = self.env.reset()
        if self.running_state is not None:
            state = self.running_state(state)
        logger.start_episode(self.env)
        self.pre_episode()

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            vs_out = self.trans_policy(state_var)
            mean_action = self.mean_action or np.random.binomial(1, 1 - self.noise_rate)
            action = self.policy_net.select_action(vs_out, mean_action)[0].numpy()
            action = int(action) if self.policy_net.type == 'discrete' else action.astype(np.float64)
            next_state, env_reward, done, info = self.env.step(action)
            if self.running_state is not None:
                next_state = self.running_state(next_state)

            if self.custom_reward is not None:
                c_reward, c_info = self.custom_reward(self.env, state, action, info)
                reward = c_reward
            else:
                c_reward, c_info = 0.0, np.array([0.0])
                reward = env_reward
            logger.step(self.env, env_reward, c_reward, c_info)

            mask = 0 if done else 1
            exp = 1 - mean_action
            self.push_memory(memory, state, action, mask, next_state, reward, exp)

            if pid == 0 and self.render:
                self.env.render()
            if done:
                break
            state = next_state

        logger.end_episode(self.env)

    logger.end_sampling()

    if queue is not None:
        queue.put([pid, memory, logger])
    else:
        return memory, logger
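# A minimal sketch of the Memory container the sampler above pushes into
# (hypothetical; the real class may differ). It is assumed to store transitions
# as namedtuples, support merging across workers after the queue gather, and
# hand the whole batch back for the policy update.
from collections import namedtuple
import random

Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward', 'exp'))


class Memory:
    def __init__(self):
        self.memory = []

    def push(self, *args):
        # store one transition
        self.memory.append(Transition(*args))

    def append(self, other):
        # merge another worker's memory after multiprocess sampling
        self.memory += other.memory

    def sample(self, batch_size=None):
        # return everything (or a random subset) as a Transition of tuples
        if batch_size is None:
            return Transition(*zip(*self.memory))
        return Transition(*zip(*random.sample(self.memory, batch_size)))

    def __len__(self):
        return len(self.memory)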
class Simulate(AbstractPredict):
    def __init__(self, path, date_format, start_date, positive_window_size,  # manufacturer,
                 disk_model, columns, features, label, forget_type, bl_delay=False,
                 dropna=False, negative_window_size=6, validation_window=6,
                 bl_regression=False, label_days=None, bl_transfer=False, bl_ssd=False):
        super().__init__()
        self.memory = Memory(path, start_date, positive_window_size,  # manufacturer,
                             disk_model, columns, features, label, forget_type,
                             dropna, bl_delay, negative_window_size, bl_regression,
                             label_days, bl_transfer, date_format, bl_ssd)
        if not bl_transfer:
            self.memory.buffering()
            self.data = self.memory.ret_df.drop(['model', 'date'], axis=1)
        else:
            self.data = self.memory.ret_df.drop(['model', 'date'], axis=1)
        self.data = self.data.reset_index(drop=True)
        self.class_name = label[0]
        self.num_classes = 2
        self.bl_delay = bl_delay
        self.validation_window = validation_window

    def load(self):
        # Load data from the Memory class, backtracking delayed instances
        self.memory.data_management(self.keep_delay, self.bl_delay)
        self.data = self.memory.ret_df.drop(['model', 'date'], axis=1)
        self.data = self.data.reset_index(drop=True)

    def delay_evaluate(self):
        pop_sn = []
        i = 0
        for sn, instances in self.keep_delay.items():
            instances.dequeue()
            if len(instances.queue) == 0:
                pop_sn.append(sn)
            i += 1
        for sn in pop_sn:
            self.keep_delay.pop(sn)

    def run(self):
        self.inspect(self.data, self.class_name, self.num_classes,
                     self.memory.new_inst_start_index, self.validation_window)
def perform_rollout(self, theta, inner=False):
    memory = Memory(self.hp)
    (s1, s2), _ = self.env.reset()
    for t in range(self.hp.len_rollout):
        a1, lp1 = self.act(s1, self.theta)
        a2, lp2 = self.act_opp(s2, theta)
        if self.id > 0:
            (s2, s1), (r2, r1), _, _ = self.env.step((a2, a1))
        else:
            (s1, s2), (r1, r2), _, _ = self.env.step((a1, a2))
        r1 = torch.Tensor(r1)
        r2 = torch.Tensor(r2)
        if inner:
            memory.add(lp2, lp1, r2)
        else:
            memory.add(lp1, lp2, r1)
    return memory
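# Hedged sketch of the rollout Memory assumed by perform_rollout (names are
# guesses): it accumulates both players' per-step log-probs and the rewards,
# then stacks them into (len_rollout, batch) tensors for the inner/outer
# policy-gradient updates.
import torch


class Memory:
    def __init__(self, hp):
        self.hp = hp
        self.self_logprobs = []
        self.other_logprobs = []
        self.rewards = []

    def add(self, lp, other_lp, r):
        self.self_logprobs.append(lp)
        self.other_logprobs.append(other_lp)
        self.rewards.append(r)

    def get_content(self):
        # stack along the time dimension
        return (torch.stack(self.self_logprobs, dim=0),
                torch.stack(self.other_logprobs, dim=0),
                torch.stack(self.rewards, dim=0))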
                          axis=0), mask)
        if done:
            print(step_no, 'episode reward- ' + str(eps_reward))
            eps_reward = 0
            state = env.reset()
            state_list = []
            for i in range(args.past_frames - 1):
                state_list.append(initial[0])
            state_list.append(state[0])
        if len(memory) < args.batch_size:
            batch = memory.sample()
        else:
            batch = memory.sample(size=args.batch_size)
        update_dqn(model, batch, args, criterion, optimizer, device)
        step_no += 1


inp_channels = 4
env = gym.make(args.env_name)
env = AtariRescale105x80(env)
num_actions = env.action_space.n
dqn = DQN(inp_channels, num_actions)
memory = Memory(limit=args.replay_size)
device = torch.device('cuda')
dqn.to(device)
train_model(dqn, env, memory, args, device)
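# Hedged sketch of the update_dqn step called above (assumed semantics and
# batch layout; the gamma attribute on args is an assumption). It applies a
# standard one-step TD(0) target; the real code may use a separate target
# network or a Huber loss instead.
import torch


def update_dqn(model, batch, args, criterion, optimizer, device):
    states, actions, rewards, next_states, masks = batch  # assumed batch layout
    states = torch.as_tensor(states, dtype=torch.float32, device=device)
    next_states = torch.as_tensor(next_states, dtype=torch.float32, device=device)
    actions = torch.as_tensor(actions, dtype=torch.int64, device=device).unsqueeze(1)
    rewards = torch.as_tensor(rewards, dtype=torch.float32, device=device)
    masks = torch.as_tensor(masks, dtype=torch.float32, device=device)

    q = model(states).gather(1, actions).squeeze(1)
    with torch.no_grad():
        target = rewards + args.gamma * masks * model(next_states).max(1)[0]

    loss = criterion(q, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()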
class PolicyGradient(Agent): def __init__(self, env: Env, lr: float, gamma: float = 0.99, layers=(128, 128), verbose=False, model_path=None, save=False): super().__init__(env, verbose, save) self.gamma = gamma self.model_path = model_path if self.action_space.discrete: head = nn.Softmax(dim=-1) else: head = nn.Tanh() self.model = MLP(self.state_space.shape[0], self.action_space.shape[0], layers, head) self.optimizer = optim.Adam(self.model.parameters(), lr=lr) self.reset() def setup_memory(self) -> None: columns = ["states", "next_states", "actions", "log_probs", "rewards"] self.episode_memory = Memory(columns) self.epoch_memory = Memory(columns) def act(self, state: List, train: bool = True) -> Tuple: state = torch.from_numpy(state).type(torch.FloatTensor) action_probs = self.model(state) distribution = self.action_space.distribution(action_probs) action = distribution.sample() if train: return action.data.numpy(), distribution.log_prob(action) else: return torch.argmax(action_probs).data.numpy(), def update(self) -> None: self.optimizer.zero_grad() loss, = self.epoch_memory.get_columns(["loss"]) loss = torch.mean(torch.stack(loss)) loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1) self.optimizer.step() print(f"Value Loss: {loss.item()}") self.reset() def save_model(self) -> None: torch.save(self.model.state_dict(), self.model_path) def load_model(self, model_path: str) -> None: self.model.load_state_dict(torch.load(model_path)) self.model.eval() def setup_schedulers(self, n_epochs: int) -> None: scheduler = CosineAnnealingLR(self.optimizer, n_epochs) self.schedulers.append(scheduler) def cumulate_rewards(self) -> None: cumulated_reward = 0 cumulated_rewards = [] rewards, log_probs = self.episode_memory.get_columns( ["rewards", "log_probs"]) for i in range(len(rewards) - 1, -1, -1): cumulated_reward = self.gamma * cumulated_reward + rewards[i] cumulated_rewards.append(cumulated_reward) cumulated_rewards = cumulated_rewards[::-1] loss = -torch.sum( torch.mul(torch.stack(log_probs), torch.Tensor(cumulated_rewards))) self.episode_memory.append_column("loss", loss) self.episode_memory.extend_column("cumulated_rewards", cumulated_rewards)
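# Hedged sketch of the column-based Memory the agents above construct as
# Memory(columns) (hypothetical implementation): each column is a named list,
# which is enough to support the get_columns / append_column / extend_column
# calls used here; how transitions are written in is assumed, not taken from
# the original.
class Memory:
    def __init__(self, columns):
        self.columns = {name: [] for name in columns}

    def store(self, **kwargs):
        # append one value per named column
        for name, value in kwargs.items():
            self.columns[name].append(value)

    def get_columns(self, names):
        return [self.columns[name] for name in names]

    def append_column(self, name, value):
        self.columns.setdefault(name, []).append(value)

    def extend_column(self, name, values):
        self.columns.setdefault(name, []).extend(values)

    def reset(self):
        for name in self.columns:
            self.columns[name] = []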
class PPO: def __init__( self, n_states, n_actions, hidden_size=128, alr=2e-3, clr=2e-3, gamma=0.99, epochs=4, eps_clip=0.2, atype='cat'): self.gamma = gamma self.eps_clip = eps_clip self.epochs = epochs self.atype = atype self.memory = Memory() if atype == 'cat': self.actor = CatActor(n_states, n_actions, hidden_size) else: self.actor = NumActor(n_states, n_actions, hidden_size) self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=alr) self.critic = V_Critic(n_states, 1, hidden_size) self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=clr) self.MseLoss = torch.nn.MSELoss() self.step = 0 def store_transition(self, s, a, r, s_, done, p): self.memory.states.append(s) self.memory.rewards.append(r) self.memory.actions.append(a) self.memory.probs.append(p) self.memory.next_states.append(s_) self.memory.is_terminals.append(done) def update(self, batch_size=None): if len(self.memory.actions) < batch_size: return rewards = [] discounted_reward = 0 for reward, is_terminal in zip( self.memory.rewards[::-1], self.memory.is_terminals[::-1]): if is_terminal: discounted_reward = 0 discounted_reward = reward + self.gamma * discounted_reward rewards.insert(0, discounted_reward) rewards = torch.tensor(rewards).float() old_states = torch.FloatTensor(self.memory.states).detach() old_actions = torch.stack(self.memory.actions).detach() old_probs = torch.stack(self.memory.probs).detach() rewards = (rewards - rewards.mean()) / (rewards.std()+1e-7) split_res = self.memory.split(batch_size) for _ in range(self.epochs): for idxs in split_res: split_old_states = old_states[idxs[0]:idxs[1]] split_old_actions = old_actions[idxs[0]:idxs[1]] split_old_probs = old_probs[idxs[0]:idxs[1]] split_rewards = rewards[idxs[0]:idxs[1]] dist = self.actor.choose_action(split_old_states, True) log_probs = dist.log_prob(split_old_actions.squeeze()) # diff = log_probs.squeeze() - split_old_probs.squeeze() # ratios = torch.exp(log_probs.squeeze()) / torch.exp(split_old_probs.squeeze()) ratios = torch.exp(log_probs.squeeze() - split_old_probs.squeeze()) state_values = self.critic(split_old_states).squeeze() advantages = split_rewards - state_values.detach() surr1 = ratios * advantages surr2 = ratios.clamp(1-self.eps_clip, 1+self.eps_clip) * advantages aloss = -torch.min(surr1, surr2) - 0.01 * dist.entropy() self.actor_opt.zero_grad() aloss = aloss.mean() aloss.backward() clip_gradient(self.actor_opt, 0.1) writer.add_scalar('actor_loss', aloss.item(), self.step) self.actor_opt.step() closs = 0.5*self.MseLoss(split_rewards, state_values).mean() self.critic_opt.zero_grad() closs.backward() clip_gradient(self.critic_opt, 0.1) writer.add_scalar('critic_loss', closs.item(), self.step) self.critic_opt.step() self.step += 1
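# Hedged sketch of the Memory and clip_gradient helpers this PPO class leans
# on; the behaviour is inferred from the calls above (split() returning
# (start, end) index pairs, element-wise gradient clamping) and is not the
# original implementation.
import torch


class Memory:
    def __init__(self):
        self.states, self.actions, self.probs = [], [], []
        self.rewards, self.next_states, self.is_terminals = [], [], []

    def split(self, batch_size):
        # (start, end) index pairs covering the stored transitions in order
        n = len(self.actions)
        return [(i, min(i + batch_size, n)) for i in range(0, n, batch_size)]

    def clear(self):
        for buf in (self.states, self.actions, self.probs,
                    self.rewards, self.next_states, self.is_terminals):
            buf.clear()


def clip_gradient(optimizer, grad_clip):
    # clamp every parameter gradient element-wise before the optimizer step
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None:
                param.grad.data.clamp_(-grad_clip, grad_clip)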
args = parser.parse_args()
args.output = get_output_folder(args.output, args.env)
args.use_cuda = USE_CUDA

with open(args.output + "/parameters.txt", 'w') as file:
    for key, value in vars(args).items():
        file.write("{} = {}\n".format(key, value))

# Environment
env = gym.make(args.env)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = int(env.action_space.high[0])

# Memory
memory = Memory(args.mem_size, state_dim, action_dim, args)

# Algorithm
drla = NASTD3(state_dim, action_dim, max_action, args)

# Action noise
a_noise = GaussianNoise(action_dim, sigma=args.gauss_sigma)

# Logger
fields = ["eval_score", "critic_loss", "actor_loss", "total_steps"]
logger = Logger(args.output, fields)

# Train
ite = 0
K = 5000
total_steps = 0
def main(cfg): random.seed(cfg.exp.seed) np.random.seed(cfg.exp.seed) torch.manual_seed(cfg.exp.seed) torch.backends.cudnn.deterministic = cfg.exp.torch_deterministic # so that the environment automatically resets env = SyncVectorEnv([ lambda: RecordEpisodeStatistics(gym.make('CartPole-v1')) ]) actor, critic = Actor(), Critic() actor_optim = Adam(actor.parameters(), eps=1e-5, lr=cfg.params.actor_lr) critic_optim = Adam(critic.parameters(), eps=1e-5, lr=cfg.params.critic_lr) memory = Memory(mini_batch_size=cfg.params.mini_batch_size, batch_size=cfg.params.batch_size) obs = env.reset() global_rewards = [] NUM_UPDATES = (cfg.params.total_timesteps // cfg.params.batch_size) * cfg.params.epochs cur_timestep = 0 def calc_factor(cur_timestep: int) -> float: """Calculates the factor to be multiplied with the learning rate to update it.""" update_number = cur_timestep // cfg.params.batch_size total_updates = cfg.params.total_timesteps // cfg.params.batch_size fraction = 1.0 - update_number / total_updates return fraction actor_scheduler = LambdaLR(actor_optim, lr_lambda=calc_factor, verbose=True) critic_scheduler = LambdaLR(critic_optim, lr_lambda=calc_factor, verbose=True) while cur_timestep < cfg.params.total_timesteps: # keep playing the game obs = torch.as_tensor(obs, dtype=torch.float32) with torch.no_grad(): dist = actor(obs) action = dist.sample() log_prob = dist.log_prob(action) value = critic(obs) action = action.cpu().numpy() value = value.cpu().numpy() log_prob = log_prob.cpu().numpy() obs_, reward, done, info = env.step(action) if done[0]: tqdm.write(f'Reward: {info[0]["episode"]["r"]}, Avg Reward: {np.mean(global_rewards[-10:]):.3f}') global_rewards.append(info[0]['episode']['r']) wandb.log({'Avg_Reward': np.mean(global_rewards[-10:]), 'Reward': info[0]['episode']['r']}) memory.remember(obs.squeeze(0).cpu().numpy(), action.item(), log_prob.item(), reward.item(), done.item(), value.item()) obs = obs_ cur_timestep += 1 # if the current timestep is a multiple of the batch size, then we need to update the model if cur_timestep % cfg.params.batch_size == 0: for epoch in tqdm(range(cfg.params.epochs), desc=f'Num updates: {cfg.params.epochs * (cur_timestep // cfg.params.batch_size)} / {NUM_UPDATES}'): # sample a batch from memory of experiences old_states, old_actions, old_log_probs, old_rewards, old_dones, old_values, batch_indices = memory.sample() old_log_probs = torch.tensor(old_log_probs, dtype=torch.float32) old_actions = torch.tensor(old_actions, dtype=torch.float32) advantage = calculate_advantage(old_rewards, old_values, old_dones, gae_gamma=cfg.params.gae_gamma, gae_lambda=cfg.params.gae_lambda) advantage = torch.tensor(advantage, dtype=torch.float32) old_rewards = torch.tensor(old_rewards, dtype=torch.float32) old_values = torch.tensor(old_values, dtype=torch.float32) # for each mini batch from batch, calculate advantage using GAE for mini_batch_index in batch_indices: # remember: Normalization of advantage is done on mini batch, not the entire batch advantage[mini_batch_index] = (advantage[mini_batch_index] - advantage[mini_batch_index].mean()) / (advantage[mini_batch_index].std() + 1e-8) dist = actor(torch.tensor(old_states[mini_batch_index], dtype=torch.float32).unsqueeze(0)) # actions = dist.sample() log_probs = dist.log_prob(old_actions[mini_batch_index]).squeeze(0) entropy = dist.entropy().squeeze(0) log_ratio = log_probs - old_log_probs[mini_batch_index] ratio = torch.exp(log_ratio) with torch.no_grad(): # approx_kl = ((ratio-1)-log_ratio).mean() approx_kl = 
((old_log_probs[mini_batch_index] - log_probs)**2).mean() wandb.log({'Approx_KL': approx_kl}) actor_loss = -torch.min( ratio * advantage[mini_batch_index], torch.clamp(ratio, 1 - cfg.params.actor_loss_clip, 1 + cfg.params.actor_loss_clip) * advantage[mini_batch_index] ).mean() values = critic(torch.tensor(old_states[mini_batch_index], dtype=torch.float32).unsqueeze(0)).squeeze(-1) returns = old_values[mini_batch_index] + advantage[mini_batch_index] critic_loss = torch.max( (values - returns)**2, (old_values[mini_batch_index] + torch.clamp( values - old_values[mini_batch_index], -cfg.params.critic_loss_clip, cfg.params.critic_loss_clip ) - returns )**2 ).mean() # critic_loss = F.mse_loss(values, returns) wandb.log({'Actor_Loss': actor_loss.item(), 'Critic_Loss': critic_loss.item(), 'Entropy': entropy.mean().item()}) loss = actor_loss + 0.25 * critic_loss - 0.01 * entropy.mean() actor_optim.zero_grad() critic_optim.zero_grad() loss.backward() nn.utils.clip_grad_norm_(actor.parameters(), cfg.params.max_grad_norm) nn.utils.clip_grad_norm_(critic.parameters(), cfg.params.max_grad_norm) actor_optim.step() critic_optim.step() memory.reset() actor_scheduler.step(cur_timestep) critic_scheduler.step(cur_timestep) y_pred, y_true = old_values.cpu().numpy(), (old_values + advantage).cpu().numpy() var_y = np.var(y_true) explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y wandb.log({'Explained_Var': explained_var}) if cfg.exp.save_weights: torch.save(actor.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/actor.pth')) torch.save(critic.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/critic.pth'))
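# Hedged sketch of the calculate_advantage helper used in the update loop
# above (assumed signature; the bootstrap value after the final step is taken
# to be zero). It is the standard backwards GAE recursion
# delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t).
import numpy as np


def calculate_advantage(rewards, values, dones, gae_gamma=0.99, gae_lambda=0.95):
    advantages = np.zeros(len(rewards), dtype=np.float32)
    last_adv = 0.0
    next_value = 0.0
    for t in reversed(range(len(rewards))):
        mask = 1.0 - float(dones[t])
        delta = rewards[t] + gae_gamma * next_value * mask - values[t]
        last_adv = delta + gae_gamma * gae_lambda * mask * last_adv
        advantages[t] = last_adv
        next_value = values[t]
    return advantages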
def train(): numAgent = 10 # multiple agents are running synchronously. # each agent has a different type with different properties. # Only one network is created, different agent gets their # own behavior according to the embedding input. numGame = 20 # multiple games running simultaneously. print('agent count:', numAgent) print('Env num:', numGame) env = {} for game in range(numGame): env[game] = miniDotaEnv(args, numAgent) # initialize the neural networks. # use a single network to share the knowledge. net = ac(args) if not args.cpuSimulation: net = net.to(device) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) net.load_state_dict(ckpt['net']) observations, lastDone = {}, {} for game in range(numGame): observations[game] = env[game].reset(0)[ 'observations'] # get initial state. lastDone[game] = [ False ] * 10 # to record whether game is done at the previous step. optimizer = optim.Adam(net.parameters(), lr=args.lr) for iteration in range(args.max_iter): # playing-training iteration. start = time.time() print() print('Start iteration %d ..' % iteration) if args.cpuSimulation: net = net.cpu() net.eval() # switch to evaluation mode. memory = [] for i in range(numGame): memory.append([Memory() for j in range(numAgent)]) # memory is cleared at every iter so only the current iteration's samples are used in training. # the separation of memory according to game is necessary as they # need to be processed separate for each game. steps = 0 teamscore = 0 # only for game 0. record = [] # record the states for visualization. gameEnd = np.zeros(numGame).astype(bool) while steps <= args.time_horizon: # loop for one game. if np.all(gameEnd): break steps += 1 stateList = [] for game in range(numGame): for agent in range(numAgent): stateList.append( np.expand_dims(observations[game][agent], axis=0)) stateCombined = np.concatenate(stateList, axis=0) # concatenate the states of all games and process them by the network together. with torch.no_grad(): actionDistr = net(to_tensor(stateCombined, args.cpuSimulation)) actions = get_action(actionDistr) for game in range(numGame): if not gameEnd[game]: # the following random action cannot work, because random action has too small prob density value, # leading to strange bugs. # sample = random.random() # if sample > args.randomActionRatio * (1 - min(1, iteration/1000) ): # thisGameAction = actions[10*game:10*(game+1), :] # contain actions from all agents. # check(thisGameAction) # else: # actionmove = np.random.randint(0, 3, size=(10,3)) # target = np.random.randint(0, 12, size=(10,1)) # thisGameAction = np.concatenate([actionmove, target], axis=1) thisGameAction = actions[10 * game:10 * ( game + 1 ), :] # select the actions from all agents of this env. envInfo = env[game].step( thisGameAction ) # environment runs one step given the action. nextObs = envInfo['observations'] # get the next state. if game == 0: record.append( np.concatenate([ env[game].getState(), actions[0:10, :].reshape(-1) ])) rewards = envInfo['rewards'] dones = envInfo['local_done'] # masks = list(~dones) # cut the return calculation at the done point. masks = [ True ] * numAgent # no need to mask out the last state-action pair, # because the last reward is useful to us. 
for i in range(numAgent): if not lastDone[game][i]: memory[game][i].push(observations[game][i], thisGameAction[i], rewards[i], masks[i]) lastDone[game] = dones if game == 0: teamscore += sum( [rewards[x] for x in env[game].getTeam0()]) observations[game] = nextObs gameEnd[game] = np.all(dones) if gameEnd[game]: if game == 0: print('Game 0 score: %f' % teamscore) # recordMat = np.stack(record)# stack will expand the dimension before concatenate. # draw(recordMat, iteration, env[game].getUnitRange(), 10) observations[game] = env[game].reset(iteration + 1)['observations'] lastDone[game] = [False] * 10 simEnd = time.time() print('Simulation time: %.f' % (simEnd - start)) net.train() # switch to training mode. net = net.cuda() sts, ats, returns, advants, old_policy, old_value = [], [], [], [], [], [] for game in range(numGame): for i in range(numAgent): batch = memory[game][i].sample() st, at, rt, adv, old_p, old_v = process_memory( net, batch, args) sts.append(st) ats.append(at) returns.append(rt) advants.append(adv) old_policy.append(old_p) old_value.append(old_v) sts = torch.cat(sts) ats = torch.cat(ats) returns = torch.cat(returns) advants = torch.cat(advants) old_policy = torch.cat(old_policy) old_value = torch.cat(old_value) train_model(net, optimizer, sts, ats, returns, advants, old_policy, old_value, args) # training is based on the state-action pairs from all games of the current iteration. trainEnd = time.time() print('Training time: %.f' % (trainEnd - simEnd)) if iteration % 10 == 0: model_path = os.path.join(os.getcwd(), 'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_%.3f.pth.tar' % teamscore) save_checkpoint( { 'net': net.state_dict(), 'args': args, 'score': teamscore }, filename=ckpt_path)
class BreakOutPlayer: def __init__(self, paramsManager): self.paramsManager = paramsManager self.memory = Memory( self.paramsManager.get_params()["agent"]["GOOD_MEMORIES_SIZE"], self.paramsManager.get_params()["agent"]["BAD_MEMORIES_SIZE"], self.paramsManager.get_params()["agent"]["MINI_BATCH_SIZE"], self.paramsManager.get_params()["environment"] ["FRAME_PROCESSED_WIDTH"], self.paramsManager.get_params()["environment"] ["FRAME_PROCESSED_HEIGHT"], self.paramsManager.get_params()["environment"] ["NUMBER_OF_FRAMES_TO_STACK_ON_STATE"]) print("[i] Creating main convolutional neural network") self.main_cnn = CNN() print("[i] Creating target convolutional neural network") self.target_cnn = copy.deepcopy(self.main_cnn) print("[!] Creating the agent") self.main_cnn.cuda() self.target_cnn.cuda() self.agent = Agent( self.main_cnn, self.target_cnn, self.paramsManager.get_params()["agent"]["EPSILON_MAX"], self.paramsManager.get_params()["agent"] ["NUMBER_OF_FRAMES_WITH_CONSTANT_EPSILON"], self.paramsManager.get_params()["agent"]["FIRST_EPSILON_DECAY"], self.paramsManager.get_params()["agent"] ["FRAMES_TO_FIRST_EPSILON_DECAY"], self.paramsManager.get_params()["agent"]["FINAL_EPSILON_VALUE"], self.paramsManager.get_params()["agent"] ["FRAMES_TO_FINAL_EPSILON"], self.paramsManager.get_params()["agent"] ["EXPLORATION_PROBABILITY_DURING_EVALUATION"], self.paramsManager.get_params()["agent"]["LEARNING_RATE"]) self.breakout_wrapper = BreakoutWrapper( self.paramsManager.get_params()["environment"]["NAME"], self.paramsManager.get_params()["agent"]["NO_OP_STEPS"], self.paramsManager.get_params()["environment"] ["NUMBER_OF_FRAMES_TO_STACK_ON_STATE"], self.paramsManager.get_params()["environment"] ["FRAME_PROCESSED_WIDTH"], self.paramsManager.get_params()["environment"] ["FRAME_PROCESSED_HEIGHT"], self.paramsManager.get_params()["environment"]["RENDER"]) def train(self): frame_number = 0 rewards = [] # Stores the mean rewards of each epoch epochs_means = [] # While we are training while frame_number < self.paramsManager.get_params( )["agent"]["MAX_FRAMES"]: ######################### ####### TRAINING ######## ######################### # Epoch counter epoch_counter = 0 # Stores the epoch rewards epoch_rewards = [] # While we arent on evaluation while epoch_counter < self.paramsManager.get_params( )["agent"]["EVAL_FREQUENCY"]: # Resetting the env done_life_lost = self.breakout_wrapper.reset(evaluation=False) # Other params total_episode_reward = 0 current_ale_lives = 5 perform_fire = True for i in range(self.paramsManager.get_params()["agent"] ["MAX_EPISODE_LENGTH"]): # Prints the saparetor defined on the json print(self.paramsManager.get_params()["environment"] ["SEPARATOR"]) # If its necessary to FIRE if perform_fire: chosen_action = 1 else: chosen_action = self.agent.get_action( frame_number, self.breakout_wrapper.actual_state, evaluation=False) # We take the step. A dying penalty is added by the breakout_wrapper processed_new_frame, reward, done, done_life_lost, _, info = self.breakout_wrapper.step( chosen_action, self.paramsManager.get_params()["agent"] ["DYING_REWARD"], current_ale_lives) print("[i] Action performed: ", chosen_action, ". 
Reward: ", reward, ".Frame number: ", frame_number) # If we already have rewards: if len(rewards) != 0: print("[i] Mean Training Reward: %.3f" % (sum(rewards) / len(rewards))) if len(epoch_rewards) != 0: print("[i] Mean Epoch Reward: %.3f" % (sum(epoch_rewards) / len(epoch_rewards))) frame_number += 1 epoch_counter += 1 total_episode_reward += reward if self.paramsManager.get_params()["agent"]["CLIP_REWARD"]: self.memory.store(processed_new_frame, chosen_action, self.clip_reward(reward), done_life_lost) else: self.memory.store(processed_new_frame, chosen_action, reward, done_life_lost) # If its time to learn if frame_number % self.paramsManager.get_params()["agent"][ "UPDATE_FREQUENCY"] and frame_number > self.paramsManager.get_params( )["agent"]["REPLAY_MEMORY_START_FRAME"]: losses = self.agent.learn( self.memory, self.paramsManager.get_params()["agent"]["GAMMA"], self.paramsManager.get_params()["agent"] ["MINI_BATCH_SIZE"]) if frame_number % self.paramsManager.get_params()["agent"][ "NETWORK_UPDATE_FREQ"] == 0 and frame_number > self.paramsManager.get_params( )["agent"]["REPLAY_MEMORY_START_FRAME"]: self.agent.updateNetworks() if info["ale.lives"] < current_ale_lives: perform_fire = True current_ale_lives = info["ale.lives"] elif info["ale.lives"] == current_ale_lives: perform_fire = False if done: done = False perform_fire = True break rewards.append(total_episode_reward) epoch_rewards.append(total_episode_reward) ######################### ####### SAVE INFO ####### ######################### epochs_means.append(sum(epoch_rewards) / len(epoch_rewards)) file = open("results.txt", "w") print("============ EPOCH %d FINISHED ============" % len(epochs_means)) for idx, mean in enumerate(epochs_means): print("Epoch number: %d. Mean reward: %.3f" % (idx, mean)) file.write("Epoch number: %d. Mean reward: %.3f\n" % (idx, mean)) file.close() time.sleep(10) def clip_reward(self, r): if r > 0: return 1 elif r == 0: return 0 else: return -1
                          critic_layer_sizes, grayscale=grayscale).to(device)

# Create AE Hashing model and optimizers
ae_hash = AEHash(len_AE_hashcode, 4 if stacked else 1, noise_scale,
                 saturating_weight, device=device).to(device)
ae_hash_optim = optim.Adam(ae_hash.parameters())

# Create SimHash
sim_hash = SimHash(len_AE_hashcode, len_SimHash_hashcode)

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, 'cpu')  # Put memory on cpu to save space

# Set up pixel observation preprocessing
transform = Compose([
    ToPILImage(),
    Grayscale(num_output_channels=1),  # Turn frame into grayscale
    Resize((52, 52)),
    ToTensor()
])

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
        }
        for i in range(n_actor):
            agent.store_policy('Pendulum-v0', score=fs[i], index=i)
        n += 1

        # printing iteration resume
        if debug:
            prPurple('Iteration#{}: Total steps:{} \n'.format(n, total_steps))


if __name__ == "__main__":

    # The environment
    env = gym.make("Pendulum-v0")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = int(env.action_space.high[0])

    # replay buffer
    memory = Memory(100000, state_dim, action_dim)

    # agent
    agent = DTD3(state_dim, action_dim, max_action, memory, n_actor=5)

    print("starting")
    train(agent,
          n_episodes=1000,
          max_steps=1000000,
          debug=True,
          n_eval=100,
          n_actor=5)
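# Hedged sketch of the flat, array-backed replay Memory that the TD3 variants
# above build as Memory(size, state_dim, action_dim[, args]); field and method
# names are assumptions inferred from typical off-policy buffers.
import numpy as np


class Memory:
    def __init__(self, size, state_dim, action_dim, args=None):
        self.size = size
        self.ptr, self.full = 0, False
        self.states = np.zeros((size, state_dim), dtype=np.float32)
        self.actions = np.zeros((size, action_dim), dtype=np.float32)
        self.rewards = np.zeros((size, 1), dtype=np.float32)
        self.next_states = np.zeros((size, state_dim), dtype=np.float32)
        self.dones = np.zeros((size, 1), dtype=np.float32)

    def add(self, state, action, reward, next_state, done):
        i = self.ptr
        self.states[i], self.actions[i] = state, action
        self.rewards[i], self.next_states[i], self.dones[i] = reward, next_state, done
        self.ptr = (self.ptr + 1) % self.size
        self.full = self.full or self.ptr == 0

    def sample(self, batch_size):
        high = self.size if self.full else self.ptr
        idx = np.random.randint(0, high, size=batch_size)
        return (self.states[idx], self.actions[idx], self.rewards[idx],
                self.next_states[idx], self.dones[idx])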
class ExperienceReplayDQNAgent(DQNAgent): def __init__(self): super().__init__() self.memory = Memory(1000) def remember(self, state, action, reward, next_state, done): self.memory.store((state, action, reward, next_state, done)) def replay_new(self, memory): idx, minibatch, ISWeights = memory.sample(1000) for sample in minibatch: state = sample[0][0] action = sample[0][1] reward = sample[0][2] next_state = sample[0][3] done = sample[0][4] target = reward if not done: target = reward + self.gamma * np.amax( self.model.predict(np.array([next_state]))[0]) target_f = self.model.predict(np.array([state])) memory.batch_update(idx, np.abs(target_f[0] - target)) target_f[0][np.argmax(action)] = target self.model.fit(np.array([state]), target_f, epochs=1, verbose=0) def train_short_memory(self, state, action, reward, next_state, done): target = reward if not done: target = reward + self.gamma * np.amax( self.model.predict(next_state.reshape((1, 11)))[0]) target_f = self.model.predict(state.reshape((1, 11))) target_f[0][np.argmax(action)] = target self.model.fit(state.reshape((1, 11)), target_f, epochs=1, verbose=0) def run(self, mode_file): pygame.init() counter_games = 0 score_plot = [] counter_plot = [] record = 0 while counter_games < 200: # Initialize classes game = Game(440, 440, mode_file) player1 = game.player food1 = game.food # Perform first move initialize_game(player1, game, food1, self) if game_settings['display_option']: display(player1, food1, game, record) while not game.crash: # agent.epsilon is set to give randomness to actions self.epsilon = 80 - counter_games # get old state state_old = self.get_state(game, player1, food1) # perform random actions based on agent.epsilon, or choose the action if randint(0, 200) < self.epsilon: final_move = to_categorical(randint(0, 2), num_classes=3) else: # predict action based on the old state prediction = self.model.predict(state_old.reshape((1, 11))) final_move = to_categorical(np.argmax(prediction[0]), num_classes=3) # perform new move and get new state player1.do_move(final_move, player1.x, player1.y, game, food1, self) state_new = self.get_state(game, player1, food1) # set reward for the new state reward = self.set_reward(player1, game.crash) # train short memory base on the new action and state self.train_short_memory(state_old, final_move, reward, state_new, game.crash) # store the new data into a long term memory self.remember(state_old, final_move, reward, state_new, game.crash) record = get_record(game.score, record) if game_settings['display_option']: display(player1, food1, game, record) pygame.time.wait(game_settings['speed']) self.replay_new(self.memory) counter_games += 1 print('Game', counter_games, ' Score:', game.score) score_plot.append(game.score) counter_plot.append(counter_games) self.model.save_weights('weights.hdf5') # from google.colab import files # files.download("weights.hdf5") plot_seaborn(counter_plot, score_plot)
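# Hedged sketch of the prioritized replay Memory assumed above: sample()
# returns (indices, wrapped samples, importance weights) and batch_update()
# refreshes priorities from the TD errors. This simplified list-based version
# mirrors the interface only; the original is typically SumTree-based.
import numpy as np


class Memory:
    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=0.01):
        self.capacity = capacity
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.data = []
        self.priorities = []

    def store(self, experience):
        # new samples get the current maximum priority so they are replayed soon
        max_p = max(self.priorities) if self.priorities else 1.0
        if len(self.data) >= self.capacity:
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append(experience)
        self.priorities.append(max_p)

    def sample(self, n):
        n = min(n, len(self.data))
        probs = np.array(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(len(self.data), n, p=probs)
        weights = (len(self.data) * probs[idx]) ** (-self.beta)
        weights /= weights.max()
        minibatch = [[self.data[i]] for i in idx]  # wrapped to match sample[0][k] above
        return idx, minibatch, weights

    def batch_update(self, idx, abs_errors):
        for i, err in zip(np.atleast_1d(idx), np.atleast_1d(abs_errors)):
            self.priorities[int(i)] = float(err) + self.eps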
class Trainer:
    def __init__(self, model, args):
        self.model = model
        self.args = args
        self.iteration = 0
        self.memory = Memory()
        if self.args.override or not os.path.isdir(
                self.args.output_dir) or self.args.output_dir == 'tmp':
            mkdir(self.args.output_dir, wipe=True)

        # initialize logging and model saving
        if self.args.output_dir is not None:
            self.logger = Logger(
                os.path.join(self.args.output_dir, 'train_log.json'))
        else:
            self.logger = Logger()

    # a wrapper for model.forward to feed inputs as a list and get outputs as a list
    def evaluate_model(self, inputs):
        output = self.model.get_model().forward(*inputs)
        return list(output) if isinstance(output, tuple) else [output]

    def train(self):
        # load after a forward call for dynamic models
        batched_data, _, _ = load_samples(self.model.get_loader(),
                                          self.model.cuda, self.args.batch_size)
        self.evaluate_model(batched_data)
        self.iteration = load(self.args.output_dir, self.model.get_model(),
                              self.iteration, self.model.get_optimizer())

        for i in range(self.iteration, self.iteration + self.args.iterations):
            #################### LOAD INPUTS ############################
            # TODO: make a separate timer class if more complex timings arise
            t0 = time.time()
            batched_data, batched_targets, sample_array = load_samples(
                self.model.get_loader(), self.model.cuda, self.args.batch_size)
            self.logger.set('timing.input_loading_time', time.time() - t0)
            #############################################################

            #################### FORWARD ################################
            t1 = time.time()
            outputs = self.evaluate_model(batched_data)
            self.logger.set('timing.forward_pass_time', time.time() - t1)
            #############################################################

            #################### BACKWARD AND SGD #######################
            t2 = time.time()
            loss = self.model.get_lossfn()(*(outputs + batched_targets))
            self.model.get_optimizer().zero_grad()
            loss.backward()
            self.model.get_optimizer().step()
            self.logger.set('timing.loss_backward_update_time', time.time() - t2)
            #############################################################

            #################### LOGGING, VIZ AND SAVE ##################
            print('iteration: {0} loss: {1}'.format(self.iteration, loss.data.item()))

            if self.args.compute_graph and i == self.iteration:
                compute_graph(loss,
                              output_file=os.path.join(self.args.output_dir,
                                                       self.args.compute_graph))
            if self.iteration % self.args.save_iter == 0:
                save(self.model.get_model(), self.model.get_optimizer(),
                     self.iteration, self.args.output_dir)

            self.logger.set('time', time.time())
            self.logger.set('date', str(datetime.now()))
            self.logger.set('loss', loss.data.item())
            self.logger.set('iteration', self.iteration)
            self.logger.set('resident_memory',
                            str(self.memory.resident(scale='mB')) + 'mB')
            self.logger.dump_line()
            self.iteration += 1

            if self.args.visualize_iter > 0 and self.iteration % self.args.visualize_iter == 0:
                Batcher.debatch_outputs(sample_array, outputs)
                # iterate explicitly so the visualization side effects run
                # under Python 3 (map() is lazy there)
                for sample in sample_array:
                    sample.visualize({'title': random_str(5)})
                ImageVisualizer().dump_image(
                    os.path.join(self.args.output_dir,
                                 'visualizations_{0:08d}.svg'.format(self.iteration)))
class ActorCritic(Agent): def __init__(self, env: Env, policy_lr: float, value_lr: float, gamma: float = 0.99, value_iter=50, policy_layers=(128, 128), value_layers=(128, 128), verbose=False, save=True, policy_path=None, value_path=None): super().__init__(env, verbose, save) self.gamma = gamma if self.action_space.discrete: policy_head = nn.Softmax(dim=-1) else: policy_head = nn.Tanh() self.policy_path = policy_path self.value_path = value_path self.policy_model = MLP(self.state_space.shape[0], self.action_space.shape[0], policy_layers, policy_head) self.value_model = MLP(self.state_space.shape[0], 1, value_layers, None) self.policy_optimizer = optim.Adam(self.policy_model.parameters(), lr=policy_lr) self.value_optimizer = optim.Adam(self.value_model.parameters(), lr=value_lr) self.value_loss = nn.MSELoss() self.reset() self.counter = 0 self.value_iter = value_iter def setup_memory(self) -> None: columns = [ "states", "next_states", "actions", "log_probs", "rewards", "done" ] self.episode_memory = Memory(columns) self.epoch_memory = Memory(columns) def act(self, state: List, train=True) -> Tuple: state = torch.from_numpy(state).type(torch.FloatTensor) action_probs = self.policy_model(state) distribution = self.action_space.distribution(action_probs) action = distribution.sample() if train: return action.data.numpy(), distribution.log_prob(action) else: return torch.argmax(action_probs).data.numpy(), def update(self) -> None: states, next_states, rewards, cumulated_rewards, log_probs, done = self.epoch_memory.get_columns( [ "states", "next_states", "rewards", "cumulated_rewards", "log_probs", "done" ]) # Compute the advantge for the previous Value function with torch.no_grad(): advantages = torch.Tensor(rewards) + ( self.gamma * (1 - torch.tensor(done, dtype=int)) * self.value_model(torch.Tensor(next_states)).squeeze() - self.value_model(torch.Tensor(states)).squeeze()) # Train the value function a cetrain number of iterations for _ in range(int(self.value_iter) + 1): values = self.value_model(torch.Tensor(states)).squeeze() value_loss = self.value_loss(values, torch.Tensor(cumulated_rewards)) self.value_optimizer.zero_grad() value_loss.backward() torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1) self.value_optimizer.step() self.value_iter *= 0.95 print(f"Value Loss: {value_loss.item()}") # Compute the policy loss using th previous value function policy_loss = -torch.sum(torch.mul(torch.stack(log_probs), advantages)) / self.counter print(f"Policy Loss: {policy_loss.item()}") self.policy_optimizer.zero_grad() policy_loss.backward() torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1) self.policy_optimizer.step() self.reset() def save_model(self) -> None: torch.save(self.policy_model.state_dict(), self.policy_path) torch.save(self.value_model.state_dict(), self.value_path) def load_model(self, policy_path: str, value_path: str) -> None: self.policy_model.load_state_dict(torch.load(policy_path)) self.value_model.load_state_dict(torch.load(value_path)) self.policy_model.eval() self.value_model.eval() def setup_schedulers(self, n_epochs: int) -> None: policy_scheduler = ExponentialLR(self.policy_optimizer, 0.97) value_scheduler = ExponentialLR(self.value_optimizer, 0.97) self.schedulers.append(policy_scheduler) self.schedulers.append(value_scheduler) def cumulate_rewards(self) -> None: cumulated_reward = 0 cumulated_rewards = [] rewards, = self.episode_memory.get_columns(["rewards"]) for i in range(len(rewards) - 1, -1, -1): cumulated_reward = self.gamma * 
cumulated_reward + rewards[i] cumulated_rewards.append(cumulated_reward) self.episode_memory.extend_column("cumulated_rewards", cumulated_rewards[::-1])
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes).to(device)  # Policy network

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters())

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0
}
def Game(max_ep_len=1000, num_frames=4): global exit_game global actions env = gym.make('CarRacing-v0') state_dim = env.observation_space.shape action_dim = env.action_space.shape print(f"State: {state_dim}") print(f"Action: {action_dim}") # set interrupts env.reset() env.viewer.window.on_key_press = key_press env.viewer.window.on_key_release = key_release # make global actions array actions = np.zeros(4, dtype=np.float32) # mem memory = Memory() memory.create(state_dim, action_dim) # logger ep_ret_log = [] # init environment obs, ep_ret, ep_len, epoch = env.reset(), 0, 0, 0 obs = np.expand_dims(obs, axis=0) state_stack = np.repeat(obs, num_frames, axis=0) print(state_stack.shape) print(state_stack.dtype) # main loop while exit_game == False: # render window env.render() # take action obs2, r, d, _ = env.step(actions[:3]) obs2 = np.expand_dims(obs2, axis=0) state_stack = np.append(state_stack[1:], obs2, axis=0) # statistics ep_ret += r ep_len += 1 # Ignore the 'done' signal d = False if ep_len == max_ep_len else d # store in memory memory.add(state_stack, np.array(actions[:3]), r, d) # End of episode if d or (ep_len == max_ep_len): print(f"Epoch: {epoch}, EpRet: {ep_ret}, EpLen: {ep_len}, ReplayBuff: {len(memory)}") # if exists statistical data if len(ep_ret_log) > 0: log = np.array(ep_ret_log) print("AvgEpRet:", log.mean()) print("StdEpRet:", log.std()) print("MaxEpRet:", log.max()) print("MinEpRet:", log.min()) print() ep_ret_log.append(ep_ret) obs, ep_ret, ep_len = env.reset(), 0, 0 obs = np.expand_dims(obs, axis=0) state_stack = np.repeat(obs, num_frames, axis=0) epoch += 1 print('\n') # save the dataset memory.save()
def main():
    # initialize the game
    env = gym.make('Pendulum-v0').unwrapped
    print(env.observation_space)
    print(env.observation_space.high)
    print(env.observation_space.low)
    print(env.action_space)

    # import hyper parameters
    args = init_hyper_para()

    # randomly initialize the critic network
    state_dim = env.reset().shape[0]
    action_dim = env.action_space.shape[0]
    # if we have a saved model, load it
    if os.path.exists('/home/likang/PycharmProjects/myddpg/bin/Models/critic.ckpt'):
        critic_net = torch.load('/home/likang/PycharmProjects/myddpg/bin/Models/critic.ckpt')
    else:
        # initialize the model
        critic_net = net.CriticNetwork(
            state_dim=state_dim, action_dim=action_dim).to(
                device)  # need to init params according to the gym game

    # randomly initialize the actor network (also called the policy network)
    if os.path.exists('/home/likang/PycharmProjects/myddpg/bin/Models/actor.ckpt'):
        actor_net = torch.load('/home/likang/PycharmProjects/myddpg/bin/Models/actor.ckpt')
    else:
        actor_net = net.ActorNetwork(state_dim=state_dim,
                                     action_dim=action_dim).to(device)

    # initialize optimizers
    optimizer_critic = opt.Adam(critic_net.parameters(), lr=0.001)
    optimizer_actor = opt.Adam(actor_net.parameters(), lr=0.001)

    # initialize the target critic network as a copy of the critic network
    target_critic_net = copy.deepcopy(critic_net)
    # initialize the target actor network as a copy of the actor network
    target_actor_net = copy.deepcopy(actor_net)

    # init the memory buffer
    memory = Memory(args.capacity)

    # initialize a random process N for action exploration
    ounoise = OUNoise(env.action_space.shape[0])  # init random process

    # main training loop
    for ep in range(args.num_ep):
        print(["ep: ", ep])
        # reset the random process
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end - ep) / args.exploration_end + args.final_noise_scale
        ounoise.reset()

        # initialize a state s1
        state = env.reset()
        # initialize the state as a 2-D tensor here
        state = torch.tensor([state], dtype=torch.float32).to(device)

        for t in range(MAX_STEP):
            print(['time step: ', t])
            # select an action according to the actor network (also called the policy network)
            action = actor_net.select_action(state, ounoise)

            # execute the action, get the new state s_{i+1}
            # and a reward from the environment
            next_state, reward, done, _ = env.step([action.item()])

            # store the transition {s_i, a_i, r_i, s_{i+1}} into memory
            next_state = torch.tensor([next_state], device=device, dtype=torch.float32)
            reward = torch.tensor([[reward]], device=device, dtype=torch.float32)
            memory.push(state, action, reward, next_state)
            state = next_state
            # print([state, action, reward, next_state])
            del action, reward, next_state

            # sample a batch of transitions
            # (s_i, a_i, r_i, s_{i+1}) in Algorithm 1 of DDPG
            transitions = memory.sample(args.batch_size)
            s1 = torch.cat([tran.state for tran in transitions])
            s2 = torch.cat([tran.next_state for tran in transitions])
            r1 = torch.cat([tran.reward for tran in transitions])
            a1 = torch.cat([tran.action for tran in transitions])
            update_critic_net(s1, s2, r1, a1, target_actor_net, target_critic_net,
                              critic_net, optimizer_critic, args)

            # update the actor (policy) network
            update_actor_net(s1, actor_net, critic_net, optimizer_actor)

            # update the target critic network
            # theta^{Q'}, see Algorithm 1 of DDPG
            for target_param, source_param in zip(target_critic_net.parameters(),
                                                  critic_net.parameters()):
                target_param.data.copy_(args.tau * source_param +
                                        (1 - args.tau) * target_param)

            # update the target actor network
            # theta^{mu'}, see Algorithm 1 of DDPG
            for target_param, source_param in zip(target_actor_net.parameters(),
                                                  actor_net.parameters()):
                target_param.data.copy_(args.tau * source_param +
                                        (1 - args.tau) * target_param)

            # show image
            plt.imshow(env.render('rgb_array'))
            time.sleep(0.001)

            # finish
            if done:
                break
            del transitions
            gc.collect()

        if ep % 10 == 0:
            # save model
            torch.save(critic_net, './Models/' + 'critic.ckpt')
            torch.save(actor_net, './Models/' + 'actor.ckpt')
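# Hedged sketch of the transition Memory the DDPG loop above pushes into;
# assumed to mirror the classic ring-buffer pattern with a Transition
# namedtuple whose fields match the attribute access in the sampling code.
import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))


class Memory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = Transition(state, action, reward, next_state)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)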
# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)  # Value network

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters())
valuenet_optimizer = optim.Adam(value_net.parameters())

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0,
    "value net loss": []
print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

states = running_state(env_info.vector_observations)

actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr)
critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr,
                          weight_decay=args.l2_rate)

scores = []
score_avg = 0

for iter in range(args.max_iter):
    actor.eval(), critic.eval()
    memory = [Memory() for _ in range(num_agent)]

    steps = 0
    score = 0
    while steps < args.time_horizon:
        steps += 1
        mu, std, _ = actor(to_tensor(states))
        actions = get_action(mu, std)
        env_info = env.step(actions)[default_brain]

        next_states = running_state(env_info.vector_observations)
        rewards = env_info.rewards
        dones = env_info.local_done
        masks = list(~(np.array(dones)))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model. Two value nets
policy_net = PolicyNet(layer_sizes).to(device)  # Policy network
value_net_ex = ValueNet(input_size).to(device)  # Value network for extrinsic reward
value_net_in = ValueNet(input_size + 1 + output_size).to(
    device)  # One additional input unit to indicate trajectory number

# Set up optimizer
valuenet_in_optimizer = optim.Adam(value_net_in.parameters())
valuenet_ex_optimizer = optim.Adam(value_net_ex.parameters())

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, device=device)


# Define observation normalization function. Normalize state vector values to range [-1., 1.]
def state_normalize(s):
    # Obtain environment observation space limits
    high = env.observation_space.high
    low = env.observation_space.low
    return ((s - low) / (high - low)) * 2 - 1


# Create hashing function
simhash = SimHash(input_size, len_hashcode,
                  preprocessor=state_normalize if use_preprocessor else None)
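# Hedged sketch of the SimHash used above (constructor signature follows the
# call just made; the count/bonus method names are assumptions). A fixed random
# Gaussian projection followed by the sign function gives a binary code whose
# visit counts can drive a count-based exploration bonus.
import numpy as np


class SimHash:
    def __init__(self, input_size, len_hashcode, preprocessor=None):
        self.A = np.random.normal(size=(len_hashcode, input_size))
        self.preprocessor = preprocessor
        self.counts = {}

    def hash(self, state):
        if self.preprocessor is not None:
            state = self.preprocessor(state)
        return tuple((self.A @ np.asarray(state) > 0).astype(int))

    def count(self, state):
        # increment and return the visit count of the state's hash bucket
        code = self.hash(state)
        self.counts[code] = self.counts.get(code, 0) + 1
        return self.counts[code]

    def bonus(self, state, beta=0.01):
        # count-based exploration bonus beta / sqrt(n(code))
        return beta / np.sqrt(self.count(state))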
def marl_test(config): experiment_name = config.setdefault("experiment_name", "") time_slots = config.setdefault("time_slots", 10000) simulations = config.setdefault("simulations", 3) memory_size = config.setdefault("memory_size", 1200) pretrain_length = config.setdefault("pretrain_length", 6) step_size = config.setdefault("step_size", 5) save_freq = config.setdefault("save_freq", 1000) save_results = config.setdefault("save_results", True) save_model = config.setdefault("save_model", False) load_model = config.setdefault("load_model", False) load_slot = config.setdefault("load_slot", 4999) training = config.setdefault("training", False) episode_interval = config.setdefault("episode_interval", 25) explore_step = config.setdefault("explore", 2000) greedy_step = config.setdefault("greedy", 20000) training_stop = config.setdefault("training_stop", 20000) # Stop the training after these time step. train_after_episode = config.setdefault("train_after_episode", False) # Train after each episode in stead of training after each time slot. global_reward_avg = config.setdefault("global_reward_avg", False) # Train after each episode in stead of training after each time slot. save_positions = config.setdefault("save_positions", False) # Train after each episode in stead of training after each time slot. enable_channel = config.setdefault("enable_channel", False) # Train after each episode in stead of training after each time slot. batch_size = config["RLAgent"]["batch_size"] ia_penalty_enable = config.setdefault("ia_penalty_enable", False) ia_averaging = config.setdefault("ia_averaging", False) for simulation in range(simulations): print("-=-=-=-=-=-=-=-=-=-=-= experiment_name: " + experiment_name + " SIMULATION " + str(simulation + 1) + " =-=-=-=-=-=-=-=-=-=-=-") # Initialize the env. env = TestEnv(**config["EnvironmentTest"]) if ia_penalty_enable: ia_penalty_threshold = config.setdefault("ia_penalty_threshold", 5) ia_penalty_value = config.setdefault("ia_penalty_value", -10) ia_penalty_counter = {} previous_actions = {} # store the previous taken action by the UE. 
num_users = env.get_total_users() for user in range(num_users): ia_penalty_counter[user] = 0 previous_actions[user] = -1 # Initialize the agen mainDRQN = DRQN(env, name=experiment_name, total_episodes=time_slots/episode_interval, **config["RLAgent"]) #mainDRQN = DeepRecurrentQNetwork(env=env, name=experiment_name, **config["RLAgent"]) if load_model: print("Load model DRQN time step " + str(load_slot)) save_dir = "save_model/" + "test/" mainDRQN.load_model(save_dir, load_slot) # this is experience replay buffer(deque) from which each batch will be sampled and fed to the neural network for training memory = Memory(max_size=memory_size) log_reward_slot = [] log_actions_slot = [] log_ia_slot = [] sum_ia_prev = 0 log_x_positions = [] start_time = time.time() episode = 0 # Used to update the greediness of the algorithm # cumulative reward cum_r = [0] cum_r_slots = [0] # cumulative collision cum_collision = [0] cum_collision_slots = [0] # this is our input buffer which will be used for predicting next Q-values history_input = deque(maxlen=step_size) # env.network.reset_ia() # to sample random actions for each user action = env.sample() #obs = env.step(action) obs, rews = env.my_step(action, 0) rews = list(rews) state = env.obtain_state(obs, action, rews) # reward = [i[1] for i in obs[:num_users]] num_users = env.get_total_users() num_channels = env.get_action_space() ############################################## for ii in range(pretrain_length*step_size*5): action = env.sample() if enable_channel: obs, reward = env.my_step_ch(action, 0) # obs is a list of tuple with [(ACK,REW) for each user ,(CHANNEL_RESIDUAL_CAPACITY_VECTOR)] else: #obs, reward = env.my_step( # action, 0) # obs is a list of tuple with [(ACK,REW) for each user ,(CHANNEL_RESIDUAL_CAPACITY_VECTOR)] obs, reward = env.my_step_design(action, 0) # obs is a list of tuple with [[(ACK,REW) for each user] ,CHANNEL_RESIDUAL_CAPACITY_VECTOR] next_state = env.obtain_state(obs, action, rews) #next_state = env.state_generator(action, obs) memory.add((state, action, rews, next_state)) state = next_state history_input.append(state) ############################################## # TODO: now load the positions env.load_saved_positions() for time_step in range(time_slots): #initializing action vector action = np.zeros([num_users], dtype=np.int32) #converting input historskyy into numpy array # TODO: enable below for lstm state_vector = np.array(history_input) # LSTM # state_vector = state # DQN for each_user in range(num_users): #action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector, time_slot=time_step) if time_step < explore_step and not load_model: # and 0: action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector, episode=episode, policy="explore") elif time_step < greedy_step and not load_model: # and 0: action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector, episode=episode) else: action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector, episode=episode, policy="greedy") # taking action as predicted from the q values and receiving the observation from the envionment # obs = env.step(action) # obs is a list of tuple with [(ACK,REW) for each user ,(CHANNEL_RESIDUAL_CAPACITY_VECTOR)] if save_positions: user_pos = env.get_x_pos() log_x_positions.append(user_pos) if enable_channel: obs, reward = env.my_step_ch(action, time_step) # obs is a list of tuple with [(ACK,REW) for each user ,(CHANNEL_RESIDUAL_CAPACITY_VECTOR)] else: obs, reward = 
env.my_step(action, time_step) # obs is a list of tuple with [(ACK,REW) for each user ,(CHANNEL_RESIDUAL_CAPACITY_VECTOR)] #obs, reward = env.my_step_design(action, time_step) # TODO: update the env topology after each step. log_actions_slot.append(action) ia = env.network.get_information_age(time_step) ia_sum = calculate_ia_penalty(ia) log_ia_slot.append(ia) if ia_averaging: # ia based penalty to the reward ia_penalty = 0 if ia_sum > sum_ia_prev: ia_penalty = -1 elif ia_sum < sum_ia_prev: ia_penalty = 1 sum_ia_prev = ia_sum # Generate next state from action and observation # next_state = env.state_generator(action, obs) used for DQN next_state = env.obtain_state(obs, action, reward, episode, mainDRQN.get_eps()) # print (next_state) # reward for all users given by environment #reward = [i[1] for i in obs[:num_users]] # calculating sum of rewards sum_r = np.sum(reward) #calculating cumulative reward cum_r.append(cum_r[-1] + sum_r) cum_r_slots.append(cum_r_slots[-1] + sum_r) #If NUM_CHANNELS = 2 , total possible reward = 2 , therefore collision = (2 - sum_r) or (NUM_CHANNELS - sum_r) collision = num_channels - sum_r #calculating cumulative collision cum_collision.append(cum_collision[-1] + collision) cum_collision_slots.append(cum_collision_slots[-1] + collision) ############################# # for co-operative policy we will give reward-sum to each user who have contributed # to play co-operatively and rest 0 # NOTE: I think, I do not need that part since I already use positive and negative reward. for i in range(len(reward)): # for each user we have this. #if reward[i] > 0: if ia_averaging: # add penalty based on the direction of the Information age. reward[i] += ia_penalty if ia_penalty_enable: if reward[i] < 1 and action[i] == previous_actions[i]: ia_penalty_counter[i] += 1 else: ia_penalty_counter[i] = 0 if ia_penalty_counter[i] > ia_penalty_threshold: reward[i] = ia_penalty_value previous_actions[i] = action[i] if global_reward_avg: reward[i] = reward[i] + sum_r/len(reward) # Add the average total reward to each UE. ############################# #reward = reward*2 # Add the average total reward to each UE. log_reward_slot.append(sum_r) # print (reward) # print("EPOCH " + str(time_step)) # add new experiences into the memory buffer as (state, action , reward , next_state) for training memory.add((state, action, reward, next_state)) state = next_state #add new experience to generate input-history sequence for next state history_input.append(state) # Start training. 
if not train_after_episode: if time_step < training_stop and training: #and not load_model: mainDRQN.train(memory, time_step) if time_step%(episode_interval) == episode_interval-1: print("Time step " + str(time_step) + " epsilon " + str(mainDRQN.get_eps()) + " cum Collison " + str(cum_collision[episode_interval]) + " sum reward " + str(cum_r[episode_interval]) + " total time " + str(time.time()-start_time) ) cum_r = [0] cum_collision = [0] episode += 1 # Updates the velocity of the vehicles if activated env.update_velocity() # ia = env.network.get_information_age(time_step) if train_after_episode and time_step > (batch_size+10) and training: mainDRQN.train(memory, time_step) if time_step%save_freq == save_freq-1: # Save the collisions if save_results: print("save results for timestep ", time_step + 1) save_dir = "save_results/" + "test/" save_dir = save_dir + experiment_name if not os.path.isdir(save_dir): os.makedirs(save_dir) # filename = save_dir + "/collisions" + "_" + str(time_step) +"_sim"+str(simulation) # np.save(filename, np.asarray(cum_collision_slots)) filename = save_dir + "/rewards" + "_sim"+str(simulation) np.save(filename, np.asarray(log_reward_slot)) filename = save_dir + "/actions" + "_sim"+str(simulation) np.save(filename, np.asarray(log_actions_slot)) # filename = save_dir + "/time_step" + "_" + str(time_step)+"_sim"+str(simulation) # np.save(filename, np.asarray(str(time.time()-start_time))) filename = save_dir + "/positions" + "_sim"+str(simulation) np.save(filename, np.asarray(log_x_positions)) #filename = save_dir + "/ia" + "_sim"+str(simulation) #np.save(filename, np.asarray(log_ia_slot)) #"_" + str(time_step)+ if save_model: print("save model for timestep ", time_step + 1) save_dir = "save_model/" + "test/" #save_dir = save_dir mainDRQN.save_model(save_dir, time_step,simulation)