memory = deque()
steps = 0
scores = []

while steps < 2048:
    episodes += 1
    state = env.reset()
    state = running_state(state)
    score = 0

    for _ in range(10000):
        if args.render:
            env.render()

        steps += 1
        mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
        action = get_action(mu, std, not args.categorical)[0]
        next_state, reward, done, _ = env.step(action)
        next_state = running_state(next_state)

        if done:
            mask = 0
        else:
            mask = 1

        memory.append([state, action, reward, mask])
        score += reward
        state = next_state

        if done:
            break
memory = deque()
steps = 0
scores = []

while steps < 2048:
    episodes += 1
    state = env.reset()
    state = running_state(state)
    score = 0

    for _ in range(10000):
        if args.render:
            env.render()

        steps += 1
        mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
        action = get_action(mu, std)[0]
        next_state, reward, done, _ = env.step(action)
        next_state = running_state(next_state)

        if done:
            mask = 0
        else:
            mask = 1

        memory.append([state, action, reward, mask])
        score += reward
        state = next_state

        if done:
            break
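# The rollout loops above and below sample actions through a get_action helper
# whose exact signature varies between the snippets (some pass an extra flag or
# a full action distribution). A minimal sketch for the continuous case,
# assuming the actor returns the mean and standard deviation of a diagonal
# Gaussian; this is an illustrative stand-in, not the repository's own helper:
import torch


def get_action(mu, std):
    # Sample an action from N(mu, std^2) and return it as a NumPy array
    # that env.step() can consume.
    action = torch.normal(mu, std)
    return action.detach().numpy()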
scores = []
score_avg = 0

for iter in range(args.max_iter):
    actor.eval(), critic.eval()
    memory = [Memory() for _ in range(num_agent)]

    steps = 0
    score = 0

    while steps < args.time_horizon:
        steps += 1
        mu, std, _ = actor(to_tensor(states))
        actions = get_action(mu, std)

        env_info = env.step(actions)[default_brain]
        next_states = running_state(env_info.vector_observations)
        rewards = env_info.rewards
        dones = env_info.local_done
        masks = list(~(np.array(dones)))

        for i in range(num_agent):
            memory[i].push(states[i], actions[i], rewards[i], masks[i])

        score += rewards[0]
        states = next_states

        if dones[0]:
            scores.append(score)
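# The mask stored with each transition (0 or False when an episode ends, 1 or
# True otherwise) is what keeps the return and advantage computation from
# leaking across episode boundaries. A minimal sketch of that idea, with a
# hypothetical gamma; the actual computation lives in train_model /
# process_memory and may differ in detail:
import numpy as np


def discounted_returns(rewards, masks, gamma=0.99):
    rewards = np.asarray(rewards, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        # masks[t] == 0 resets the running return at an episode boundary.
        running = rewards[t] + gamma * running * masks[t]
        returns[t] = running
    return returns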
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK. ZFilter N {}".format(running_state.rs.n))

    episodes = 0

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1
                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
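# main() checkpoints the observation normalizer through running_state.rs.n,
# rs.mean and rs.sum_square. A minimal sketch of such a Welford-style running
# filter, written to match those attribute names; the repository's ZFilter may
# differ in details such as update and clipping options:
import numpy as np


class RunningStat:
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)  # running sum of squared deviations

    def push(self, x):
        x = np.asarray(x)
        self.n += 1
        if self.n == 1:
            self.mean = x.astype(np.float64)
        else:
            old_mean = self.mean.copy()
            self.mean = old_mean + (x - old_mean) / self.n
            self.sum_square = self.sum_square + (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        var = self.sum_square / (self.n - 1) if self.n > 1 else np.square(self.mean)
        return np.sqrt(var)


class ZFilter:
    # Normalizes observations to roughly zero mean and unit variance,
    # then clips them to [-clip, clip].
    def __init__(self, shape, clip=5):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x):
        self.rs.push(x)
        x = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)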
def test(interval, runs):
    print('Testing..')
    numAgent = 10
    numGame = 1
    assert numGame == 1  # needed.

    env = {0: miniDotaEnv(args, numAgent)}

    net = ac(args)
    if not args.cpuSimulation:
        net = net.to(device)

    saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
    ckpt = torch.load(saved_ckpt_path)
    net.load_state_dict(ckpt['net'])
    net.eval()

    observations = {0: env[0].reset(0)['observations']}

    for iteration in range(runs):
        start = time.time()
        print()
        print('Start iteration %d ..' % iteration)

        if args.cpuSimulation:
            net = net.cpu()

        steps = 0
        teamscore = 0
        gameEnd = np.zeros(numGame).astype(bool)
        record = []
        teamLabel = env[0].getState().reshape((12, 4))[:10, 0]

        while steps <= args.time_horizon:  # loop for one round of games.
            if np.all(gameEnd):
                break
            steps += 1

            stateList = []
            for game in range(numGame):
                for agent in range(numAgent):
                    stateList.append(np.expand_dims(observations[game][agent], axis=0))
            stateCombined = np.concatenate(stateList, axis=0)

            with torch.no_grad():
                actionDistr = net(to_tensor(stateCombined, args.cpuSimulation))  # evaluate all envs together.
            actions = get_action(actionDistr)

            for game in range(numGame):
                if not gameEnd[game]:
                    thisGameAction = actions[10 * game:10 * (game + 1), :]  # actions from all agents of this game.

                    # Ablation test (disabled):
                    # for player in range(10):
                    #     if teamLabel[player] == 0 and steps < 100:
                    #         thisGameAction[player] = [0, 1, 1, 0]

                    envInfo = env[game].step(thisGameAction)  # the environment runs one step given the actions.
                    nextObs = envInfo['observations']  # get the next state.

                    allAction = np.concatenate([actionDistr[x] for x in range(1, 5)], axis=1)
                    record.append(np.concatenate([
                        env[0].getState(),
                        actions[0:10, :].reshape(-1),
                        allAction.reshape(-1)
                    ]))

                    rewards = envInfo['rewards']
                    dones = envInfo['local_done']
                    teamscore += sum([rewards[x] for x in env[0].getTeam0()])
                    observations[game] = nextObs

                    gameEnd[game] = np.all(dones)
                    if gameEnd[game]:
                        print('Team 0 score: %f' % teamscore)
                        simEnd = time.time()
                        print('Simulation time: %.f' % (simEnd - start))

                        recordMat = np.stack(record)  # stack expands the dimension before concatenating.
                        draw(recordMat, iteration, env[game].getUnitRange(), interval)
                        observations[game] = env[game].reset(iteration + 1)['observations']

                        drawEnd = time.time()
                        print('Drawing time: %.f' % (drawEnd - simEnd))
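# test() and train() feed batched NumPy observations to the network through a
# to_tensor helper that also takes the cpuSimulation flag. A minimal sketch,
# assuming the flag only selects whether the tensor stays on the CPU; the
# repository's helper may behave differently:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def to_tensor(x, cpu_simulation=False):
    tensor = torch.as_tensor(x, dtype=torch.float32)
    return tensor if cpu_simulation else tensor.to(device)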
def train():
    numAgent = 10  # multiple agents run synchronously.
    # Each agent has a different type with different properties.
    # Only one network is created; each agent gets its own behavior
    # according to the embedding input.
    numGame = 20  # multiple games run simultaneously.
    print('agent count:', numAgent)
    print('Env num:', numGame)

    env = {}
    for game in range(numGame):
        env[game] = miniDotaEnv(args, numAgent)

    # Initialize the neural network.
    # A single network is used so knowledge is shared across agents.
    net = ac(args)
    if not args.cpuSimulation:
        net = net.to(device)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        net.load_state_dict(ckpt['net'])

    observations, lastDone = {}, {}
    for game in range(numGame):
        observations[game] = env[game].reset(0)['observations']  # get the initial state.
        lastDone[game] = [False] * 10  # records whether each agent was done at the previous step.

    optimizer = optim.Adam(net.parameters(), lr=args.lr)

    for iteration in range(args.max_iter):  # playing-training iteration.
        start = time.time()
        print()
        print('Start iteration %d ..' % iteration)

        if args.cpuSimulation:
            net = net.cpu()
        net.eval()  # switch to evaluation mode.

        memory = []
        for i in range(numGame):
            memory.append([Memory() for j in range(numAgent)])
        # Memory is cleared at every iteration, so only the current iteration's
        # samples are used in training. Memory is kept separate per game because
        # each game's trajectory must be processed independently.

        steps = 0
        teamscore = 0  # only for game 0.
        record = []  # record the states for visualization.
        gameEnd = np.zeros(numGame).astype(bool)

        while steps <= args.time_horizon:  # loop for one round of games.
            if np.all(gameEnd):
                break
            steps += 1

            stateList = []
            for game in range(numGame):
                for agent in range(numAgent):
                    stateList.append(np.expand_dims(observations[game][agent], axis=0))
            stateCombined = np.concatenate(stateList, axis=0)
            # Concatenate the states of all games and process them through the network together.
            with torch.no_grad():
                actionDistr = net(to_tensor(stateCombined, args.cpuSimulation))
            actions = get_action(actionDistr)

            for game in range(numGame):
                if not gameEnd[game]:
                    # The following random-action exploration does not work: a random action
                    # has too small a probability density under the policy, leading to strange bugs.
                    # sample = random.random()
                    # if sample > args.randomActionRatio * (1 - min(1, iteration / 1000)):
                    #     thisGameAction = actions[10 * game:10 * (game + 1), :]
                    #     check(thisGameAction)
                    # else:
                    #     actionmove = np.random.randint(0, 3, size=(10, 3))
                    #     target = np.random.randint(0, 12, size=(10, 1))
                    #     thisGameAction = np.concatenate([actionmove, target], axis=1)
                    thisGameAction = actions[10 * game:10 * (game + 1), :]  # actions from all agents of this env.

                    envInfo = env[game].step(thisGameAction)  # the environment runs one step given the actions.
                    nextObs = envInfo['observations']  # get the next state.

                    if game == 0:
                        record.append(np.concatenate([
                            env[game].getState(),
                            actions[0:10, :].reshape(-1)
                        ]))

                    rewards = envInfo['rewards']
                    dones = envInfo['local_done']
                    # masks = list(~dones)  # would cut the return calculation at the done point.
                    masks = [True] * numAgent
                    # No need to mask out the last state-action pair,
                    # because the last reward is still useful to us.
                    for i in range(numAgent):
                        if not lastDone[game][i]:
                            memory[game][i].push(observations[game][i],
                                                 thisGameAction[i],
                                                 rewards[i],
                                                 masks[i])
                    lastDone[game] = dones

                    if game == 0:
                        teamscore += sum([rewards[x] for x in env[game].getTeam0()])
                    observations[game] = nextObs

                    gameEnd[game] = np.all(dones)
                    if gameEnd[game]:
                        if game == 0:
                            print('Game 0 score: %f' % teamscore)
                            # recordMat = np.stack(record)  # stack expands the dimension before concatenating.
                            # draw(recordMat, iteration, env[game].getUnitRange(), 10)
                        observations[game] = env[game].reset(iteration + 1)['observations']
                        lastDone[game] = [False] * 10

        simEnd = time.time()
        print('Simulation time: %.f' % (simEnd - start))

        net.train()  # switch to training mode.
        net = net.cuda()

        sts, ats, returns, advants, old_policy, old_value = [], [], [], [], [], []

        for game in range(numGame):
            for i in range(numAgent):
                batch = memory[game][i].sample()
                st, at, rt, adv, old_p, old_v = process_memory(net, batch, args)
                sts.append(st)
                ats.append(at)
                returns.append(rt)
                advants.append(adv)
                old_policy.append(old_p)
                old_value.append(old_v)

        sts = torch.cat(sts)
        ats = torch.cat(ats)
        returns = torch.cat(returns)
        advants = torch.cat(advants)
        old_policy = torch.cat(old_policy)
        old_value = torch.cat(old_value)

        train_model(net, optimizer, sts, ats, returns, advants, old_policy, old_value, args)
        # Training is based on the state-action pairs from all games of the current iteration.

        trainEnd = time.time()
        print('Training time: %.f' % (trainEnd - simEnd))

        if iteration % 10 == 0:
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_%.3f.pth.tar' % teamscore)
            save_checkpoint({
                'net': net.state_dict(),
                'args': args,
                'score': teamscore
            }, filename=ckpt_path)
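# train() collects transitions into one Memory object per agent per game and
# later reads the whole batch back with sample(). A minimal sketch of that
# interface, assuming sample() returns everything gathered in the current
# iteration; the actual class may store or batch the data differently:
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'mask'))


class Memory:
    def __init__(self):
        self.buffer = []

    def push(self, state, action, reward, mask):
        # One entry per environment step for a single agent.
        self.buffer.append(Transition(state, action, reward, mask))

    def sample(self):
        # Return the full on-policy batch; PPO does not reuse older iterations.
        return Transition(*zip(*self.buffer))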
    else:
        raise AssertionError("Should write pretrained filename in save_model folder. "
                             "ex) python3 test_algo.py --load_model ppo_max.tar")

    actor.eval(), critic.eval()

    for episode in range(args.iter):
        state = env.reset()
        steps = 0
        score = 0

        for _ in range(500):
            env.render()

            # mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
            mu, std = actor(torch.Tensor(state).unsqueeze(0))
            action2 = np.argmax(get_action(mu, std)[0])
            action = get_action(mu, std)[0]
            print('mu, std :', mu, std)

            next_state, reward, done, _ = env.step(action2)
            print('1', 'next_state :', next_state)
            next_state = running_state(next_state)  # role of ZFilter: return the state normalized for this environment.
            print('2', 'next_state :', next_state)

            state = next_state
            score += reward

            if done:
                print("{} cumulative reward: {}".format(episode, score))
                break

        print(torch.Tensor(state).unsqueeze(0))