def run():
    env = envs.make(args.env_name)
    flag_is_train = args.flag_is_train
    # flag_is_train = 1: one agent trains while the other only plays;
    # flag_is_train = 0: both agents only play (whose information is reported
    # is decided by the training-focus flag below).
    flag_focus_blue = args.flag_focus_blue
    # flag_focus_blue = 1: train agent_blue; flag_focus_blue = 0: train agent_red.
    if flag_focus_blue:
        train_agent_name = 'blue'
        red_agent = DQN(env.state_dim, env.action_dim, is_train=False, scope='red')
        blue_agent = DQN(env.state_dim, env.action_dim, is_train=flag_is_train, scope='blue')
        alloc.check_scheme(blue_agent.is_train, red_agent.is_train, train_agent_name)
        run_AirCombat_selfPlay(env, blue_agent, red_agent, train_agent_name)
    else:
        train_agent_name = 'red'
        blue_agent = DQN(env.state_dim, env.action_dim, is_train=False, scope='blue')
        red_agent = DQN(env.state_dim, env.action_dim, is_train=flag_is_train, scope='red')
        alloc.check_scheme(blue_agent.is_train, red_agent.is_train, train_agent_name)
        run_AirCombat_selfPlay(env, red_agent, blue_agent, train_agent_name)
def __init__(self, n_actions, epsilon=1.0):
    # TODO: n_actions is not large (only 11), but DQN and DRQN use different values.
    # TODO: actions are attack, move left, move right, aml, amr, tl, tr, f, b, af, ab.
    # TODO: the weapon-change strategy is currently hard-coded.
    self._dqn = DQN('deathmatch', n_actions, epsilon)
    # TODO: shouldn't be 2 ** n_actions
    self._drqn = DRQN('deathmatch', n_actions, epsilon)
def creat_n_agent(unit_list, is_train, scope, sess):
    # Build one DQN agent per unit, scoped by the unit's number.
    # NOTE: is_train is accepted but not forwarded to the DQN constructor here.
    agent_list = []
    for unit in unit_list:
        new_agent = DQN(unit.state_dim, unit.action_dim, scope + str(unit.number), sess)
        agent_list.append(new_agent)
    return agent_list
def get_initial_policy_net(LINEAR_INPUT_SCALAR=8, KERNEL=5):
    env = gym.make('gvgai-zelda-lvl0-v0')
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape
    n_actions = env.action_space.n
    init_model = [
        screen_height, screen_width, LINEAR_INPUT_SCALAR, KERNEL, n_actions
    ]
    policy_net = DQN(*init_model).to(device)
    return policy_net, init_model
def get_initial_policy_net(level='gvgai-zelda-lvl0-v0', LINEAR_INPUT_SCALAR=8,
                           KERNEL=5, env_maker=None):
    if env_maker:
        env = env_maker(level)
    else:
        import gym_gvgai
        env = gym.make(level)
    device = find_device()
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape
    n_actions = env.action_space.n
    init_model = [
        screen_height, screen_width, LINEAR_INPUT_SCALAR, KERNEL, n_actions
    ]
    policy_net = DQN(*init_model).to(device)
    return policy_net, init_model
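# get_screen is used above but not shown. Below is a minimal, hypothetical sketch
# of what such a helper could look like, assuming an old-style gym env that supports
# render(mode='rgb_array'): it converts the HWC uint8 frame to a CHW float tensor
# in [0, 1] and adds a batch dimension, so its shape unpacks as (1, C, H, W).
# The project's real helper may also crop or resize the frame.
import numpy as np
import torch


def get_screen(env, device):
    screen = env.render(mode='rgb_array')                              # H x W x C, uint8
    screen = screen.transpose((2, 0, 1)).astype(np.float32) / 255.0    # C x H x W, [0, 1]
    return torch.from_numpy(np.ascontiguousarray(screen)).unsqueeze(0).to(device)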
def train(self):
    config = self.config
    torch.manual_seed(config["seed"])
    env = UnityEnvWrapper(UnityEnvironment(file_name=config["env_path"]))
    env.reset()
    agent = DQN(config, env.state_dim, env.action_dim)

    # Epsilon parameters
    eps_start = config["eps_start"]
    eps_end = config["eps_end"]
    eps_decay = config["eps_decay"]

    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    time_start = time.time()

    for i_ep in range(1, config["n_episodes"] + 1):
        state = env.reset()
        score = 0
        while True:
            action = agent.act(state, eps)
            next_state, reward, done = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)
        mean_score = np.mean(scores_window)
        print(f'\rEpisode {i_ep}\tAverage Score: {mean_score:.2f}', end="")
        if i_ep % 100 == 0:
            print(f'\rEpisode {i_ep}\tAverage Score: {mean_score:.2f}')
            agent.save("saved_models/model")

    time_elapsed = time.time() - time_start
    print(f"Training took: {time_elapsed // 3600:.0f} hours")
    return scores
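# UnityEnvWrapper is assumed above. A minimal sketch, assuming the old unityagents
# (ml-agents v0.4) single-brain API used by the Banana environment; the project's
# actual wrapper may differ. It exposes state_dim / action_dim and the
# reset() -> state and step(action) -> (next_state, reward, done) calls the
# training loop relies on.
class UnityEnvWrapper:
    def __init__(self, unity_env):
        self._env = unity_env
        self._brain_name = unity_env.brain_names[0]
        brain = unity_env.brains[self._brain_name]
        self.action_dim = brain.vector_action_space_size
        info = unity_env.reset(train_mode=True)[self._brain_name]
        self.state_dim = len(info.vector_observations[0])

    def reset(self):
        info = self._env.reset(train_mode=True)[self._brain_name]
        return info.vector_observations[0]

    def step(self, action):
        info = self._env.step(int(action))[self._brain_name]
        return info.vector_observations[0], info.rewards[0], info.local_done[0]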
def runner(env_name, memory_bank_size, batch_size, gamma, learning_rate,
           epsilon, epsilon_min, loss, n_episodes, ma_threshold, args):
    # Initialize environment
    env = gym.make(env_name)
    nS = env.observation_space.shape[0]
    nA = env.env.action_space.n

    # Initialize memory bank and model
    memory_bank = MemoryBank(memory_bank_size)
    if args.model == 'dqn_plain':
        model = DQNPlain(nS, nA, [64], gamma, learning_rate, epsilon, epsilon_min, loss)
    elif args.model == 'dqn':
        model = DQN(nS, nA, [64], gamma, learning_rate, epsilon, epsilon_min, 1000, loss)
    elif args.model == 'ddqn':
        model = DDQN(nS, nA, [64], gamma, learning_rate, epsilon, epsilon_min, 1000, loss)

    # Initialize logging variables
    reward_list = deque()
    current_index = 0
    train_log = deque()

    for episode in range(n_episodes):
        state = env.reset()
        done = False
        steps = 0
        total_reward = 0
        while not done:
            action, e = model.take_action(state, episode)
            new_state, reward, done, info = env.step(action)
            memory_bank.add(state, action, reward, new_state, done)
            state = new_state
            if current_index > memory_bank_size:
                # Get minibatch
                minibatch = memory_bank.get_mini_batch(batch_size)
                # Train on minibatch
                model.train_minibatch(minibatch)
            steps += 1
            total_reward += int(reward)
            current_index += 1
            if (args.render_env == 'y') and (episode % args.render_freq == 0):
                env.render()
        reward_list.append(total_reward)
        moving_average = np.mean(reward_list)
        if len(reward_list) > 100:
            reward_list.popleft()
        train_log.append((episode, steps, e, total_reward, moving_average))
        logger.info(
            'Ep: {} | Steps: {} | epsilon: {:.3f} | reward: {} | moving average: {:.2f}'
            .format(episode, steps, e, total_reward, moving_average))
        if moving_average > ma_threshold:
            break

    # Save log and model weights
    train_df = pd.DataFrame(data=list(train_log), columns=[
        'episode', 'steps', 'epsilon', 'total_reward', 'moving_average'
    ])
    train_df.to_csv('./logs/{}_{}_log.csv'.format(env_name, args.model), index=False)

    # Save memory bank and weights
    memory_bank.save_memory('./logs/{}_memory_bank'.format(env_name))
    model.save_model('./logs/{}_{}_weights'.format(env_name, args.model))
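# MemoryBank is assumed above. A minimal, hypothetical replay-buffer sketch that
# matches the calls runner() makes (add / get_mini_batch / save_memory); the
# project's actual class may store transitions differently.
import pickle
import random
from collections import deque


class MemoryBank:
    def __init__(self, max_size):
        self._buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, new_state, done):
        self._buffer.append((state, action, reward, new_state, done))

    def get_mini_batch(self, batch_size):
        # Uniformly sample a batch of stored transitions.
        return random.sample(list(self._buffer), min(batch_size, len(self._buffer)))

    def save_memory(self, path):
        with open(path + '.pkl', 'wb') as f:
            pickle.dump(list(self._buffer), f)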
parser.add_argument('--task', type=str, default='pong')
parser.add_argument('--render', action='store_true')
parser.add_argument('--cpu', action='store_true')
parser.add_argument('--evaluate', type=str, default=None)
parser.add_argument('--resume', type=str, default=None, nargs=2)
args = parser.parse_args()

params = HPS[args.task]
device = torch.device('cpu') if args.cpu else torch.device('cuda')

env = make_env(params.env_name)
obs_shape = env.observation_space.shape
nb_actions = env.action_space.n

if params.net_type == 'conv':
    net = DQN((params.frame_stack, *obs_shape), nb_actions)
elif params.net_type == 'linear':
    net = DQNLinear(obs_shape, nb_actions)

agent = DQNAgent(net=net, nb_actions=nb_actions, gamma=params.gamma,
                 unroll_steps=params.unroll_steps, device=device)

if args.evaluate:
    agent.net.load_state_dict(torch.load(args.evaluate))
    env = make_env(params.env_name, episodic=False)
    evaluate(agent, env, render=args.render)
    exit()

if args.resume:
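# HPS is assumed above. A minimal sketch of the per-task hyperparameter table the
# script indexes with args.task; the field names follow the attributes accessed
# above (env_name, net_type, frame_stack, gamma, unroll_steps), but the values
# here are illustrative, not the project's actual settings.
from collections import namedtuple

HParams = namedtuple('HParams', ['env_name', 'net_type', 'frame_stack',
                                 'gamma', 'unroll_steps'])

HPS = {
    'pong': HParams(env_name='PongNoFrameskip-v4', net_type='conv',
                    frame_stack=4, gamma=0.99, unroll_steps=3),
    'cartpole': HParams(env_name='CartPole-v1', net_type='linear',
                        frame_stack=1, gamma=0.99, unroll_steps=1),
}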
    from pyglet.window import key

    # Manual-control branch: map keyboard input to discrete actions.
    def key_press(k, mod):
        global restart
        global a
        if k == key.R:
            restart = True
        if k == key.UP:
            a = 0
        if k == key.DOWN:
            a = 1
        if k == key.LEFT:
            a = 2
        if k == key.RIGHT:
            a = 3

    env.render()
    env.viewer.window.on_key_press = key_press
else:
    # Model-control branch: load a trained DQN and act through a pure policy.
    size = (args.dim + 2) * args.zoom
    model = DQN(size, size, batch_norm=True)
    model.load_state_dict(torch.load(args.filename))
    policy = PurePolicy(model)

try:
    while True:
        state = env.reset()
        total_reward = 0.0
        steps = 0
        restart = False
        while True:
            pyglet.clock.tick()
            if policy is not None:
                state_ten = tensorize(state)
                a = policy.get(state_ten)
            state, r, done, info = env.step(a)
            total_reward += r
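# tensorize and PurePolicy are assumed above. A minimal, hypothetical sketch:
# tensorize turns the raw observation into a float tensor with a batch dimension
# (the exact layout and normalization depend on what this DQN expects), and
# PurePolicy.get returns the greedy (argmax-Q) action from the loaded model.
import numpy as np
import torch


def tensorize(state):
    arr = np.asarray(state, dtype=np.float32)
    return torch.from_numpy(arr).unsqueeze(0)


class PurePolicy:
    def __init__(self, model):
        self._model = model.eval()

    @torch.no_grad()
    def get(self, state_tensor):
        q_values = self._model(state_tensor)
        return int(q_values.argmax(dim=1).item())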
        ep_scores.append(score)
    print("Scores: ", ep_scores)
    return np.mean(ep_scores)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config.yml", help="Path to the config file.")
    parser.add_argument("--n", type=int, default=10, help="Number of times to evaluate model.")
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    config = load_config(args.config)
    path = "/Users/igor/Downloads/Banana.app"
    env = UnityEnvWrapper(UnityEnvironment(file_name=path))
    env.reset()
    config["device"] = "cpu"
    agent = DQN(config, state_size=env.state_dim, action_size=env.action_dim)
    agent.load("saved_models/model", "cpu")
    eval_score = evaluate(env, agent, n_episodes=args.n)
    print(f"Eval score: {eval_score:5.3f}")
lvl = 7
env = gym.make(f'{game}-lvl{lvl}-v0')
env.reset()

device = find_device()
init_screen = get_screen(env, device)
_, _, screen_height, screen_width = init_screen.shape
n_actions = env.action_space.n

LINEAR_INPUT_SCALAR = 8
KERNEL = 5
init_model = [
    screen_height, screen_width, LINEAR_INPUT_SCALAR, KERNEL, n_actions
]
win_factor = 100

model = DQN(*init_model)
model.load_state_dict(torch.load('saved_models/torch_model_0-1-1-1-1-1'))

current_screen = get_screen(env, device)
state = current_screen

stop_after = 1000
sum_score = 0
won = 0
key_found = 0

for lvl in range(7, 8):
    level_name = f'{game}-lvl{lvl}-v0'
    print(level_name)
    env = gym.make(level_name)
def run():
    env = envs.make(args.env_name)
    blue_agent = DQN(env.state_dim, env.action_dim, is_train=1, scope='blue')
    red_agent = DQN(env.state_dim, env.action_dim, is_train=1, scope='red')
    run_NFSP(env, blue_agent, red_agent)