def __init__(self,
             sess,
             log_path,
             env_function,
             num_cpu,
             network="fullyconv",
             ar=True,
             lr=1e-4,
             optimizer="rmsprop",
             ent_coef=1e-3,
             vf_coef=1.0,
             max_grad_norm=0.5,
             nsteps=5,
             nstack=1,
             gamma=0.99):
    if optimizer == "adam":
        self.trainer = tf.train.AdamOptimizer
    elif optimizer == "rmsprop":
        self.trainer = tf.train.RMSPropOptimizer
    else:
        raise NotImplementedError

    network_func = None
    if network == "fullyconv":
        network_func = networks.FullyConvNet
    elif network == "atari":
        network_func = networks.AtariNet
    else:
        raise NotImplementedError

    self.sess = sess
    self.log_path = log_path
    self.num_cpu = num_cpu
    self.env_function = env_function
    self.init_lr = lr
    self.env = SubprocVecEnv([self.env_function(i) for i in range(1)])
    self.model = Model(network_func=network_func,
                       screen_space=self.env.screen_space,
                       minimap_space=self.env.minimap_space,
                       ns_space=self.env.ns_space,
                       trainer=self.trainer,
                       ar=ar,
                       nstack=nstack,
                       ent_coef=ent_coef,
                       vf_coef=vf_coef,
                       max_grad_norm=max_grad_norm)
    self.gamma = gamma
    self.nsteps = nsteps
    self.ar = ar
    if ar:
        self.step_func = self.step_policy_ar
    else:
        self.step_func = self.step_policy
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
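The `make_thunk` helper above defers environment construction so that each worker process builds its own env, and binding `rank` through a function argument avoids Python's late-binding closure pitfall. Below is a minimal, hypothetical sketch of the same pattern outside the Baselines helper; the env id `CartPole-v1`, the seed, and the worker count of 4 are illustrative assumptions, not part of the original code.

import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def make_thunk(rank, seed=0):
    # Capturing `rank` as a function argument keeps each worker's seed distinct;
    # building `lambda: gym.make(...)` directly inside a loop would late-bind
    # the loop variable.
    def _thunk():
        env = gym.make('CartPole-v1')  # illustrative env id
        env.seed(seed + rank)
        return env
    return _thunk


if __name__ == '__main__':
    venv = SubprocVecEnv([make_thunk(i) for i in range(4)])
    obs = venv.reset()   # batched observations, one row per worker
    venv.close()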
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 cloth_cfg_path=None,
                 render_path=None,
                 start_state_path=None):
    """Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.

    Daniel: the above docs from baselines seems out of date, ALL types go here?
    Also, we're adding arguments for the cloth env: the config path, the render
    path, and the starting state path (last one is optional for the cloth).
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank, cloth_cfg_path=None, render_path=None, start_state_path=None):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir,
            cloth_cfg_path=cloth_cfg_path,
            render_path=render_path,
            start_state_path=start_state_path,
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([
            make_thunk(i + start_index,
                       cloth_cfg_path=cloth_cfg_path,
                       render_path=None,  # Daniel: for now
                       start_state_path=start_state_path)
            for i in range(num_env)
        ])
    else:
        return DummyVecEnv([
            make_thunk(start_index,
                       cloth_cfg_path,
                       render_path=render_path,
                       start_state_path=start_state_path)
        ])
def main():
    FLAGS(sys.argv)
    env = SubprocVecEnv(1, 'CollectMineralShards')
    env.reset()
    total_reward = 0

    for _ in range(1000):
        marine = random.randrange(2)
        x = random.randrange(32)
        y = random.randrange(32)
        print('Move %d to (%d, %d)' % (marine, x, y))
        move_action = construct_action(marine, x, y)

        # This controls the APM.
        for _ in range(7):
            obs, rs, dones, _, _, _, selected, screens = env.step([move_action])
            total_reward += rs

        # Querying the position
        m_pos = {}
        m_pos['0'], rs, dones = get_position(env, 0)
        total_reward += rs
        m_pos['1'], rs, dones = get_position(env, 1)
        total_reward += rs
        print(rs)
        print(dones)
        print('Total reward: ', total_reward)
        print(m_pos)

    env.close()
def make_vec_envs(evaluation):
    def env_thunk(rank):
        return lambda: self.make_env(
            seed=int(seed), rank=rank, evaluation=evaluation, env_id=env_id)

    env_fns = [env_thunk(i) for i in range(num_processes)]
    use_dummy = len(env_fns) == 1 or sys.platform == "darwin" or synchronous
    return VecPyTorch(
        DummyVecEnv(env_fns, render=render) if use_dummy else SubprocVecEnv(env_fns))
def make_vec_envs(
    self,
    num_processes,
    gamma,
    render,
    synchronous,
    env_id,
    add_timestep,
    seed,
    evaluation,
    time_limit,
    num_frame_stack=None,
    **env_args,
):
    envs = [
        functools.partial(  # thunk
            self.make_env,
            rank=i,
            env_id=env_id,
            add_timestep=add_timestep,
            seed=seed,
            evaluation=evaluation,
            time_limit=time_limit,
            evaluating=evaluation,
            **env_args,
        ) for i in range(num_processes)
    ]

    if len(envs) == 1 or sys.platform == "darwin" or synchronous:
        envs = DummyVecEnv(envs, render=render)
    else:
        envs = SubprocVecEnv(envs)

    # if (
    #     envs.observation_space.shape
    #     and len(envs.observation_space.shape) == 1
    # ):
    #     if gamma is None:
    #         envs = VecNormalize(envs, ret=False)
    #     else:
    #         envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack)
    # elif len(envs.observation_space.shape) == 3:
    #     envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
def build_env4gail(args, nenv):
    def make_env():
        def _thunk():
            env = gym.make(args.env_id)
            env.seed(args.seed)  # to make the result more reproducible
            env = Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env
        return _thunk

    envs = [make_env() for i in range(nenv)]
    envs = SubprocVecEnv(envs)
    envs = VecNormalize(envs)
    return envs
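A hypothetical invocation of `build_env4gail` above (not from the original source); the `SimpleNamespace` stand-in for the parsed args, the env id, and the worker count are assumptions. `VecNormalize` keeps running statistics, so observations and returns come back normalized.

# Illustrative only: a stand-in args object for build_env4gail (names assumed).
from types import SimpleNamespace

args = SimpleNamespace(env_id='Hopper-v2', seed=0)
envs = build_env4gail(args, nenv=8)   # 8 subprocess workers, normalized obs/returns
obs = envs.reset()                    # shape (8, obs_dim)
envs.close()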
def make_atari_env(env_id, num_threads, seed, frame_stack=4):
    game_lives = gym.make(env_id).unwrapped.ale.lives()
    game_lives = game_lives if game_lives != 0 else 1

    def make_env(rank):
        def _thunk():
            env = wrappers.make_atari(env_id)
            env.seed(seed + rank)
            return wrappers.wrap_deepmind(env)
        return _thunk

    np.random.seed(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_threads)])
    env = VecFrameStack(env, frame_stack)
    return env, game_lives
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
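A short usage sketch for the Baselines-style helper above (not part of the original listing); the env id, worker count, and seed are illustrative, and frame stacking is typically layered on top with `VecFrameStack`.

# Illustrative usage of make_atari_env (values are assumptions).
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

venv = make_atari_env('BreakoutNoFrameskip-v4', num_env=8, seed=42)
venv = VecFrameStack(venv, 4)   # stack 4 frames, as most Atari agents expect
obs = venv.reset()              # roughly (8, 84, 84, 4) with the default deepmind wrappers
actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
obs, rewards, dones, infos = venv.step(actions)
venv.close()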
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])
    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id)
            env.seed(seed + 10000 * mpi_rank + rank if seed is not None else None)
            env = Monitor(env,
                          logger.get_dir()
                          and os.path.join(logger.get_dir(),
                                           str(mpi_rank) + '.' + str(rank)),
                          allow_early_resets=True)
            if env_type == 'atari':
                return wrap_deepmind(env, **wrapper_kwargs)
            elif reward_scale != 1:
                return RewardScaler(env, reward_scale)
            else:
                return env
        return _thunk

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_env(start_index)])
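For completeness, a hedged sketch of calling the combined Atari/MuJoCo builder above with a single environment, which falls through to the `DummyVecEnv` branch and applies `RewardScaler` because `reward_scale != 1`; the env id and scale are illustrative assumptions.

# Illustrative: a single-env call takes the DummyVecEnv branch (values assumed).
venv = make_vec_env('HalfCheetah-v2', env_type='mujoco', num_env=1,
                    seed=0, reward_scale=0.1)
obs = venv.reset()
venv.close()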
class Agent(object):
    def __init__(self,
                 sess,
                 log_path,
                 env_function,
                 num_cpu,
                 network="fullyconv",
                 ar=True,
                 lr=1e-4,
                 optimizer="rmsprop",
                 ent_coef=1e-3,
                 vf_coef=1.0,
                 max_grad_norm=0.5,
                 nsteps=5,
                 nstack=1,
                 gamma=0.99):
        if optimizer == "adam":
            self.trainer = tf.train.AdamOptimizer
        elif optimizer == "rmsprop":
            self.trainer = tf.train.RMSPropOptimizer
        else:
            raise NotImplementedError

        network_func = None
        if network == "fullyconv":
            network_func = networks.FullyConvNet
        elif network == "atari":
            network_func = networks.AtariNet
        else:
            raise NotImplementedError

        self.sess = sess
        self.log_path = log_path
        self.num_cpu = num_cpu
        self.env_function = env_function
        self.init_lr = lr
        self.env = SubprocVecEnv([self.env_function(i) for i in range(1)])
        self.model = Model(network_func=network_func,
                           screen_space=self.env.screen_space,
                           minimap_space=self.env.minimap_space,
                           ns_space=self.env.ns_space,
                           trainer=self.trainer,
                           ar=ar,
                           nstack=nstack,
                           ent_coef=ent_coef,
                           vf_coef=vf_coef,
                           max_grad_norm=max_grad_norm)
        self.gamma = gamma
        self.nsteps = nsteps
        self.ar = ar
        if ar:
            self.step_func = self.step_policy_ar
        else:
            self.step_func = self.step_policy

    def training_process(self, epoch, train_steps, epsilon):
        # set learning rate
        current_lr = self.init_lr * (0.98**epoch)
        current_lr = max(current_lr, self.init_lr / 2.)

        mb_screen, mb_minimap, mb_ns, mb_rewards, mb_actions, mb_available_actions, mb_pos, mb_values, mb_dones = [], [], [], [], [], [], [], [], []
        mb_args, mb_args_used = dict(), dict()
        mb_use_spatial_actions = []
        for act_type in actions.TYPES:
            mb_args[act_type.name] = []
            mb_args_used[act_type.name] = []
        mb_states = []

        self.epsilon = epsilon
        self.remake_env(self.num_cpu)
        obs, info = self.env.reset()
        screen, minimap, ns, available_actions = obs["screen"], obs[
            "minimap"], obs["ns"], info["available_actions"]
        states = None
        update_steps = 0
        start_time = time.time()
        print("=== Training Epoch: ", epoch, ", Learning Rate: ", current_lr, " ===")

        # with tqdm(total=train_steps) as pbar:
        while True:
            print_log = False
            if len(mb_screen) == self.nsteps - 1:
                print_log = True
            action, arg, value, state = self.step_func(
                screen, minimap, ns, states,
                print_log=print_log,
                epsilon=epsilon,
                available_actions=available_actions)
            # action, arg, value, state = self.step_epsilon(screen, minimap, ns, states, print_log=print_log, epsilon=epsilon, available_actions=available_actions)

            mb_screen.append(np.copy(screen))
            mb_minimap.append(np.copy(minimap))
            mb_ns.append(np.copy(ns))
            mb_available_actions.append(np.copy(available_actions))
            mb_actions.append(action)
            # for a in arg:
            for act_type in actions.TYPES:
                temp, temp_used = [], []
                for a in arg:
                    if a[act_type.name] != -1:
                        temp.append(a[act_type.name])
                        temp_used.append(1.)
                    else:
                        temp.append(0)
                        temp_used.append(0.)
                mb_args[act_type.name].append(temp)
                mb_args_used[act_type.name].append(temp_used)
            mb_values.append(value)
            mb_dones.append(info["last"])

            next_obs, info = self.env.step(action, arg)
            '''
            # This part seems useless. Check later.
            for idx, done in enumerate(info["last"]):
                if done:
                    obs[idx] = obs[idx] * 0
            '''
            obs = next_obs
            mb_rewards.append(info["reward"])
            screen, minimap, ns, available_actions = obs["screen"], obs[
                "minimap"], obs["ns"], info["available_actions"]

            if len(mb_screen) == self.nsteps:
                mb_dones.append(info["last"])
                mb_screen = np.asarray(mb_screen, dtype=np.float32).swapaxes(
                    1, 0).reshape((self.num_cpu * self.nsteps, ) + self.env.screen_space)
                mb_minimap = np.asarray(mb_minimap, dtype=np.float32).swapaxes(
                    1, 0).reshape((self.num_cpu * self.nsteps, ) + self.env.minimap_space)
                mb_ns = np.asarray(mb_ns, dtype=np.float32).swapaxes(
                    1, 0).reshape((self.num_cpu * self.nsteps, ) + self.env.ns_space)
                mb_available_actions = np.asarray(
                    mb_available_actions, dtype=np.float32).swapaxes(
                        1, 0).reshape((self.num_cpu * self.nsteps, ) +
                                      (len(pysc2.lib.actions.FUNCTIONS), ))
                mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
                mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
                for act_type in actions.TYPES:
                    mb_args[act_type.name] = np.asarray(
                        mb_args[act_type.name], dtype=np.int32).swapaxes(1, 0)
                    mb_args_used[act_type.name] = np.asarray(
                        mb_args_used[act_type.name], dtype=np.float32).swapaxes(1, 0)
                mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
                mb_dones = np.asarray(mb_dones, dtype=np.float32).swapaxes(1, 0)
                mb_masks = mb_dones[:, :-1]
                mb_dones = mb_dones[:, 1:]

                last_values = self.value(screen, minimap, ns).tolist()
                for n, (rewards, dones, value) in enumerate(
                        zip(mb_rewards, mb_dones, last_values)):
                    rewards = rewards.tolist()
                    dones = dones.tolist()
                    if dones[-1] == 0:
                        rewards = discount_with_dones(rewards + [value],
                                                      dones + [0],
                                                      self.gamma)[:-1]
                    else:
                        rewards = discount_with_dones(rewards, dones, self.gamma)
                    mb_rewards[n] = rewards

                mb_rewards = mb_rewards.flatten()
                mb_actions = mb_actions.flatten()
                for act_type in actions.TYPES:
                    mb_args[act_type.name] = mb_args[act_type.name].flatten()
                    mb_args_used[act_type.name] = mb_args_used[act_type.name].flatten()
                mb_values = mb_values.flatten()
                mb_masks = mb_masks.flatten()

                self.train(current_lr, mb_screen, mb_minimap, mb_ns, mb_states,
                           mb_rewards, mb_masks, mb_actions,
                           mb_use_spatial_actions, mb_available_actions,
                           mb_pos, mb_values, mb_args, mb_args_used)
                update_steps += 1
                # pbar.update(1)

                mb_screen, mb_minimap, mb_ns, mb_rewards, mb_actions, mb_available_actions, mb_pos, mb_values, mb_dones = [], [], [], [], [], [], [], [], []
                mb_args, mb_args_used = dict(), dict()
                for act_type in actions.TYPES:
                    mb_args[act_type.name] = []
                    mb_args_used[act_type.name] = []

                if update_steps == train_steps:
                    break

        self.env.close()
        print("=== Took ", (time.time() - start_time),
              " seconds to finish ", train_steps, " updates.===")

    def evaluating_process(self, epoch, episodes):
        # Since the game lengths are different for each game, only use one thread when evaluating
        self.remake_env(1)
        rewards = []
        for _ in range(episodes):
            episode_reward = [0]
            obs, info = self.env.reset()
            screen, minimap, ns, available_actions = obs["screen"], obs[
                "minimap"], obs["ns"], info["available_actions"]
            states = None
            while True:
                action, arg, value, state = self.step_func(
                    screen, minimap, ns, states,
                    available_actions=available_actions)
                obs, info = self.env.step(action, arg)
                episode_reward = [
                    sum(x) for x in zip(episode_reward, info["reward"])
                ]
                screen, minimap, ns, available_actions = obs["screen"], obs[
                    "minimap"], obs["ns"], info["available_actions"]
                if info["last"][0]:
                    rewards.append(episode_reward)
                    break
        rewards = [r for sublist in rewards for r in sublist]
        self.env.save_replay("%sreplay" % self.log_path, epoch)
        self.env.close()
        return rewards

    def step_policy_ar(self, screen, minimap, ns, *_args, **_kwargs):
        a_s, a_probs, v_s, args = self.sess.run(
            [
                self.model.pi_selected, self.model.base_action_softmax,
                self.model.value, self.model.args_selected
            ], {
                self.model.screen: screen,
                self.model.minimap: minimap,
                self.model.ns: ns,
                self.model.act_mask: _kwargs["available_actions"]
            })
        a_s = np.reshape(a_s, -1)
        a_probs = np.reshape(a_probs, (-1, self.env.base_action_count))
        aid_s, args_s = [], []
        for idx in range(len(a_s)):
            aid = a_s[idx]
            aid_s.append(aid)
            temp_args = dict()
            for k, v in args.items():
                temp_args[k] = -1
            for arg in actions.FUNCTIONS[aid].args:
                temp_args[arg.name] = np.reshape(args[arg.name], -1)[idx]
            args_s.append(temp_args)
        if _kwargs.get('print_log', False):
            if args_s[0]["screen"] != -1:
                print("AR action: ", aid_s[0],
                      " action_prob: ", a_probs[0][aid_s[0]],
                      " pos: (", args_s[0]["screen"] % self.env.screen_space[1],
                      ",", args_s[0]["screen"] // self.env.screen_space[1],
                      ") ", end='')
            else:
                print("AR action: ", aid_s[0],
                      " action_prob: ", a_probs[0][aid_s[0]], end='')
        return aid_s, args_s, v_s, []

    def step_policy(self, screen, minimap, ns, *_args, **_kwargs):
        a_s, v_s, args = self.sess.run(
            [self.model.base_action_softmax, self.model.value, self.model.args],
            {
                self.model.screen: screen,
                self.model.minimap: minimap,
                self.model.ns: ns
            })
        available_actions = _kwargs["available_actions"]
        filtered_a = np.multiply(a_s, available_actions)
        filtered_a /= np.sum(filtered_a, axis=1, keepdims=True)
        aid_s, args_s = [], []
        for idx in range(np.shape(filtered_a)[0]):
            aid = np.random.choice(len(filtered_a[idx, :]), p=filtered_a[idx, :])
            aid_s.append(aid)
            temp_args = dict()
            # initialize all arguments to -1
            for k, v in args.items():
                temp_args[k] = -1
            # only sample needed arguments
            for arg in actions.FUNCTIONS[aid].args:
                temp_args[arg.name] = np.random.choice(
                    len(args[arg.name][idx]), p=args[arg.name][idx])
            args_s.append(temp_args)
        if _kwargs.get('print_log', False):
            if args_s[0]["screen"] != -1:
                print("action: ", aid_s[0],
                      " action_prob: ", filtered_a[0][aid_s[0]],
                      " pos: (", args_s[0]["screen"] % self.env.screen_space[1],
                      ",", args_s[0]["screen"] // self.env.screen_space[1],
                      ") pos_prob: ", args["screen"][0][args_s[0]["screen"]], end='')
            else:
                print("action: ", aid_s[0],
                      " action_prob: ", filtered_a[0][aid_s[0]], end='')
        return aid_s, args_s, v_s, []

    '''
    def step_policy(self, screen, minimap, ns, *_args, **_kwargs):
        a_s, v_s, pos_s = self.sess.run(
            [self.model.policy_a, self.model.value, self.model.policy_pos],
            {self.model.screen: screen,
             self.model.minimap: minimap,
             self.model.ns: ns,
             self.model.AVAIL_ACTION: _kwargs["available_actions"]})
        if _kwargs.get('print_log', False):
            print ("action: ", a_s[0], " pos: (", pos_s[0] % 64, ", ", pos_s[0] // 64, ")")
        # print (np.shape(a_s))
        # input()
        return a_s, pos_s, v_s, []
    '''

    def step_epsilon(self, screen, minimap, ns, *_args, **_kwargs):
        a_s, v_s, args = self.sess.run(
            [self.model.base_action_softmax, self.model.value, self.model.args],
            {
                self.model.screen: screen,
                self.model.minimap: minimap,
                self.model.ns: ns
            })
        available_actions = _kwargs["available_actions"]
        filtered_a = np.multiply(a_s, available_actions)
        filtered_a /= np.sum(filtered_a, axis=1, keepdims=True)
        aid_s, args_s = [], []
        for idx in range(np.shape(filtered_a)[0]):
            aid = None
            if np.random.uniform() < self.epsilon:
                available_act_ids = np.nonzero(_kwargs["available_actions"][idx])[0]
                aid = np.random.choice(available_act_ids)
                # print ("Random action:", aid)
            else:
                aid = np.random.choice(len(filtered_a[idx, :]),
                                       p=filtered_a[idx, :])
            aid_s.append(aid)
            temp_args = dict()
            # initialize all arguments to -1
            for k, v in args.items():
                temp_args[k] = -1
            # only sample needed arguments
            for arg in actions.FUNCTIONS[aid].args:
                temp_args[arg.name] = np.random.choice(
                    len(args[arg.name][idx]), p=args[arg.name][idx])
            args_s.append(temp_args)
        if _kwargs.get('print_log', False):
            if args_s[0]["screen"] != -1:
                print("action: ", aid_s[0],
                      " action_prob: ", filtered_a[0][aid_s[0]],
                      " pos: (", args_s[0]["screen"] % self.env.screen_space[1],
                      ",", args_s[0]["screen"] // self.env.screen_space[1],
                      ") pos_prob: ", args["screen"][0][args_s[0]["screen"]], end='')
            else:
                print("action: ", aid_s[0],
                      " action_prob: ", filtered_a[0][aid_s[0]], end='')
        return aid_s, args_s, v_s, []

    def train(self, lr, screen, minimap, ns, states, rewards, masks, acts,
              use_spatial_actions, available_actions, pos, values, args,
              args_used):
        advs = rewards - values
        td_map = {
            self.model.screen: screen,
            self.model.minimap: minimap,
            self.model.ns: ns,
            self.model.acts: acts,
            # self.model.avail_actions: available_actions,
            self.model.act_mask: available_actions,
            self.model.advs: advs,
            self.model.rewards: rewards,
            self.model.lr: lr
        }
        for act_type in actions.TYPES:
            td_map[self.model.act_args[act_type.name]] = args[act_type.name]
            td_map[self.model.act_args_used[act_type.name]] = args_used[act_type.name]
        _, pg_loss, neglogpac, entropy, vf_loss = self.sess.run([
            self.model._train, self.model.pg_loss, self.model.neglogpac,
            self.model.entropy, self.model.vf_loss
        ], td_map)
        print(" pg_loss: ", pg_loss, " entropy: ", entropy, " vf_loss: ", vf_loss)

    def value(self, screen, minimap, ns, *_args, **_kwargs):
        v = self.sess.run(
            self.model.value, {
                self.model.screen: screen,
                self.model.minimap: minimap,
                self.model.ns: ns
            })
        return v

    def save_model(self, epoch):
        ps = self.sess.run(self.model.params)
        # make_path(save_path)
        open("%s%d.checkpoint" % (self.log_path, epoch), "w+")
        joblib.dump(ps, "%s/%d.checkpoint" % (self.log_path, epoch))

    def remake_env(self, num_cpu):
        self.env.close()
        self.env = SubprocVecEnv(
            [self.env_function(i) for i in range(num_cpu)])
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
    print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            visualize=True,
                            screen_size_px=(16, 16),
                            minimap_size_px=(16, 16)) as env:
            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)
            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            screen_size_px=(32, 32),
                            minimap_size_px=(32, 32),
                            visualize=True) as env:
            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
def main():
    FLAGS(sys.argv)
    steps = 0  # Test steps

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
    print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        # temp solution - sc2_env.Agent(sc2_env.Race.terran) might be too restricting
        # We need this change because sc2 now requires specifying players.
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                # players=[sc2_env.Agent(sc2_env.Race.terran), sc2_env.Agent(sc2_env.Race.terran)],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:
            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)
            acts = deepq_nexus_wars.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            agent = random_agent.RandomAgent()
            run_loop.run_loop([agent], env, steps)
            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(map_name="Simple64",
                            players=[
                                sc2_env.Agent(race=sc2_env.Race.terran),
                                sc2_env.Agent(race=sc2_env.Race.terran)
                            ],
                            step_mul=step_mul,
                            agent_interface_format=AGENT_INTERFACE_FORMAT,
                            visualize=True) as env:
            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_cpu : %s" % FLAGS.num_cpu)
    print("lr : %s" % FLAGS.lr)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        with sc2_env.SC2Env("CollectMineralShards",
                            step_mul=step_mul,
                            visualize=True) as env:
            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)
            act = deepq_mineral_shards.learn(env,
                                             q_func=model,
                                             num_actions=64,
                                             lr=1e-3,
                                             max_timesteps=20000000,
                                             buffer_size=10000,
                                             exploration_fraction=0.5,
                                             exploration_final_eps=0.01,
                                             train_freq=4,
                                             learning_starts=10000,
                                             target_network_update_freq=1000,
                                             gamma=0.99,
                                             prioritized_replay=True,
                                             callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "acktr":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0

        # def make_env(rank):
        #     # env = sc2_env.SC2Env(
        #     #     "CollectMineralShards",
        #     #     step_mul=step_mul)
        #     # return env
        #     # env.seed(seed + rank)
        #     def _thunk():
        #         env = sc2_env.SC2Env(
        #             map_name=FLAGS.map,
        #             step_mul=step_mul,
        #             visualize=True)
        #         # env.seed(seed + rank)
        #         if logger.get_dir():
        #             env = bench.Monitor(
        #                 env,
        #                 os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
        #         return env
        #     return _thunk

        # agents = [Agent() for _ in range(num_cpu)]
        #
        # for agent in agents:
        #     time.sleep(1)
        #     agent.daemon = True
        #     agent.start()
        # agent_controller = AgentController(agents)

        # set_global_seeds(seed)
        env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map)
        policy_fn = CnnPolicy
        acktr_disc.learn(policy_fn,
                         env,
                         seed,
                         total_timesteps=num_timesteps,
                         nprocs=FLAGS.num_cpu,
                         ent_coef=0.1,
                         callback=acktr_callback)
def remake_env(self, num_cpu):
    self.env.close()
    self.env = SubprocVecEnv(
        [self.env_function(i) for i in range(num_cpu)])
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
    print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            # interface.feature_layer.resolution and interface.feature_layer.minimap_resolution
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)  # 16 16
            # feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)  # 16 16
        )
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,  # how fast the game advances; roughly a human player's effective actions per second
                visualize=True,
                # screen_size_px=(16, 16),
                # minimap_size_px=(16, 16)) as env:
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:
            # This model takes an observation as input and returns values for all
            # actions; note how it is used in deepq_mineral_shards.learn.
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)  # (num filters, kernel size, stride)
                # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True)  # (num filters, kernel size, stride)
            act = deepq_mineral_shards.learn(  # train the model and save it
                # act = deepq_ActSeparate.learn(  # train the model and save it
                # act = deepq_actSeparateWith4Directions.learn(
                # act = deepq_actionGroup_4way.learn(
                # act = deep_DiffActInSameTime.learn(
                env,
                q_func=model,
                num_actions=4,  # default 16; also tried num_actions=256, 3, 4
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_actSeparateWith4Directions_callback
            )  # deepq_callback; deepq_ActSeperate_callback; deepq_actSeparateWith4Directions_callback; deep_DiffActInSameTime_callback
            # After all training steps, save the trained model to
            # mineral_shards.pkl for use by enjoy_mineral_shards.py.
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(
                # map_name="CollectMineralShards",
                step_mul=step_mul,
                # screen_size_px=(32, 32),
                # minimap_size_px=(32, 32),
                save_replay_episodes=2,
                replay_dir="D:/StarCraft II/StarCraft II/video",
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            # model = deepq.models.mlp(hiddens=[256, 128, 4])
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)