def __init__(self, env, policy, league_mgr_addr, model_pool_addrs,
             learner_addr=None, unroll_length=32, update_model_freq=32,
             n_v=1, verbose=0, rwd_shape=True, log_interval_steps=51):
  super(PPOActor, self).__init__(league_mgr_addr, model_pool_addrs,
                                 learner_addr, verbose=verbose,
                                 log_interval_steps=log_interval_steps)
  logger.configure(dir=None, format_strs=['stdout'])
  logger.set_level(verbose)

  self.env = env
  self.env.reset()
  # the two agents share the same Box observation space
  sp = self.env.observation_space.spaces[0]
  sp = spaces.Box(low=0, high=1, shape=sp.shape)
  self.obs_space = spaces.Tuple([sp] * 2)
  self.agents = [
    PPOAgent(policy, ob_space, ac_space, n_v=n_v, scope_name=scope_name)
    for ob_space, ac_space, scope_name in zip(
      self.env.observation_space.spaces,
      self.env.action_space.spaces,
      ["self"] + _get_oppo_names(env))
  ]
  self.env.close()

  self._learning_agent_id = 0
  self._enable_push = learner_addr is not None
  self._update_model_freq = update_model_freq
  self._unroll_length = unroll_length
  self._gamma = 0.95
  self._lam = 0.9
  self._reward_weights = None
  self.n_v = n_v  # reward/value length
  self.models = [None, None]
  self.rwd_shape = rwd_shape
  self.should_log_info = True  # TODO(pengsun): make it an argument

  if self._enable_push:
    # push collected unrolls to the learner from a background daemon thread
    self._data_queue = Queue(unroll_length)
    self._push_thread = Thread(target=self._push_data,
                               args=(self._data_queue,))
    self._push_thread.daemon = True
    self._push_thread.start()
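# --- Illustrative sketch (not part of TLeague) -----------------------------
# The constructor above wires a bounded Queue and a daemon Thread so that
# rollout collection and pushing data to the learner can overlap. Below is a
# minimal, self-contained sketch of that producer/consumer pattern using only
# the standard library; all names are made up for illustration.
from queue import Queue
from threading import Thread

def _push_data_sketch(data_queue):
  # consumer: drain unrolls and "send" them (stubbed with a print)
  while True:
    unroll = data_queue.get()
    if unroll is None:  # sentinel to stop the thread
      break
    print('pushing unroll of length', len(unroll))

q = Queue(maxsize=32)                 # bounded, like Queue(unroll_length)
t = Thread(target=_push_data_sketch, args=(q,))
t.daemon = True                       # do not block interpreter exit
t.start()
for i in range(3):
  q.put(list(range(i, i + 4)))        # producer: enqueue fake unrolls
q.put(None)
t.join(timeout=1.0)
# ---------------------------------------------------------------------------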
def __init__(self, league_mgr_addr, model_pool_addrs, learner_addr=None,
             verbose=0, log_interval_steps=51):
  ip, hostname = get_ip_hostname()
  self._actor_id = hostname + '@' + ip + ':' + str(uuid.uuid1())[:8]
  self._learner_id = None
  self._league_mgr_apis = LeagueMgrAPIs(league_mgr_addr)
  self._model_pool_apis = ModelPoolAPIs(model_pool_addrs)
  if learner_addr:
    self._learner_apis = LearnerAPIs(learner_addr)
    self._learner_id = self._learner_apis.request_learner_id()
  self._log_interval_steps = log_interval_steps

  logger.configure(dir=None, format_strs=['stdout'])
  logger.set_level(verbose)

  self.task = None
  self._steps = 0
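# --- Illustrative sketch (not part of TLeague) -----------------------------
# The actor id above has the form hostname@ip:<8-char uuid1 prefix>. A quick
# stand-alone illustration of that format; TLeague's get_ip_hostname helper
# is approximated here with the standard socket module.
import socket
import uuid

hostname = socket.gethostname()
try:
  ip = socket.gethostbyname(hostname)
except socket.gaierror:
  ip = '127.0.0.1'
actor_id = hostname + '@' + ip + ':' + str(uuid.uuid1())[:8]
print(actor_id)  # e.g. myhost@10.0.0.5:1a2b3c4d
# ---------------------------------------------------------------------------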
def __init__(self, league_mgr_addr, model_pool_addrs, learner_ports, rm_size,
             batch_size, ob_space, ac_space, policy, gpu_id,
             policy_config={}, ent_coef=1e-2, distill_coef=1e-2, vf_coef=0.5,
             max_grad_norm=0.5, rwd_shape=False, pub_interval=500,
             log_interval=100, save_interval=0, total_timesteps=5e7,
             burn_in_timesteps=0, learner_id='', batch_worker_num=4,
             pull_worker_num=2, unroll_length=32, rollout_length=1,
             use_mixed_precision=False, use_sparse_as_dense=True,
             adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-5,
             data_type=PGData, data_server_version='v1', decode=False,
             log_infos_interval=20, **kwargs):
  super(PGLearner, self).__init__(league_mgr_addr, model_pool_addrs,
                                  learner_ports, learner_id)

  self.LR = tf.placeholder(tf.float32, [])
  """Learning Rate"""

  self.CLIPRANGE = tf.placeholder(tf.float32, [])
  """Clip Range (for PPO-style loss clipping)"""

  self.ep_loss_coef = {}
  """Coefficients for the losses from the endpoints. Override it in a
  derived class."""

  # TODO(pengsun): fix the policy_config default value
  self._init_const(total_timesteps, burn_in_timesteps, batch_size,
                   unroll_length, rwd_shape, ent_coef, vf_coef, pub_interval,
                   log_interval, save_interval, policy, distill_coef,
                   policy_config, rollout_length)

  # allow_soft_placement=True fixes the issue where some op cannot be placed
  # on GPU for tf-1.8.0; tf-1.13.1 does not have this issue
  config = tf.ConfigProto(allow_soft_placement=True)
  config.gpu_options.allow_growth = True
  config.gpu_options.visible_device_list = str(gpu_id)
  self.sess = tf.Session(config=config)
  self.rank = hvd.rank() if has_hvd else 0

  # Prepare dataset
  ds = data_type(ob_space, ac_space, self.n_v, use_lstm=self.rnn,
                 hs_len=self.hs_len, distillation=self.distillation,
                 version='v2')
  self._data_server = DataServer(self._pull_data, rm_size, unroll_length,
                                 batch_size, ds,
                                 gpu_id_list=(0,),
                                 batch_worker_num=batch_worker_num,
                                 pull_worker_num=pull_worker_num,
                                 rollout_length=rollout_length,
                                 prefetch_buffer_size=2,
                                 version=data_server_version,
                                 decode=decode,
                                 log_infos_interval=log_infos_interval)

  # prepare net config
  net_config = policy.net_config_cls(ob_space, ac_space, **policy_config)
  net_config.clip_range = self.CLIPRANGE
  if rwd_shape:
    # make net_config.reward-shaping-weights a tf.placeholder so as to change
    # it during training.
    # NOTE: assume there is reward_weights_shape in net_config
    # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
    reward_weights_shape = net_config.reward_weights_shape
    self.rwd_weights = tf.placeholder(tf.float32, reward_weights_shape)
    net_config.reward_weights = self.rwd_weights

  if hasattr(net_config, 'lam'):
    # make net_config.lambda-for-td-lambda a tf.placeholder so as to change it
    # during training.
    # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
    self.LAM = tf.placeholder(tf.float32, [])
    net_config.lam = self.LAM
  else:
    self.LAM = None

  # build the policy net
  with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
    pass

  def create_policy(inputs, nc):
    return policy.net_build_fun(inputs=inputs, nc=nc, scope=model_scope)

  # visible_device_list above maps gpu_id to device 0 within this process
  device = '/gpu:{}'.format(0)
  with tf.device(device):
    input_data = self._data_server.input_datas[0]
    if 'use_xla' in policy_config and policy_config['use_xla']:
      try:
        # use tensorflow's accelerated linear algebra (XLA) compilation
        with tf.xla.experimental.jit_scope(True):
          model = create_policy(input_data, net_config)
      except Exception:
        logger.log("WARNING: using tf.xla requires tf version>=1.15.")
        model = create_policy(input_data, net_config)
    else:
      model = create_policy(input_data, net_config)
    loss, vf_loss, losses = self.build_loss(model, input_data)

  if has_hvd:
    self.losses = [hvd.allreduce(loss) for loss in losses]
  else:
    self.losses = list(losses)

  self.params = tf.trainable_variables(scope='model')
  self.params_vf = tf.trainable_variables(scope='model/vf')
  self.param_norm = tf.global_norm(self.params)

  self.trainer = tf.train.AdamOptimizer(learning_rate=self.LR,
                                        beta1=adam_beta1, beta2=adam_beta2,
                                        epsilon=adam_eps)
  self.burn_in_trainer = tf.train.AdamOptimizer(
    learning_rate=self.LR, epsilon=1e-5)  # same as default and IL
  if use_mixed_precision:
    try:
      self.trainer = (
        tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
          self.trainer))
      self.burn_in_trainer = (
        tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
          self.burn_in_trainer))
    except Exception:
      logger.warn("using tf mixed_precision requires tf version>=1.15.")
  if has_hvd:
    self.trainer = hvd.DistributedOptimizer(
      self.trainer, sparse_as_dense=use_sparse_as_dense)
    self.burn_in_trainer = hvd.DistributedOptimizer(
      self.burn_in_trainer, sparse_as_dense=use_sparse_as_dense)

  grads_and_vars = self.trainer.compute_gradients(loss, self.params)
  grads_and_vars_vf = self.burn_in_trainer.compute_gradients(
    vf_loss, self.params_vf)
  # clip only the gradients of the LSTM variables by global norm
  clip_vars = model.vars.lstm_vars
  grads_and_vars, self.clip_grad_norm, self.nonclip_grad_norm = (
    self.clip_grads_vars(grads_and_vars, clip_vars, max_grad_norm))
  grads_and_vars_vf, self.clip_grad_norm_vf, self.nonclip_grad_norm_vf = (
    self.clip_grads_vars(grads_and_vars_vf, clip_vars, max_grad_norm))

  self._train_batch = self.trainer.apply_gradients(grads_and_vars)
  self._burn_in = self.burn_in_trainer.apply_gradients(grads_and_vars_vf)
  self.loss_endpoints_names = model.loss.loss_endpoints.keys()
  self._build_ops()

  if has_hvd:
    barrier_op = hvd.allreduce(tf.Variable(0.))
    broadcast_op = hvd.broadcast_global_variables(0)
  tf.global_variables_initializer().run(session=self.sess)
  self.sess.graph.finalize()

  self.barrier = lambda: self.sess.run(barrier_op) if has_hvd else None
  self.broadcast = lambda: self.sess.run(broadcast_op) if has_hvd else None
  self.broadcast()

  # logging stuff (same format strings on every rank)
  format_strs = ['stdout', 'log', 'tensorboard', 'csv']
  logger.configure(
    dir='training_log/{}rank{}'.format(self._learner_id, self.rank),
    format_strs=format_strs)
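# --- Illustrative sketch (not part of TLeague) -----------------------------
# clip_grads_vars (defined elsewhere in the learner) clips only the gradients
# of a designated variable subset (here the LSTM variables) by global norm and
# leaves the remaining gradients untouched. Below is a numpy sketch of that
# idea under the stated assumption about its behaviour; variable names are
# plain strings purely for illustration.
import numpy as np

def clip_grads_vars_sketch(grads_and_vars, clip_var_names, max_norm):
  """Clip only the gradients whose variable is in clip_var_names."""
  clip_gv = [(g, v) for g, v in grads_and_vars if v in clip_var_names]
  other_gv = [(g, v) for g, v in grads_and_vars if v not in clip_var_names]
  clip_norm = np.sqrt(sum(np.sum(g ** 2) for g, _ in clip_gv))
  nonclip_norm = np.sqrt(sum(np.sum(g ** 2) for g, _ in other_gv))
  if max_norm > 0 and clip_norm > max_norm:
    scale = max_norm / clip_norm            # rescale the clipped subset only
    clip_gv = [(g * scale, v) for g, v in clip_gv]
  return clip_gv + other_gv, clip_norm, nonclip_norm

gvs = [(np.ones(4) * 3.0, 'lstm/w'), (np.ones(4) * 0.1, 'fc/w')]
new_gvs, cn, ncn = clip_grads_vars_sketch(gvs, {'lstm/w'}, max_norm=0.5)
print(cn, ncn, new_gvs[0][0][:2])
# ---------------------------------------------------------------------------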
def __init__(self, ports, gpu_id, replay_filelist, batch_size,
             min_train_sample_num, min_val_sample_num, rm_size, learning_rate,
             print_interval, checkpoint_interval, num_val_batches,
             replay_converter_type, policy, policy_config,
             converter_config=None, policy_config_type=None,
             model_pool_addrs=None, rollout_length=1, checkpoints_dir=None,
             restore_checkpoint_path=None, train_generator_worker_num=4,
             val_generator_worker_num=2, pull_worker_num=2,
             num_sgd_updates=int(1e30), repeat_training_task=False,
             unroll_length=32, pub_interval=50, max_clip_grad_norm=1,
             after_loading_init_scope=None, use_mixed_precision=False,
             use_sparse_as_dense=False, enable_validation=True,
             post_process_data=None):
  assert len(ports) == 2
  self.use_hvd = has_hvd and hvd.size() > 1
  self.rank = 0 if not self.use_hvd else hvd.rank()
  self.model_key = 'IL-model'
  self.pub_interval = pub_interval
  self.rnn = (False if 'use_lstm' not in policy_config
              else policy_config['use_lstm'])
  self.hs_len = None
  # overwrite it using the batch_size for training
  policy_config['batch_size'] = batch_size
  if self.rnn:
    assert model_pool_addrs is not None
    self._model_pool_apis = ModelPoolAPIs(model_pool_addrs)
    self._model_pool_apis.check_server_set_up()
    policy_config['rollout_len'] = rollout_length
    # infer hidden state length (size)
    if 'hs_len' in policy_config:
      self.hs_len = policy_config['hs_len']
    elif 'nlstm' in policy_config:
      self.hs_len = 2 * policy_config['nlstm']
    else:
      self.hs_len = 128
  self.should_push_model = (self.rnn and self.rank == 0)
  use_gpu = (gpu_id >= 0)

  converter_config = {} if converter_config is None else converter_config
  train_replay_filelist, val_replay_filelist = _get_local_replays(
    replay_filelist)
  replay_converter = replay_converter_type(**converter_config)
  ob_space, ac_space = replay_converter.space.spaces
  if post_process_data is not None:
    ob_space, ac_space = post_process_data(ob_space, ac_space)
  self.data_pool = ImDataServer(
    ports=ports,
    train_replay_filelist=train_replay_filelist,
    val_replay_filelist=val_replay_filelist,
    batch_size=batch_size,
    min_train_sample_num=min_train_sample_num,
    min_val_sample_num=min_val_sample_num,
    ob_space=ob_space,
    ac_space=ac_space,
    train_generator_worker_num=train_generator_worker_num,
    val_generator_worker_num=val_generator_worker_num,
    pull_worker_num=pull_worker_num,
    rm_size=rm_size,
    repeat_training_task=repeat_training_task,
    unroll_length=unroll_length,
    rollout_length=rollout_length,
    lstm=self.rnn,
    hs_len=self.hs_len,
    use_gpu=use_gpu)
  self._enable_validation = enable_validation

  config = tf.ConfigProto(allow_soft_placement=True)
  if use_gpu:
    config.gpu_options.visible_device_list = str(gpu_id)
    config.gpu_options.allow_growth = True
  self._sess = tf.Session(config=config)

  net_config = policy_config_type(ob_space, ac_space, **policy_config)
  net_config_val = deepcopy(net_config)
  with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
    pass

  def create_policy(inputs, nc):
    return policy(inputs=inputs, nc=nc, scope=model_scope)

  if hasattr(net_config, 'endpoints_verbosity'):
    # intentionally disable endpoints during training
    net_config.endpoints_verbosity = 0

  device = '/gpu:0' if use_gpu else '/cpu:0'
  with tf.device(device):
    if 'use_xla' in policy_config and policy_config['use_xla']:
      try:
        # use tensorflow's accelerated linear algebra (XLA) compilation
        with tf.xla.experimental.jit_scope(True):
          model = create_policy(self.data_pool.train_batch_input, net_config)
      except Exception:
        logger.log("WARNING: using tf.xla requires tf version>=1.15.")
        model = create_policy(self.data_pool.train_batch_input, net_config)
    else:
      model = create_policy(self.data_pool.train_batch_input, net_config)
    model_val = create_policy(self.data_pool.val_batch_input, net_config_val)
    params = tf.trainable_variables(scope='model')
    param_norm = tf.global_norm(params)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       epsilon=1e-5)
    if use_mixed_precision:
      try:
        optimizer = (
          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer))
      except Exception:
        logger.warn("using tf mixed_precision requires tf version>=1.15.")
    if self.use_hvd:
      optimizer = hvd.DistributedOptimizer(
        optimizer, sparse_as_dense=use_sparse_as_dense)
      barrier_op = hvd.allreduce(tf.Variable(0.))
      self.barrier = lambda: self._sess.run(barrier_op)

    train_loss = tf.reduce_mean(model.loss.total_il_loss *
                                self.data_pool.train_batch_weight)
    val_loss = tf.reduce_mean(model_val.loss.total_il_loss *
                              self.data_pool.val_batch_weight)
    if hasattr(net_config, 'weight_decay') and not net_config.weight_decay:
      # weight_decay is None or 0.0: skip the regularization term
      total_loss = train_loss
    else:
      total_loss = train_loss + model.loss.total_reg_loss
    grads_and_vars = optimizer.compute_gradients(total_loss, params)
    # clip only the gradients of the LSTM variables by global norm
    clip_vars = model.vars.lstm_vars
    clip_grads = [grad for grad, var in grads_and_vars if var in clip_vars]
    nonclip_grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                              if var not in clip_vars]
    if max_clip_grad_norm > 0:
      clip_grads, clip_grad_norm = tf.clip_by_global_norm(
        clip_grads, max_clip_grad_norm)
    else:
      clip_grad_norm = tf.global_norm(clip_grads)
    clip_grads_and_var = list(zip(clip_grads, clip_vars))
    grads_and_vars = clip_grads_and_var + nonclip_grads_and_vars
    grad_norm = tf.global_norm(list(zip(*grads_and_vars))[0])
    train_op = optimizer.apply_gradients(grads_and_vars)

  tf.global_variables_initializer().run(session=self._sess)

  # placeholders and assign ops for injecting numpy weights into the graph
  self.new_params = [
    tf.placeholder(p.dtype, shape=p.get_shape()) for p in params
  ]
  self.param_assign_ops = [
    p.assign(new_p) for p, new_p in zip(params, self.new_params)
  ]
  opt_params = optimizer.variables()
  self.new_opt_params = [
    tf.placeholder(p.dtype, shape=p.get_shape()) for p in opt_params
  ]
  self.opt_param_assign_ops = [
    p.assign(new_p) for p, new_p in zip(opt_params, self.new_opt_params)
  ]

  def read_params():
    return self._sess.run(params)

  def read_opt_params():
    return self._sess.run(opt_params)

  def load_model(np_new_params):
    self._sess.run(
      self.param_assign_ops,
      feed_dict={p: np_p for p, np_p in zip(self.new_params, np_new_params)})

  def restore_optimizer(np_new_opt_params):
    self._sess.run(
      self.opt_param_assign_ops,
      feed_dict={p: np_p for p, np_p in
                 zip(self.new_opt_params, np_new_opt_params)})

  def _train_step():
    return self._sess.run([
      train_loss_aggregated, *train_other_losses_aggregated, grad_norm,
      clip_grad_norm, param_norm, train_op
    ], {})[:-1]

  def _val_step():
    # maximal_feat = [tf.reduce_max(tf.cast(x, tf.float32))
    #                 for x in self.data_pool.val_batch_input.X]
    # print(self._sess.run(maximal_feat, {}))
    return self._sess.run([
      val_loss_aggregated, *val_other_losses_aggregated,
      *endpoints_aggregated
    ], {})

  self._saver = ChkptsFromSelf(read_params, load_model, self.model_key)
  if restore_checkpoint_path is not None:
    self._saver._restore_model_checkpoint(restore_checkpoint_path)
    if after_loading_init_scope is not None:
      # re-initialize the variables under this scope after loading
      var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope=after_loading_init_scope)
      logger.log('perform after loading init for vars')
      for v in var_list:
        logger.log(v)
      tf.variables_initializer(var_list).run(session=self._sess)
  if self.use_hvd:
    hvd.broadcast_global_variables(0).run(session=self._sess)

  # aggregate metrics across workers; identity when running single-process
  _allreduce = lambda x: x if not self.use_hvd else hvd.allreduce(x)
  train_loss_aggregated = _allreduce(train_loss)
  train_other_loss_names = model.loss.loss_endpoints.keys()
  train_other_losses_aggregated = [
    _allreduce(tf.reduce_mean(l * self.data_pool.train_batch_weight))
    for l in model.loss.loss_endpoints.values()
  ]
  val_loss_aggregated = _allreduce(val_loss)
  val_other_loss_names = model_val.loss.loss_endpoints.keys()
  val_other_losses_aggregated = [
    _allreduce(tf.reduce_mean(l * self.data_pool.val_batch_weight))
    for l in model_val.loss.loss_endpoints.values()
  ]
  endpoints_names = model_val.endpoints.keys()
  endpoints_aggregated = [
    _allreduce(tf.reduce_mean(l)) for l in model_val.endpoints.values()
  ]
  self._sess.graph.finalize()

  self._total_samples = lambda: [
    self.data_pool._num_train_samples, self.data_pool._num_val_samples
  ]
  self._train_log_names = (['loss'] + list(train_other_loss_names) +
                           ['grad_norm', 'clip_grad_norm', 'param_norm'])
  self._val_log_names = (['loss'] + list(val_other_loss_names) +
                         list(endpoints_names))
  self._batch_size = batch_size
  self._train_step = _train_step
  self._val_step = _val_step
  self._print_interval = print_interval
  self._checkpoint_interval = checkpoint_interval
  self._num_val_batches = num_val_batches
  self._checkpoints_dir = checkpoints_dir if self.rank == 0 else None
  self._num_sgd_updates = num_sgd_updates
  self.load_model = load_model
  self.restore_optimizer = restore_optimizer
  self.read_params = read_params
  self.read_opt_params = read_opt_params

  format_strs = ['log', 'tensorboard', 'csv']
  logger.configure(dir='training_log/rank{}'.format(self.rank),
                   format_strs=['stdout'] + format_strs)
  with logger.scoped_configure(dir='validation_log/rank{}'.format(self.rank),
                               format_strs=['stderr'] + format_strs):
    self.val_logger = logger.Logger.CURRENT
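# --- Illustrative sketch (not part of TLeague) -----------------------------
# The placeholder-plus-assign pattern above is how the learner injects numpy
# weights back into an already-built graph without rebuilding it. A minimal
# stand-alone sketch under TensorFlow 1.x semantics (tf.compat.v1 with eager
# execution disabled); the variable and array contents are made up.
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

w = tf.get_variable('w_sketch', shape=[2, 2],
                    initializer=tf.zeros_initializer())
params = [w]
# one placeholder and one assign op per trainable variable
new_params = [tf.placeholder(p.dtype, shape=p.get_shape()) for p in params]
assign_ops = [p.assign(new_p) for p, new_p in zip(params, new_params)]

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  np_weights = [np.arange(4, dtype=np.float32).reshape(2, 2)]
  sess.run(assign_ops,
           feed_dict={ph: v for ph, v in zip(new_params, np_weights)})
  print(sess.run(params)[0])  # prints the injected weights
# ---------------------------------------------------------------------------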
def __init__(self, port, model_pool_addrs, mutable_hyperparam_type,
             hyperparam_config_name=None, restore_checkpoint_dir=None,
             save_checkpoint_root=None, save_interval_secs=3600,
             game_mgr_type='tleague.game_mgr.game_mgrs.RandomGameMgr',
             game_mgr_config=None, mute_actor_msg=False, verbose=0,
             init_model_paths=None, save_learner_meta=False):
  super(LeagueMgr, self).__init__(port, model_pool_addrs,
                                  restore_checkpoint_dir,
                                  save_checkpoint_root, save_interval_secs,
                                  mute_actor_msg, save_learner_meta,
                                  verbose=verbose)
  logger.set_level(verbose)
  logger.configure(dir='league_log/', format_strs=['stdout', 'log'])

  self._game_mgr_type = game_mgr_type
  game_mgr_cls = import_module_or_data(game_mgr_type)
  logger.log('__init__: game_mgr_type: {}'.format(game_mgr_type))
  game_mgr_config = game_mgr_config or {}
  game_mgr_config['pgn_file'] = (game_mgr_config.get('pgn_file', None)
                                 or 'example.pgn')
  game_mgr_config['verbose'] = (game_mgr_config.get('verbose', None)
                                or verbose)
  logger.log('__init__: game_mgr_config: {}'.format(game_mgr_config))
  self.game_mgr = game_mgr_cls(**game_mgr_config)

  logger.log('__init__: hyperparam_mgr: {}, hyperparam_config: {}'.format(
    mutable_hyperparam_type, hyperparam_config_name))
  self._hyper_mgr = HyperparamMgr(self._model_pool_apis,
                                  mutable_hyperparam_type,
                                  hyperparam_config_name)
  self.init_model_keys = []
  if init_model_paths is not None:
    assert isinstance(init_model_paths, list)
    logger.log('__init__: init_model from paths {}:'.format(init_model_paths))
    for idx, key_path in enumerate(init_model_paths):
      im_key, model_path = key_path
      with open(model_path, 'rb') as f:
        model = pickle.load(f)
      # normalize the key so that it carries a 'None:' parent prefix
      if not im_key.startswith('None:'):
        key = 'None:' + im_key
      else:
        key = im_key
      if hasattr(model, 'key'):
        logger.log('__init__: init_model key {} stored in its model '
                   'has been renamed as {}'.format(model.key, key))
      if hasattr(model, 'model'):
        model = model.model
      hyperparam = None
      # specify init_model's hyperparam if possible
      if 'lrn_id_list' in game_mgr_config:
        hyperparam = self._hyper_mgr._default_hyperparam(
          learner_id=game_mgr_config['lrn_id_list'][idx])
        logger.log('__init__: init model {} has been bound with '
                   'hyperparam {}'.format(key, hyperparam))
      t = time.strftime('%Y%m%d%H%M%S')
      self._model_pool_apis.push_model(model, hyperparam, key, t, t, t)
      logger.log('__init__: done pushing {} to model pool'.format(key))
      self.game_mgr.add_player(p=key, parent_p=None)
      logger.log('__init__: done adding player {} to game mgr'.format(key))
      self.init_model_keys.append(key)
  else:
    logger.log('__init__: init_model is None.')
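# --- Illustrative sketch (not part of TLeague) -----------------------------
# init_model_paths is a list of (key, path) pairs; keys get a 'None:' parent
# prefix if they do not already carry one. A tiny stand-alone illustration of
# that normalization; the keys and paths below are placeholders.
init_model_paths = [
  ('il-model-a', '/path/to/a.model'),
  ('None:il-model-b', '/path/to/b.model'),
]
for im_key, model_path in init_model_paths:
  key = im_key if im_key.startswith('None:') else 'None:' + im_key
  print(key, '<-', model_path)
# None:il-model-a <- /path/to/a.model
# None:il-model-b <- /path/to/b.model
# ---------------------------------------------------------------------------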