def vis_data(off_data, off_label, def_data, def_label, outdir, start_idx=0):
    """ Visualize the dataset of transitions.

    Args
    ----
    off_data : np.ndarray, transition states for the offense.
    off_label : np.ndarray, offensive action labels.
    def_data : np.ndarray, transition states for the defense.
    def_label : np.ndarray, defensive action labels.
    outdir : str, directory to store the rendered videos.
    start_idx : int, index of the first transition to visualize.
    """
    idx_ = start_idx
    init_pos = [
        np.array(off_data[idx_, 0, -1, 0, :]),
        np.array(off_data[idx_, 0, -1, 1:6, :], dtype=float),
        np.array(off_data[idx_, 0, -1, 6:11, :], dtype=float)
    ]
    env = gym.make('bball-pretrain-v0')
    env = BBallWrapper(env, if_clip=False, if_norm_obs=False, if_norm_act=False,
                       init_mode=2, if_vis_visual_aid=True,
                       if_vis_trajectory=False, init_positions=init_pos)
    env = gym.wrappers.Monitor(env, outdir, lambda unused_episode_number: True,
                               force=False, resume=True)
    obs = env.reset()
    while True:
        # copy to prevent modification of the dataset
        temp_off_label = np.array(off_label[idx_, 0])
        temp_def_label = np.array(def_label[idx_, 0])
        if idx_ == start_idx:
            # the env's velocities are zero right after reset, so add back the
            # last velocity from the data.
            last_vel = off_data[idx_, 0, -1, 1:6, :] - \
                off_data[idx_, 0, -2, 1:6, :]
            temp_off_label[5:] += last_vel.reshape([10, ])
            last_vel = def_data[idx_, 0, -1, 6:11, :] - \
                def_data[idx_, 0, -2, 6:11, :]
            temp_def_label += last_vel.reshape([10, ])
        # offense
        action = pack_action([temp_off_label[:3], temp_off_label[3:]],
                             team='offense')
        obs, _, done, _ = env.step(action)
        if done:
            env.close()
            break
        # defense
        action = pack_action(temp_def_label, team='defense')
        obs, _, done, _ = env.step(action)
        if done:
            env.close()
            break
        idx_ += 1
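

# A minimal usage sketch, not taken from the original pipeline: it assumes the
# transition states and pretrain labels have already been loaded as arrays in
# the layout vis_data() indexes above (entity 0 is the ball, 1:6 the offense,
# 6:11 the defense). The variable names below are placeholders.
def _example_vis_data(off_states, off_labels, def_states, def_labels):
    vis_data(off_data=off_states, off_label=off_labels,
             def_data=def_states, def_label=def_labels,
             outdir='data_vis/', start_idx=0)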
def collect_results(config, steps, ppo_policy, D, denormalize_observ,
                    generated_amount=100):
    """ Test the policy:
    - draw each episode into an mpeg video
    - collect the generated episodes into .npy files (for our customized player)

    Args
    ----
    config : object, providing configurations via attributes.
    steps : int, number of Discriminator iterations, used to name the output.
    ppo_policy : object, policy to generate actions.
    D : object, discriminator that judges how realistic the episodes are.
    denormalize_observ : function, denormalizes the returned observation.
    generated_amount : int, number of episodes to generate.
    """
    timer = time.time()
    # read condition length
    data_len = np.load('bball_strategies/data/FixedFPS5Length.npy')
    # data_len = np.load('bball_strategies/data/WGAN/all_model_results/length.npy')
    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(
        vanilla_env,
        data=h5py.File(
            'bball_strategies/data/OrderedGAILTransitionData_Testing.hdf5', 'r'),
        init_mode=1, fps=config.FPS, time_limit=np.max(data_len) - 2)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(config.logdir, 'collect_result/video/'),
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    total_output = []
    index_list = []
    for i in range(generated_amount):
        print('generating # {} episode'.format(i))
        numpy_collector = []
        act_collector = []
        vanilla_obs = vanilla_env.reset()
        for _ in range(vanilla_env.time_limit):
            vanilla_act = ppo_policy.act(
                np.array(vanilla_obs)[None, None], stochastic=False)
            act_collector.append(vanilla_act.reshape([5, 2]))
            vanilla_trans_act = [
                # Discrete(3) must be int
                int(0),
                # Box(2,)
                np.array([0.0, 0.0], dtype=np.float32),
                # Box(5, 2)
                np.zeros(shape=[5, 2], dtype=np.float32),
                # Box(5, 2)
                np.reshape(vanilla_act, [5, 2])
            ]
            vanilla_obs, _, _, info = vanilla_env.step(vanilla_trans_act)
            numpy_collector.append(vanilla_obs)
        index_list.append(info['data_idx'])
        numpy_collector = np.array(numpy_collector)
        act_collector = np.array(act_collector)
        numpy_collector = denormalize_observ(numpy_collector)
        total_output.append(numpy_collector)
    total_output = np.array(total_output)
    # save numpy
    np.save(os.path.join(config.logdir, 'collect_result/total_output.npy'),
            total_output)
    np.save(os.path.join(config.logdir, 'collect_result/total_output_length.npy'),
            data_len[index_list] - 2)
    print('collect_results time cost: {} per episode'.format(
        (time.time() - timer) / generated_amount))
    vanilla_env.close()
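

# A small sketch (not from the original repo) of reading the saved results
# back for inspection; the paths mirror the np.save() calls above, and the
# length array is assumed to give each generated episode's real length.
def _example_load_collected(logdir):
    total_output = np.load(
        os.path.join(logdir, 'collect_result/total_output.npy'))
    lengths = np.load(
        os.path.join(logdir, 'collect_result/total_output_length.npy'))
    for episode, length in zip(total_output, lengths):
        trimmed = episode[:int(length)]  # drop frames beyond the real length
        print(trimmed.shape)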
def vis_result(sess, model, off_data, off_label, def_data, def_label, outdir,
               num_video):
    """ Visualize the results by letting the pretrained model's output
    interact with the env.

    Args
    ----
    sess : tf.Session, session used to run the model.
    model : object, pretrained model whose perform() maps observations to actions.
    off_data : np.ndarray, transition states for the offense.
    off_label : np.ndarray, offensive action labels.
    def_data : np.ndarray, transition states for the defense.
    def_label : np.ndarray, defensive action labels.
    outdir : str, directory to store the rendered videos.
    num_video : int, number of episodes to render.
    """
    data_len = np.load('bball_strategies/data/FixedFPS5Length.npy')
    accumulator = 0
    for i, v in enumerate(data_len):
        data_len[i] += accumulator
        accumulator += v
    for i in range(num_video):
        start_idx = data_len[i]
        idx_ = start_idx
        init_pos = [
            np.array(off_data[idx_, 0, -1, 0, :]),
            np.array(off_data[idx_, 0, -1, 1:6, :], dtype=float),
            np.array(off_data[idx_, 0, -1, 6:11, :], dtype=float)
        ]
        env = gym.make('bball-pretrain-v0')
        env = BBallWrapper(env, if_clip=False, if_norm_obs=False,
                           if_norm_act=False, init_mode=2,
                           if_vis_visual_aid=True, if_vis_trajectory=False,
                           init_positions=init_pos)
        env = gym.wrappers.Monitor(env, outdir,
                                   lambda unused_episode_number: True,
                                   force=False, resume=True)
        obs = env.reset()
        while True:
            # copy to prevent modification of the dataset
            temp_off_label = np.array(off_label[idx_, 0])
            temp_def_label = np.array(def_label[idx_, 0])
            if idx_ == start_idx:
                # the env's velocities are zero right after reset, so add back
                # the last velocity from the data.
                last_vel = off_data[idx_, 0, -1, 1:6, :] - \
                    off_data[idx_, 0, -2, 1:6, :]
                temp_off_label[5:] += last_vel.reshape([10, ])
                last_vel = def_data[idx_, 0, -1, 6:11, :] - \
                    def_data[idx_, 0, -2, 6:11, :]
                temp_def_label += last_vel.reshape([10, ])
            if FLAGS.config == 'offense':
                # offense turn
                obs = norm_obs(env, obs)
                logits, actions = model.perform(sess, obs[None, None])
                actions = pack_action([logits[0, 0], actions[0, 0]],
                                      FLAGS.config)
                obs, _, done, _ = env.step(actions)
                if done:
                    env.close()
                    break
                # defense turn
                actions = pack_action(temp_def_label, team='defense')
                obs, _, done, _ = env.step(actions)
                if done:
                    env.close()
                    break
            elif FLAGS.config == 'defense':
                # offense turn
                actions = pack_action([temp_off_label[:3], temp_off_label[3:]],
                                      team='offense')
                obs, _, done, _ = env.step(actions)
                if done:
                    env.close()
                    break
                # defense turn
                obs = norm_obs(env, obs)
                actions = model.perform(sess, obs[None, None])
                actions = pack_action(actions, FLAGS.config)
                obs, _, done, _ = env.step(actions)
                if done:
                    env.close()
                    break
            idx_ += 1
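

# The length-accumulation loop at the top of vis_result() turns per-episode
# lengths into cumulative offsets, e.g. [3, 5, 4] -> [3, 8, 12], which is the
# same thing np.cumsum computes. A tiny self-contained check of that claim:
def _example_cumulative_lengths():
    data_len = np.array([3, 5, 4])
    accumulator = 0
    for i, v in enumerate(data_len):
        data_len[i] += accumulator
        accumulator += v
    assert np.array_equal(data_len, np.cumsum([3, 5, 4]))  # [3, 8, 12]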
def train(config, env_processes, outdir):
    """ Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, the graph,
    and the training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environments in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, init_mode=1, fps=config.FPS,
                               if_back_real=False, time_limit=50)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(
            config.logdir, 'gail_testing_{}/'.format(config.train_len)),
        if_back_real=False,
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    # if not os.path.exists(os.path.join(config.logdir, 'gail_testing')):
    #     os.makedirs(os.path.join(config.logdir, 'gail_testing'))
    vanilla_env.data = np.load('bball_strategies/data/GAILEnvData_51.npy')
    # env to generate fake states
    env = gym.make(config.env)
    env = BBallWrapper(env, init_mode=3, fps=config.FPS,
                       if_back_real=config.if_back_real,
                       time_limit=config.max_length)
    env = MonitorWrapper(
        env,
        directory=os.path.join(config.logdir, 'gail_training/'),
        if_back_real=config.if_back_real,
        # init from dataset in order
        init_mode=3)
    # Discriminator graph
    with tf.device('/gpu:0'):
        D = Discriminator(config, dummy_env)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir,
            is_gail=config.is_gail)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, env)
    # Data
    all_data = h5py.File(
        'bball_strategies/data/GAILTransitionData_{}.hdf5'.format(
            config.train_len), 'r')
    expert_data, valid_expert_data = np.split(
        all_data['OBS'].value, [all_data['OBS'].value.shape[0] * 9 // 10])
    expert_action, valid_expert_action = np.split(
        all_data['DEF_ACT'].value,
        [all_data['DEF_ACT'].value.shape[0] * 9 // 10])
    print('expert_data', expert_data.shape)
    print('valid_expert_data', valid_expert_data.shape)
    print('expert_action', expert_action.shape)
    print('valid_expert_action', valid_expert_action.shape)
    # TF Session
    # TODO _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0',
                 r'.*Adam.*', r'.*beta.*'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, resume=FLAGS.resume)
        # NOTE reset variables in optimizers
        D.reset_optimizer(sess)
        # reset PPO optimizer
        opt_reset = tf.group(
            [v.initializer for v in graph.algo._optimizer.variables()])
        sess.run(opt_reset)
        # visualization stuff
        if FLAGS.tally_only:
            tally_reward_line_chart(config, sess.run(D._global_steps),
                                    ppo_policy, D, denormalize_observ,
                                    normalize_observ, normalize_action)
            exit()
        # GAIL
        cumulate_steps = sess.run(graph.step)
        episode_idx = 0
        valid_episode_idx = 0
        while True:
            if episode_idx > (expert_data.shape[0] - config.episodes_per_batch
                              * config.train_d_per_ppo) or episode_idx == 0:
                episode_idx = 0
                perm_idx = np.random.permutation(expert_data.shape[0])
                expert_data = expert_data[perm_idx]
                expert_action = expert_action[perm_idx]
            if valid_episode_idx > (valid_expert_data.shape[0]
                                    - config.episodes_per_batch) or valid_episode_idx == 0:
                valid_episode_idx = 0
                valid_perm_idx = np.random.permutation(
                    valid_expert_data.shape[0])
                valid_expert_data = valid_expert_data[valid_perm_idx]
                valid_expert_action = valid_expert_action[valid_perm_idx]
            # testing
            if valid_episode_idx % (100 * config.episodes_per_batch) == 0:
                test_policy(config, vanilla_env, sess.run(D._global_steps),
                            ppo_policy, D, denormalize_observ)
            if valid_episode_idx % (1000 * config.episodes_per_batch) == 0:
                tally_reward_line_chart(config, sess.run(D._global_steps),
                                        ppo_policy, D, denormalize_observ,
                                        normalize_observ, normalize_action)
            # train Discriminator
            train_Discriminator(
                episode_idx, config, expert_data, expert_action, env,
                ppo_policy, D, normalize_observ, normalize_action)
            if valid_episode_idx % (1000 * config.episodes_per_batch) == 0:
                tally_reward_line_chart(config, sess.run(D._global_steps),
                                        ppo_policy, D, denormalize_observ,
                                        normalize_observ, normalize_action)
            # validate Discriminator
            valid_Discriminator(
                valid_episode_idx, config, valid_expert_data,
                valid_expert_action, env, ppo_policy, D, normalize_observ,
                normalize_action)
            episode_idx += config.episodes_per_batch * config.train_d_per_ppo
            valid_episode_idx += config.episodes_per_batch
            # train PPO
            print('train PPO')
            cumulate_steps += total_steps
            for score in loop.run(sess, saver, cumulate_steps):
                yield score
    batch_env.close()
    vanilla_env.close()
    env.close()
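

# train() is a generator: each pass of the loop trains the Discriminator on a
# batch of expert episodes and then runs the PPO loop, yielding evaluation
# scores. A minimal consumption sketch (the actual launch code is not shown in
# this file, so this is illustrative only):
def _example_run_training(config, outdir='render/'):
    for score in train(config, env_processes=True, outdir=outdir):
        tf.logging.info('Evaluation score: {}'.format(score))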
def train(config, env_processes, outdir):
    """ Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, the graph,
    and the training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environments in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        min_ = dummy_env.observation_space.low[0, 0]
        max_ = dummy_env.observation_space.high[0, 0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(
        vanilla_env,
        data=h5py.File(
            'bball_strategies/data/OrderedGAILTransitionData_522.hdf5', 'r'),
        init_mode=1, fps=config.FPS, time_limit=50)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(
            config.logdir,
            'gail_testing_G{}_D{}/'.format(config.max_length, config.D_len)),
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, dummy_env)
    # summary writer of the Discriminator
    summary_writer = tf.summary.FileWriter(config.logdir + '/Discriminator')
    # TF Session
    # NOTE: _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, checkpoint=FLAGS.checkpoint,
            resume=FLAGS.resume)
        # NOTE reset optimizer variables between stages of curriculum learning
        opt_reset_D = tf.group(
            [v.initializer for v in graph.algo.D.optimizer.variables()])
        # reset PPO optimizer
        opt_reset = tf.group(
            [v.initializer for v in graph.algo._optimizer.variables()])
        sess.run([opt_reset, opt_reset_D])
        # visualization stuff
        if FLAGS.tally_only:
            tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                    ppo_policy, graph.algo.D,
                                    normalize_observ, normalize_action)
            tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                    ppo_policy, graph.algo.D,
                                    normalize_observ, normalize_action,
                                    stochastic=True)
            exit()
        # GAIL
        cumulate_steps = sess.run(graph.step)
        counter = 0
        while True:
            # train Discriminator
            gail_timer = time.time()
            if counter > config.pretrain_d_times:
                num_d_to_train = config.train_d_per_ppo
            else:
                num_d_to_train = config.pretrain_d_per_ppo
            for _ in range(num_d_to_train):
                # train D
                feed_dict = {
                    graph.is_training: True,
                    graph.should_log: True,
                    graph.do_report: True,
                    graph.force_reset: False}
                gail_counter = 0
                while gail_counter < config.gail_steps:
                    gail_summary = sess.run(
                        graph.gail_summary, feed_dict=feed_dict)
                    if gail_summary:
                        summary_writer.add_summary(
                            gail_summary,
                            global_step=sess.run(graph.algo.D._steps))
                    gail_counter += 1
                # testing
                if counter % (config.vis_testing_freq) == 0:
                    test_policy(config, vanilla_env,
                                sess.run(graph.algo.D._steps), ppo_policy,
                                graph.algo.D, denormalize_observ)
                if counter % (config.tally_line_chart_freq) == 0:
                    tally_reward_line_chart(config,
                                            sess.run(graph.algo.D._steps),
                                            ppo_policy, graph.algo.D,
                                            normalize_observ, normalize_action)
                    tally_reward_line_chart(config,
                                            sess.run(graph.algo.D._steps),
                                            ppo_policy, graph.algo.D,
                                            normalize_observ, normalize_action,
                                            stochastic=True)
                counter += 1
            print('Time Cost of Discriminator per Update: {}'.format(
                (time.time() - gail_timer) / num_d_to_train))
            # train PPO
            cumulate_steps += total_steps
            for score in loop.run(sess, saver, cumulate_steps):
                yield score
    batch_env.close()
    vanilla_env.close()
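

# The normalize/denormalize helpers defined inside train() are plain min-max
# scaling into [-1, 1] and back. A self-contained round-trip check; the bounds
# here are made-up numbers, not the real observation_space limits:
def _example_normalization_round_trip():
    min_, max_ = 0.0, 94.0
    observ = np.array([0.0, 47.0, 94.0])
    normed = 2.0 * (observ - min_) / (max_ - min_) - 1.0    # -> [-1., 0., 1.]
    restored = (normed + 1.0) * (max_ - min_) / 2.0 + min_  # back to original
    assert np.allclose(restored, observ)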
def train(config, env_processes, outdir):
    """ Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, the graph,
    and the training loop. By default, assigns all operations to the CPU.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environments in external processes.
    outdir : Directory path to save rendering results while training.

    Yields
    ------
    score : Evaluation scores.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env for testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, init_mode=1, fps=config.FPS,
                               if_back_real=False, time_limit=50)
    vanilla_env = MonitorWrapper(
        vanilla_env,
        directory=os.path.join(
            config.logdir,
            'gail_testing_G{}_D{}/'.format(config.train_len, config.D_len)),
        if_back_real=False,
        video_callable=lambda _: True,
        # init from dataset
        init_mode=1)
    vanilla_env.data = np.load('bball_strategies/data/GAILEnvData_51.npy')
    # env to generate fake states
    env = gym.make(config.env)
    env = BBallWrapper(env, init_mode=3, fps=config.FPS,
                       if_back_real=config.if_back_real,
                       time_limit=config.max_length)
    env = MonitorWrapper(
        env,
        directory=os.path.join(config.logdir, 'gail_training/'),
        if_back_real=config.if_back_real,
        # init from dataset in order
        init_mode=3)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir,
            is_gail=config.is_gail)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # Agent to generate actions
    ppo_policy = PPOPolicy(config, env)
    # Data
    all_data = h5py.File(
        'bball_strategies/data/GAILTransitionData_{}.hdf5'.format(
            config.train_len), 'r')
    expert_data, valid_expert_data = np.split(
        all_data['OBS'].value, [all_data['OBS'].value.shape[0] * 9 // 10])
    expert_action, valid_expert_action = np.split(
        all_data['DEF_ACT'].value,
        [all_data['DEF_ACT'].value.shape[0] * 9 // 10])
    print('expert_data', expert_data.shape)
    print('valid_expert_data', valid_expert_data.shape)
    print('expert_action', expert_action.shape)
    print('valid_expert_action', valid_expert_action.shape)
    # Preprocessing / Normalization
    expert_data = normalize_observ(expert_data)
    valid_expert_data = normalize_observ(valid_expert_data)
    expert_action = normalize_action(expert_action)
    valid_expert_action = normalize_action(valid_expert_action)
    # summary writer of the Discriminator
    summary_writer = tf.summary.FileWriter(config.logdir + '/Discriminator')
    # TF Session
    # TODO _num_finished_episodes => Variable:0
    saver = utility.define_saver(
        exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0',
                 r'.*Adam.*', r'.*beta.*'))
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=config.log_device_placement)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(
            sess, saver, config.logdir, resume=FLAGS.resume)
        # NOTE reset variables in optimizer
        # opt_reset_D = tf.group(
        #     [v.initializer for v in graph.algo.D.optimizer.variables()])
        # # reset PPO optimizer
        # opt_reset = tf.group(
        #     [v.initializer for v in graph.algo._optimizer.variables()])
        # sess.run([opt_reset, opt_reset_D])
        # visualization stuff
        if FLAGS.tally_only:
            tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                    ppo_policy, graph.algo.D,
                                    denormalize_observ, normalize_observ,
                                    normalize_action)
            exit()
        # GAIL
        cumulate_steps = sess.run(graph.step)
        episode_idx = 0
        while True:
            if episode_idx > (expert_data.shape[0] - config.episodes_per_batch
                              * config.train_d_per_ppo) or episode_idx == 0:
                episode_idx = 0
                perm_idx = np.random.permutation(expert_data.shape[0])
                expert_data = expert_data[perm_idx]
                expert_action = expert_action[perm_idx]
            # testing
            if episode_idx % (config.train_d_per_ppo * 100
                              * config.episodes_per_batch) == 0:
                test_policy(config, vanilla_env,
                            sess.run(graph.algo.D._steps), ppo_policy,
                            graph.algo.D, denormalize_observ)
            if episode_idx % (config.train_d_per_ppo * 1000
                              * config.episodes_per_batch) == 0:
                tally_reward_line_chart(config, sess.run(graph.algo.D._steps),
                                        ppo_policy, graph.algo.D,
                                        denormalize_observ, normalize_observ,
                                        normalize_action)
            # train Discriminator
            gail_timer = time.time()
            for _ in range(config.train_d_per_ppo):
                if config.is_double_curiculum:
                    observ = expert_data[
                        episode_idx:episode_idx + config.episodes_per_batch, 1:]
                    action = expert_action[
                        episode_idx:episode_idx + config.episodes_per_batch, :-1]
                    if config.use_padding:
                        # 1. pad the episode with its buffer frames
                        buffer = observ[:, 0, :-1]
                        padded_observ = np.concatenate(
                            [buffer, observ[:, :, -1]], axis=1)
                        padded_act = np.concatenate(
                            [np.zeros(shape=[action.shape[0], 9, 5, 2]),
                             action], axis=1)
                        # 2. split the whole episode into Discriminator
                        #    training snippets of length config.D_len
                        training_obs = []
                        training_act = []
                        for i in range(config.max_length - config.D_len + 10):
                            training_obs.append(
                                padded_observ[:, i:i + config.D_len])
                            training_act.append(
                                padded_act[:, i:i + config.D_len])
                        training_obs = np.concatenate(training_obs, axis=0)
                        training_act = np.concatenate(training_act, axis=0)
                    else:
                        pass
                else:
                    training_obs = expert_data[
                        episode_idx:episode_idx + config.episodes_per_batch,
                        1:, -1]
                    training_act = expert_action[
                        episode_idx:episode_idx + config.episodes_per_batch,
                        :-1]
                feed_dict = {
                    graph.is_training: True,
                    graph.should_log: True,
                    graph.do_report: True,
                    graph.force_reset: False,
                    graph.algo.D._expert_s: training_obs,
                    graph.algo.D._expert_a: training_act}
                gail_counter = 0
                while gail_counter < config.gail_steps:
                    gail_summary = sess.run(
                        graph.gail_summary, feed_dict=feed_dict)
                    if gail_summary:
                        summary_writer.add_summary(
                            gail_summary,
                            global_step=sess.run(graph.algo.D._steps))
                    gail_counter += 1
                episode_idx += config.episodes_per_batch
            print('Time Cost of Discriminator per Update: {}'.format(
                (time.time() - gail_timer) / config.train_d_per_ppo))
            # train PPO
            cumulate_steps += total_steps
            for score in loop.run(sess, saver, cumulate_steps):
                yield score
    batch_env.close()
    vanilla_env.close()
    env.close()
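

# A toy illustration (made-up shapes, not the real config values) of the
# padding + sliding-window step above: a padded episode is cut into
# overlapping snippets of length D_len and the snippets are stacked along the
# batch axis before being fed to the Discriminator.
def _example_sliding_window(D_len=3):
    padded = np.arange(24).reshape([1, 12, 2])  # [batch, padded_length, feat]
    windows = [padded[:, i:i + D_len]
               for i in range(padded.shape[1] - D_len + 1)]
    training = np.concatenate(windows, axis=0)  # stack windows along batch
    print(training.shape)                       # (10, 3, 2)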