def get_demo(args):
    """Collect human demonstrations for args.gym_env and log the session."""
    if args.demo_memory_folder is not None:
        demo_memory_folder = 'collected_demo/{}'.format(
            args.demo_memory_folder)
    else:
        demo_memory_folder = 'collected_demo/{}'.format(
            args.gym_env.replace('-', '_'))

    if args.append_experiment_num is not None:
        demo_memory_folder += '_' + args.append_experiment_num

    if args.hostname is None:
        hostname = os.uname()[1]
    else:
        hostname = args.hostname

    prepare_dir(demo_memory_folder + '/log/{}'.format(hostname), empty=False)
    prepare_dir(demo_memory_folder + '/data/{}'.format(hostname), empty=False)

    episode_life = not args.not_episodic_life
    datetime_collected = datetime.today().strftime('%Y%m%d_%H%M%S')
    log_file = '{}.log'.format(datetime_collected)

    # route all demo-collection loggers to a per-run file
    fh = logging.FileHandler(
        '{}/log/{}/{}'.format(demo_memory_folder, hostname, log_file),
        mode='w')
    fh.setLevel(logging.DEBUG)
    formatter = LogFormatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logging.getLogger('collect_demo').addHandler(fh)
    logging.getLogger('game_state').addHandler(fh)
    logging.getLogger('replay_memory').addHandler(fh)
    logging.getLogger('atari_wrapper').addHandler(fh)

    game_state = GameState(env_id=args.gym_env, display=True,
                           human_demo=True, episode_life=episode_life)
    collect_demo = CollectDemonstration(
        game_state, 84, 84, 4, args.gym_env,
        folder=demo_memory_folder,
        create_movie=args.create_movie,
        hertz=args.hz,
        skip=args.skip)
    collect_demo.run_episodes(
        args.num_episodes,
        minutes_limit=args.demo_time_limit,
        demo_type=0,
        log_file=log_file,
        hostname=hostname)
    game_state.close()
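
# Hedged usage sketch (not part of the original module): `get_demo` reads the
# attributes of an argparse-style namespace. The names below mirror the fields
# accessed above; the specific default values are illustrative assumptions,
# not the project's canonical settings.
def _example_get_demo_usage():
    from argparse import Namespace
    demo_args = Namespace(
        gym_env='PongNoFrameskip-v4',
        demo_memory_folder=None,       # falls back to collected_demo/<env>
        append_experiment_num=None,
        hostname=None,                 # falls back to os.uname()[1]
        not_episodic_life=False,
        create_movie=False,
        hz=60.0,                       # assumed playback rate
        skip=4,                        # assumed frame skip
        num_episodes=5,
        demo_time_limit=20)            # minutes per episode
    get_demo(demo_args)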
def test_collect(env_id):
    """Quick manual test: collect a single 3-minute demo episode."""
    from common.game_state import GameState
    game_state = GameState(env_id=env_id, display=True, human_demo=True)

    test_folder = "demo_samples/{}_test".format(env_id.replace('-', '_'))
    prepare_dir(test_folder, empty=True)
    collect_demo = CollectDemonstration(
        game_state, 84, 84, 4, env_id,
        folder=test_folder, create_movie=True)
    num_episodes = 1
    collect_demo.run_episodes(
        num_episodes,
        minutes_limit=3,
        demo_type=0)
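
# Hedged sketch (assumption): `prepare_dir` is imported from the project's
# common utilities and is assumed to create a directory tree, optionally
# emptying it first, which is how every caller in this file uses it. The
# helper below only illustrates that assumed behaviour; it is not the
# project's implementation.
def _prepare_dir_sketch(path, empty=False):
    import os
    import shutil
    if empty and os.path.isdir(path):
        shutil.rmtree(path)            # wipe existing contents when requested
    os.makedirs(path, exist_ok=True)   # then (re)create the directory tree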
def run_a3c(args): """ python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> --load-pretrained-model --onevsall-mtl --pretrained-model-folder=<> --use-pretrained-model-as-advice --use-pretrained-model-as-reward-shaping """ from game_ac_network import GameACFFNetwork, GameACLSTMNetwork from a3c_training_thread import A3CTrainingThread if args.use_gpu: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices else: os.environ['CUDA_VISIBLE_DEVICES'] = '' import tensorflow as tf def log_uniform(lo, hi, rate): log_lo = math.log(lo) log_hi = math.log(hi) v = log_lo * (1 - rate) + log_hi * rate return math.exp(v) if not os.path.exists('results/a3c'): os.makedirs('results/a3c') if args.folder is not None: folder = 'results/a3c/{}_{}'.format(args.gym_env.replace('-', '_'), args.folder) else: folder = 'results/a3c/{}'.format(args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.use_lstm: end_str += '_lstm' if args.unclipped_reward: end_str += '_rawreward' elif args.log_scale_reward: end_str += '_logreward' if args.transformed_bellman: end_str += '_transformedbell' if args.use_transfer: end_str += '_transfer' if args.not_transfer_conv2: end_str += '_noconv2' elif args.not_transfer_conv3 and args.use_mnih_2015: end_str += '_noconv3' elif args.not_transfer_fc1: end_str += '_nofc1' elif args.not_transfer_fc2: end_str += '_nofc2' if args.finetune_upper_layers_only: end_str += '_tune_upperlayers' if args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0: end_str += '_pretrain_ina3c' if args.use_demo_threads: end_str += '_demothreads' if args.load_pretrained_model: if args.use_pretrained_model_as_advice: end_str += '_modelasadvice' if args.use_pretrained_model_as_reward_shaping: end_str += '_modelasshaping' folder += end_str if args.append_experiment_num is not None: folder += '_' + args.append_experiment_num if False: from common.util import LogFormatter fh = logging.FileHandler('{}/a3c.log'.format(folder), mode='w') fh.setLevel(logging.DEBUG) formatter = LogFormatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) demo_memory = None num_demos = 0 max_reward = 0. 
if args.load_memory or args.load_demo_cam: if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format( args.gym_env.replace('-', '_')) if args.load_memory: # FIXME: use new load_memory function demo_memory, actions_ctr, max_reward = load_memory( args.gym_env, demo_memory_folder, imgs_normalized=True) #, create_symmetry=True) action_freq = [ actions_ctr[a] for a in range(demo_memory[0].num_actions) ] num_demos = len(demo_memory) demo_memory_cam = None if args.load_demo_cam: demo_cam, _, total_rewards_cam, _ = load_memory( name=None, demo_memory_folder=demo_memory_folder, demo_ids=args.demo_cam_id, imgs_normalized=False) demo_cam = demo_cam[int(args.demo_cam_id)] demo_memory_cam = np.zeros((len(demo_cam), demo_cam.height, demo_cam.width, demo_cam.phi_length), dtype=np.float32) for i in range(len(demo_cam)): s0 = (demo_cam[i])[0] demo_memory_cam[i] = np.copy(s0) del demo_cam logger.info("loaded demo {} for testing CAM".format(args.demo_cam_id)) device = "/cpu:0" gpu_options = None if args.use_gpu: device = "/gpu:" + os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) initial_learning_rate = args.initial_learn_rate logger.info('Initial Learning Rate={}'.format(initial_learning_rate)) time.sleep(2) global_t = 0 pretrain_global_t = 0 pretrain_epoch = 0 rewards = {'train': {}, 'eval': {}} best_model_reward = -(sys.maxsize) stop_requested = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n game_state.close() del game_state.env del game_state config = tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) pretrained_model = None pretrained_model_sess = None if args.load_pretrained_model: if args.onevsall_mtl: from game_class_network import MTLBinaryClassNetwork as PretrainedModelNetwork elif args.onevsall_mtl_linear: from game_class_network import MTLMultivariateNetwork as PretrainedModelNetwork else: from game_class_network import MultiClassNetwork as PretrainedModelNetwork logger.error("Not supported yet!") assert False if args.pretrained_model_folder is not None: pretrained_model_folder = args.pretrained_model_folder else: pretrained_model_folder = '{}_classifier_use_mnih_onevsall_mtl'.format( args.gym_env.replace('-', '_')) PretrainedModelNetwork.use_mnih_2015 = args.use_mnih_2015 pretrained_model = PretrainedModelNetwork(action_size, -1, device) pretrained_model_sess = tf.Session(config=config, graph=pretrained_model.graph) pretrained_model.load( pretrained_model_sess, '{}/{}_checkpoint'.format(pretrained_model_folder, args.gym_env.replace('-', '_'))) if args.use_lstm: GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACLSTMNetwork(action_size, -1, device) else: GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork(action_size, -1, device) training_threads = [] learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer(learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) A3CTrainingThread.log_interval = args.log_interval A3CTrainingThread.performance_log_interval = args.performance_log_interval A3CTrainingThread.local_t_max = args.local_t_max A3CTrainingThread.demo_t_max = args.demo_t_max A3CTrainingThread.use_lstm = args.use_lstm A3CTrainingThread.action_size = action_size A3CTrainingThread.entropy_beta = args.entropy_beta 
A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta A3CTrainingThread.gamma = args.gamma A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015 A3CTrainingThread.env_id = args.gym_env A3CTrainingThread.finetune_upper_layers_only = args.finetune_upper_layers_only A3CTrainingThread.transformed_bellman = args.transformed_bellman A3CTrainingThread.clip_norm = args.grad_norm_clip A3CTrainingThread.use_grad_cam = args.use_grad_cam if args.unclipped_reward: A3CTrainingThread.reward_type = "RAW" elif args.log_scale_reward: A3CTrainingThread.reward_type = "LOG" else: A3CTrainingThread.reward_type = "CLIP" n_shapers = args.parallel_size #int(args.parallel_size * .25) mod = args.parallel_size // n_shapers for i in range(args.parallel_size): is_reward_shape = False is_advice = False if i % mod == 0: is_reward_shape = args.use_pretrained_model_as_reward_shaping is_advice = args.use_pretrained_model_as_advice training_thread = A3CTrainingThread( i, global_network, initial_learning_rate, learning_rate_input, grad_applier, args.max_time_step, device=device, pretrained_model=pretrained_model, pretrained_model_sess=pretrained_model_sess, advice=is_advice, reward_shaping=is_reward_shape) training_threads.append(training_thread) # prepare session sess = tf.Session(config=config) if args.use_transfer: if args.transfer_folder is not None: transfer_folder = args.transfer_folder else: transfer_folder = 'results/pretrain_models/{}'.format( args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' end_str += '_l2beta1E-04_batchprop' #TODO: make this an argument transfer_folder += end_str transfer_folder += '/transfer_model' if args.not_transfer_conv2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1 ] elif (args.not_transfer_conv3 and args.use_mnih_2015): transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2 ] elif args.not_transfer_fc1: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] elif args.not_transfer_fc2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1 ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] else: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, global_network.W_fc2, global_network.b_fc2 ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] global_network.load_transfer_model( sess, folder=transfer_folder, not_transfer_fc2=args.not_transfer_fc2, not_transfer_fc1=args.not_transfer_fc1, not_transfer_conv3=(args.not_transfer_conv3 and args.use_mnih_2015), not_transfer_conv2=args.not_transfer_conv2, var_list=transfer_var_list) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run( [tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [ v for (v, f) in zip(global_vars, is_not_initialized) if not f ] if len(not_initialized_vars): sess.run(tf.variables_initializer(not_initialized_vars)) if args.use_transfer: initialize_uninitialized(sess) else: sess.run(tf.global_variables_initializer()) # summary writer for tensorboard summary_op = 
tf.summary.merge_all() summary_writer = tf.summary.FileWriter( 'results/log/a3c/{}/'.format(args.gym_env.replace('-', '_')) + folder[12:], sess.graph) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) saver = tf.train.Saver(max_to_keep=6) best_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(folder) if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) # set wall time wall_t_fname = folder + '/' + 'wall_t.' + str(global_t) with open(wall_t_fname, 'r') as f: wall_t = float(f.read()) with open(folder + '/pretrain_global_t', 'r') as f: pretrain_global_t = int(f.read()) with open(folder + '/model_best/best_model_reward', 'r') as f_best_model_reward: best_model_reward = float(f_best_model_reward.read()) rewards = pickle.load( open( folder + '/' + args.gym_env.replace('-', '_') + '-a3c-rewards.pkl', 'rb')) else: logger.warning("Could not find old checkpoint") # set wall time wall_t = 0.0 prepare_dir(folder, empty=True) prepare_dir(folder + '/model_checkpoints', empty=True) prepare_dir(folder + '/model_best', empty=True) prepare_dir(folder + '/frames', empty=True) lock = threading.Lock() test_lock = False if global_t == 0: test_lock = True last_temp_global_t = global_t ispretrain_markers = [False] * args.parallel_size num_demo_thread = 0 ctr_demo_thread = 0 def train_function(parallel_index): nonlocal global_t, pretrain_global_t, pretrain_epoch, \ rewards, test_lock, lock, \ last_temp_global_t, ispretrain_markers, num_demo_thread, \ ctr_demo_thread training_thread = training_threads[parallel_index] training_thread.set_summary_writer(summary_writer) # set all threads as demo threads training_thread.is_demo_thread = args.load_memory and args.use_demo_threads if training_thread.is_demo_thread or args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs: training_thread.pretrain_init(demo_memory) if global_t == 0 and ( args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0) and parallel_index < 2: ispretrain_markers[parallel_index] = True training_thread.replay_mem_reset() # Pretraining with demo memory logger.info("t_idx={} pretrain starting".format(parallel_index)) while ispretrain_markers[parallel_index]: if stop_requested: return if pretrain_global_t > args.train_with_demo_num_steps and pretrain_epoch > args.train_with_demo_num_epochs: # At end of pretraining, reset state training_thread.replay_mem_reset() training_thread.episode_reward = 0 training_thread.local_t = 0 if args.use_lstm: training_thread.local_network.reset_state() ispretrain_markers[parallel_index] = False logger.info( "t_idx={} pretrain ended".format(parallel_index)) break diff_pretrain_global_t, _ = training_thread.demo_process( sess, pretrain_global_t) for _ in range(diff_pretrain_global_t): pretrain_global_t += 1 if pretrain_global_t % 10000 == 0: logger.debug( "pretrain_global_t={}".format(pretrain_global_t)) pretrain_epoch += 1 if pretrain_epoch % 1000 == 0: logger.debug("pretrain_epoch={}".format(pretrain_epoch)) # Waits for all threads to finish pretraining while not stop_requested and any(ispretrain_markers): time.sleep(0.01) # Evaluate model before training if not stop_requested and global_t == 0: with lock: if parallel_index == 0: test_reward, 
test_steps, test_episodes = training_threads[ 0].testing(sess, args.eval_max_steps, global_t, folder, demo_memory_cam=demo_memory_cam) rewards['eval'][global_t] = (test_reward, test_steps, test_episodes) saver.save( sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_')), global_step=global_t) save_best_model(test_reward) test_lock = False # all threads wait until evaluation finishes while not stop_requested and test_lock: time.sleep(0.01) # set start_time start_time = time.time() - wall_t training_thread.set_start_time(start_time) episode_end = True use_demo_thread = False while True: if stop_requested: return if global_t >= (args.max_time_step * args.max_time_step_fraction): return if args.use_demo_threads and global_t < args.max_steps_threads_as_demo and episode_end and num_demo_thread < 16: #if num_demo_thread < 2: demo_rate = 1.0 * (args.max_steps_threads_as_demo - global_t) / args.max_steps_threads_as_demo if demo_rate < 0.0333: demo_rate = 0.0333 if np.random.random() <= demo_rate and num_demo_thread < 16: ctr_demo_thread += 1 training_thread.replay_mem_reset(D_idx=ctr_demo_thread % num_demos) num_demo_thread += 1 logger.info( "idx={} as demo thread started ({}/16) rate={}".format( parallel_index, num_demo_thread, demo_rate)) use_demo_thread = True if use_demo_thread: diff_global_t, episode_end = training_thread.demo_process( sess, global_t) if episode_end: num_demo_thread -= 1 use_demo_thread = False logger.info("idx={} demo thread concluded ({}/16)".format( parallel_index, num_demo_thread)) else: diff_global_t, episode_end = training_thread.process( sess, global_t, rewards) for _ in range(diff_global_t): global_t += 1 if global_t % args.eval_freq == 0: temp_global_t = global_t lock.acquire() try: # catch multiple threads getting in at the same time if last_temp_global_t == temp_global_t: logger.info("Threading race problem averted!") continue test_lock = True test_reward, test_steps, n_episodes = training_thread.testing( sess, args.eval_max_steps, temp_global_t, folder, demo_memory_cam=demo_memory_cam) rewards['eval'][temp_global_t] = (test_reward, test_steps, n_episodes) if temp_global_t % ( (args.max_time_step * args.max_time_step_fraction) // 5) == 0: saver.save(sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format( args.gym_env.replace('-', '_')), global_step=temp_global_t, write_meta_graph=False) if test_reward > best_model_reward: save_best_model(test_reward) test_lock = False last_temp_global_t = temp_global_t finally: lock.release() if global_t % ( (args.max_time_step * args.max_time_step_fraction) // 5) == 0: saver.save( sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_')), global_step=global_t, write_meta_graph=False) # all threads wait until evaluation finishes while not stop_requested and test_lock: time.sleep(0.01) def signal_handler(signal, frame): nonlocal stop_requested logger.info('You pressed Ctrl+C!') stop_requested = True if stop_requested and global_t == 0: sys.exit(1) def save_best_model(test_reward): nonlocal best_model_reward best_model_reward = test_reward with open(folder + '/model_best/best_model_reward', 'w') as f_best_model_reward: f_best_model_reward.write(str(best_model_reward)) best_saver.save( sess, folder + '/model_best/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_'))) train_threads = [] for i in range(args.parallel_size): train_threads.append( threading.Thread(target=train_function, args=(i, ))) signal.signal(signal.SIGINT, signal_handler) 
    signal.signal(signal.SIGTERM, signal_handler)

    # set start time
    start_time = time.time() - wall_t

    for t in train_threads:
        t.start()

    print('Press Ctrl+C to stop')
    for t in train_threads:
        t.join()

    logger.info('Now saving data. Please wait')

    # write wall time
    wall_t = time.time() - start_time
    wall_t_fname = folder + '/' + 'wall_t.' + str(global_t)
    with open(wall_t_fname, 'w') as f:
        f.write(str(wall_t))
    with open(folder + '/pretrain_global_t', 'w') as f:
        f.write(str(pretrain_global_t))

    root_saver.save(
        sess,
        folder + '/{}_checkpoint_a3c'.format(args.gym_env.replace('-', '_')),
        global_step=global_t)

    pickle.dump(
        rewards,
        open(folder + '/' + args.gym_env.replace('-', '_')
             + '-a3c-rewards.pkl', 'wb'),
        pickle.HIGHEST_PROTOCOL)

    logger.info('Data saved!')
    sess.close()
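
# Hedged analysis sketch (not part of the original module): run_a3c pickles
# the `rewards` dict above as <env>-a3c-rewards.pkl, where
# rewards['eval'][global_t] == (test_reward, test_steps, test_episodes).
# A minimal way to reload it for offline inspection or plotting:
def _load_a3c_rewards(folder, gym_env):
    import pickle
    fname = '{}/{}-a3c-rewards.pkl'.format(
        folder, gym_env.replace('-', '_'))
    with open(fname, 'rb') as f:
        rewards = pickle.load(f)
    # sorted list of (global_t, (test_reward, test_steps, test_episodes))
    return sorted(rewards['eval'].items())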
def run_a3c_test(args): """Run A3C testing.""" GYM_ENV_NAME = args.gym_env.replace('-', '_') if args.use_gpu: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices else: os.environ['CUDA_VISIBLE_DEVICES'] = '' import tensorflow as tf if not os.path.exists('results/a3c'): os.makedirs('results/a3c') if args.folder is not None: folder = args.folder else: folder = 'results/a3c/{}'.format(GYM_ENV_NAME) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.use_lstm: end_str += '_lstm' if args.unclipped_reward: end_str += '_rawreward' elif args.log_scale_reward: end_str += '_logreward' if args.transformed_bellman: end_str += '_transformedbell' if args.use_transfer: end_str += '_transfer' if args.not_transfer_conv2: end_str += '_noconv2' elif args.not_transfer_conv3 and args.use_mnih_2015: end_str += '_noconv3' elif args.not_transfer_fc1: end_str += '_nofc1' elif args.not_transfer_fc2: end_str += '_nofc2' if args.finetune_upper_layers_only: end_str += '_tune_upperlayers' if args.train_with_demo_num_steps > 0 \ or args.train_with_demo_num_epochs > 0: end_str += '_pretrain_ina3c' if args.use_demo_threads: end_str += '_demothreads' if args.load_pretrained_model: if args.use_pretrained_model_as_advice: end_str += '_modelasadvice' if args.use_pretrained_model_as_reward_shaping: end_str += '_modelasshaping' if args.padding == 'SAME': end_str += '_same' folder += end_str folder = pathlib.Path(folder) demo_memory_cam = None demo_cam_human = False if args.load_demo_cam: if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format(GYM_ENV_NAME) demo_memory_folder = pathlib.Path(demo_memory_folder) if args.demo_cam_id is not None: demo_cam_human = True demo_cam, _, total_rewards_cam, _ = load_memory( name=None, demo_memory_folder=demo_memory_folder, demo_ids=args.demo_cam_id, imgs_normalized=False) demo_cam = demo_cam[int(args.demo_cam_id)] logger.info("loaded demo {} for testing CAM".format( args.demo_cam_id)) else: demo_cam_folder = pathlib.Path(args.demo_cam_folder) demo_cam = ReplayMemory() demo_cam.load(name='test_cam', folder=demo_cam_folder) logger.info("loaded demo {} for testing CAM".format( str(demo_cam_folder / 'test_cam'))) demo_memory_cam = np.zeros( (len(demo_cam), demo_cam.height, demo_cam.width, demo_cam.phi_length), dtype=np.float32) for i in range(len(demo_cam)): s0, _, _, _, _, _, t1, _ = demo_cam[i] demo_memory_cam[i] = np.copy(s0) del demo_cam device = "/cpu:0" gpu_options = None if args.use_gpu: device = "/gpu:"+os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) initial_learning_rate = args.initial_learn_rate logger.info('Initial Learning Rate={}'.format(initial_learning_rate)) time.sleep(2) global_t = 0 stop_requested = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n config = tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) input_shape = (84, 84, 4) if args.padding == 'VALID' else (88, 88, 4) if args.use_lstm: GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACLSTMNetwork(action_size, -1, device) else: GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork( action_size, -1, device, padding=args.padding, in_shape=input_shape) learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer( 
learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) A3CTrainingThread.log_interval = args.log_interval A3CTrainingThread.performance_log_interval = args.performance_log_interval A3CTrainingThread.local_t_max = args.local_t_max A3CTrainingThread.demo_t_max = args.demo_t_max A3CTrainingThread.use_lstm = args.use_lstm A3CTrainingThread.action_size = action_size A3CTrainingThread.entropy_beta = args.entropy_beta A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta A3CTrainingThread.gamma = args.gamma A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015 A3CTrainingThread.env_id = args.gym_env A3CTrainingThread.finetune_upper_layers_only = \ args.finetune_upper_layers_only A3CTrainingThread.transformed_bellman = args.transformed_bellman A3CTrainingThread.clip_norm = args.grad_norm_clip A3CTrainingThread.use_grad_cam = args.use_grad_cam if args.unclipped_reward: A3CTrainingThread.reward_type = "RAW" elif args.log_scale_reward: A3CTrainingThread.reward_type = "LOG" else: A3CTrainingThread.reward_type = "CLIP" if args.use_lstm: local_network = GameACLSTMNetwork(action_size, 0, device) else: local_network = GameACFFNetwork( action_size, 0, device, padding=args.padding, in_shape=input_shape) testing_thread = A3CTrainingThread( 0, global_network, local_network, initial_learning_rate, learning_rate_input, grad_applier, 0, device=device) # prepare session sess = tf.Session(config=config) if args.use_transfer: if args.transfer_folder is not None: transfer_folder = args.transfer_folder else: transfer_folder = 'results/pretrain_models/{}'.format(GYM_ENV_NAME) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' end_str += '_l2beta1E-04_batchprop' # TODO: make this an argument transfer_folder += end_str transfer_folder = pathlib.Path(transfer_folder) transfer_folder /= 'transfer_model' if args.not_transfer_conv2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, ] elif (args.not_transfer_conv3 and args.use_mnih_2015): transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] elif args.not_transfer_fc1: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] elif args.not_transfer_fc2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] else: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, global_network.W_fc2, global_network.b_fc2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] global_network.load_transfer_model( sess, folder=transfer_folder, not_transfer_fc2=args.not_transfer_fc2, not_transfer_fc1=args.not_transfer_fc1, not_transfer_conv3=(args.not_transfer_conv3 and args.use_mnih_2015), not_transfer_conv2=args.not_transfer_conv2, var_list=transfer_var_list, ) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run( [tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [ v for (v, f) in zip(global_vars, is_not_initialized) if not f] if len(not_initialized_vars): 
            sess.run(tf.variables_initializer(not_initialized_vars))

    if args.use_transfer:
        initialize_uninitialized(sess)
    else:
        sess.run(tf.global_variables_initializer())

    # init or load checkpoint with saver
    root_saver = tf.train.Saver(max_to_keep=1)
    checkpoint = tf.train.get_checkpoint_state(str(folder))
    if checkpoint and checkpoint.model_checkpoint_path:
        root_saver.restore(sess, checkpoint.model_checkpoint_path)
        logger.info("checkpoint loaded:{}".format(
            checkpoint.model_checkpoint_path))
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        global_t = int(tokens[-1])
        logger.info(">>> global step set: {}".format(global_t))
    else:
        logger.warning("Could not find old checkpoint")

    def test_function():
        nonlocal global_t

        if args.use_transfer:
            from_folder = str(transfer_folder).split('/')[-2]
        else:
            from_folder = str(folder).split('/')[-1]

        from_folder = pathlib.Path(from_folder)
        save_folder = 'results/test_model/a3c' / from_folder
        prepare_dir(str(save_folder), empty=False)
        prepare_dir(str(save_folder / 'frames'), empty=False)

        # Evaluate model before training
        if not stop_requested:
            testing_thread.testing_model(
                sess, args.eval_max_steps, global_t, save_folder,
                demo_memory_cam=demo_memory_cam,
                demo_cam_human=demo_cam_human)

    def signal_handler(signal, frame):
        nonlocal stop_requested
        logger.info('You pressed Ctrl+C!')
        stop_requested = True
        if stop_requested and global_t == 0:
            sys.exit(1)

    test_thread = threading.Thread(target=test_function, args=())

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    test_thread.start()
    print('Press Ctrl+C to stop')
    test_thread.join()

    sess.close()
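
# Hedged helper sketch (not part of the original module): run_a3c and
# run_a3c_test both recover the global step by splitting the checkpoint path
# on '-' and taking the last token, e.g.
# '.../PongNoFrameskip_v4_checkpoint-2500000' -> 2500000 (path illustrative).
def _global_t_from_checkpoint(model_checkpoint_path):
    tokens = model_checkpoint_path.split("-")
    return int(tokens[-1])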
def ae_classify_demo(args): """Use Autoencoder to learn features and classify demo.""" GYM_ENV_NAME = args.gym_env.replace('-', '_') if args.cpu_only: os.environ['CUDA_VISIBLE_DEVICES'] = '' else: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices import tensorflow as tf if args.cpu_only: device = "/cpu:0" gpu_options = None else: device = "/gpu:"+os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) config = tf.ConfigProto( gpu_options=gpu_options, allow_soft_placement=True, log_device_placement=False) if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format(GYM_ENV_NAME) demo_memory_folder = pathlib.Path(demo_memory_folder) args.use_mnih_2015 = True # ONLY supports this network if args.model_folder is not None: model_folder = '{}_{}'.format(GYM_ENV_NAME, args.model_folder) else: model_folder = 'results/pretrain_models/{}'.format(GYM_ENV_NAME) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.padding == 'SAME': end_str += '_same' if args.optimizer == 'adam': end_str += '_adam' if args.exclude_noop: end_str += '_exclude_noop' if args.exclude_num_demo_ep > 0: end_str += '_exclude{}demoeps'.format(args.exclude_num_demo_ep) if args.l2_beta > 0: end_str += '_l2beta{:.0E}'.format(args.l2_beta) if args.l1_beta > 0: end_str += '_l1beta{:.0E}'.format(args.l1_beta) if args.grad_norm_clip is not None: end_str += '_clipnorm{:.0E}'.format(args.grad_norm_clip) if args.sampling_type is not None: end_str += '_{}'.format(args.sampling_type) if args.sae_classify_demo: end_str += '_sae' args.ae_classify_demo = False else: end_str += '_ae' args.sae_classify_demo = False if args.use_slv: end_str += '_slv' if args.sl_loss_weight < 1: end_str += '_slweight{:.0E}'.format(args.sl_loss_weight) if args.use_denoising: end_str += '_noise{:.0E}'.format(args.noise_factor) if args.tied_weights: end_str += '_tied' if args.loss_function == 'bce': end_str += '_bce' else: end_str += '_mse' model_folder += end_str if args.append_experiment_num is not None: model_folder += '_' + args.append_experiment_num model_folder = pathlib.Path(model_folder) if not (model_folder / 'transfer_model').exists(): os.makedirs(str(model_folder / 'transfer_model')) os.makedirs(str(model_folder / 'transfer_model/all')) os.makedirs(str(model_folder / 'transfer_model/nofc2')) os.makedirs(str(model_folder / 'transfer_model/nofc1')) if args.use_mnih_2015: os.makedirs(str(model_folder / 'transfer_model/noconv3')) os.makedirs(str(model_folder / 'transfer_model/noconv2')) os.makedirs(str(model_folder / 'model_best')) fh = logging.FileHandler(str(model_folder / 'classify.log'), mode='w') fh.setLevel(logging.DEBUG) formatter = LogFormatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) logging.getLogger('atari_wrapper').addHandler(fh) logging.getLogger('network').addHandler(fh) logging.getLogger('deep_rl').addHandler(fh) logging.getLogger('replay_memory').addHandler(fh) game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n AutoEncoderNetwork.use_mnih_2015 = True # ONLY supports mnih_2015 AutoEncoderNetwork.l1_beta = args.l1_beta AutoEncoderNetwork.l2_beta = args.l2_beta AutoEncoderNetwork.use_gpu = not args.cpu_only network = AutoEncoderNetwork( action_size, -1, device, padding=args.padding, in_shape=(args.input_shape, args.input_shape, 4), sae=args.sae_classify_demo, 
tied_weights=args.tied_weights, use_denoising=args.use_denoising, noise_factor=args.noise_factor, loss_function=args.loss_function, use_slv=args.use_slv) logger.info("optimizer: {}".format( 'RMSPropOptimizer' if args.optimizer == 'rms' else 'AdamOptimizer')) logger.info("\tlearning_rate: {}".format(args.learn_rate)) logger.info("\tepsilon: {}".format(args.opt_epsilon)) if args.optimizer == 'rms': logger.info("\tdecay: {}".format(args.opt_alpha)) else: # Adam # Tensorflow defaults beta1 = 0.9 beta2 = 0.999 with tf.device(device): ae_opt = None if args.optimizer == 'rms': if args.ae_classify_demo: ae_opt = tf.train.RMSPropOptimizer( learning_rate=args.learn_rate, decay=args.opt_alpha, epsilon=args.opt_epsilon, ) opt = tf.train.RMSPropOptimizer( learning_rate=args.learn_rate, decay=args.opt_alpha, epsilon=args.opt_epsilon, ) else: # Adam if args.ae_classify_demo: ae_opt = tf.train.AdamOptimizer( learning_rate=args.learn_rate, beta1=beta1, beta2=beta2, epsilon=args.opt_epsilon, ) opt = tf.train.AdamOptimizer( learning_rate=args.learn_rate, beta1=beta1, beta2=beta2, epsilon=args.opt_epsilon, ) ae_classify_demo = AutoencoderClassifyDemo( tf, network, args.gym_env, int(args.train_max_steps), args.batch_size, ae_opt, opt, eval_freq=args.eval_freq, demo_memory_folder=demo_memory_folder, demo_ids=args.demo_ids, folder=model_folder, exclude_num_demo_ep=args.exclude_num_demo_ep, use_onevsall=args.onevsall_mtl, device=device, clip_norm=args.grad_norm_clip, game_state=game_state, sampling_type=args.sampling_type, sl_loss_weight=args.sl_loss_weight, reward_constant=args.reward_constant, ) # prepare session sess = tf.Session(config=config, graph=network.graph) with network.graph.as_default(): init = tf.global_variables_initializer() sess.run(init) summary_op = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(str(model_folder / 'log'), sess.graph) # init or load checkpoint with saver with network.graph.as_default(): saver = tf.train.Saver() best_saver = tf.train.Saver(max_to_keep=1) def signal_handler(signal, frame): nonlocal ae_classify_demo logger.info('You pressed Ctrl+C!') ae_classify_demo.stop_requested = True signal.signal(signal.SIGINT, signal_handler) print('Press Ctrl+C to stop') if args.ae_classify_demo: ae_classify_demo.train_autoencoder(sess, summary_op, summary_writer) # else: ae_classify_demo.train(sess, summary_op, summary_writer, best_saver=best_saver) logger.info('Now saving data. 
Please wait')
    saver.save(sess,
               str(model_folder / '{}_checkpoint'.format(GYM_ENV_NAME)))

    with network.graph.as_default():
        transfer_params = tf.get_collection("transfer_params")
        transfer_saver = tf.train.Saver(transfer_params)
        transfer_saver.save(
            sess, str(model_folder / 'transfer_model/all'
                      / '{}_transfer_params'.format(GYM_ENV_NAME)))

    # Remove fc2/fc3 weights
    for param in transfer_params[:]:
        name = param.op.name
        if name == "net_-1/fc2_weights" or name == "net_-1/fc2_biases":
            transfer_params.remove(param)
        elif name == "net_-1/fc3_weights" or name == "net_-1/fc3_biases":
            transfer_params.remove(param)

    with network.graph.as_default():
        transfer_saver = tf.train.Saver(transfer_params)
        transfer_saver.save(
            sess, str(model_folder / 'transfer_model/nofc2'
                      / '{}_transfer_params'.format(GYM_ENV_NAME)))

    # Remove fc1 weights
    for param in transfer_params[:]:
        name = param.op.name
        if name == "net_-1/fc1_weights" or name == "net_-1/fc1_biases":
            transfer_params.remove(param)

    with network.graph.as_default():
        transfer_saver = tf.train.Saver(transfer_params)
        transfer_saver.save(
            sess, str(model_folder / 'transfer_model/nofc1'
                      / '{}_transfer_params'.format(GYM_ENV_NAME)))

    # Remove conv3 weights
    if args.use_mnih_2015:
        for param in transfer_params[:]:
            name = param.op.name
            if name == "net_-1/conv3_weights" \
                    or name == "net_-1/conv3_biases":
                transfer_params.remove(param)

        with network.graph.as_default():
            transfer_saver = tf.train.Saver(transfer_params)
            transfer_saver.save(
                sess, str(model_folder / 'transfer_model/noconv3'
                          / '{}_transfer_params'.format(GYM_ENV_NAME)))

    # Remove conv2 weights
    for param in transfer_params[:]:
        name = param.op.name
        if name == "net_-1/conv2_weights" or name == "net_-1/conv2_biases":
            transfer_params.remove(param)

    with network.graph.as_default():
        transfer_saver = tf.train.Saver(transfer_params)
        transfer_saver.save(
            sess, str(model_folder / 'transfer_model/noconv2'
                      / '{}_transfer_params'.format(GYM_ENV_NAME)))

    # if args.sae_classify_demo:
    max_output_value_file = model_folder / 'transfer_model/max_output_value'
    with max_output_value_file.open('w') as f:
        f.write(str(ae_classify_demo.max_val))

    logger.info('Data saved!')
    sess.close()
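
# Hedged helper sketch (not part of the original module) mirroring the pruning
# loops above: each transfer_model subfolder ('all', 'nofc2', 'nofc1',
# 'noconv3', 'noconv2') is saved after dropping one more layer from the
# "transfer_params" collection. The filter below expresses that pattern for a
# given set of layer names (e.g. {'fc2', 'fc3'}), matching the
# "net_-1/<layer>_{weights,biases}" naming used above.
def _drop_layers(transfer_params, layer_names):
    kept = []
    for param in transfer_params:
        name = param.op.name
        if any('/{}_'.format(layer) in name for layer in layer_names):
            continue                   # drop this layer's weights/biases
        kept.append(param)
    return kept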
def run_a3c(args): """Run A3C experiment.""" GYM_ENV_NAME = args.gym_env.replace('-', '_') GAME_NAME = args.gym_env.replace('NoFrameskip-v4','') # setup folder name and path to folder folder = pathlib.Path(setup_folder(args, GYM_ENV_NAME)) # setup GPU (if applicable) import tensorflow as tf gpu_options = setup_gpu(tf, args.use_gpu, args.gpu_fraction) ###################################################### # setup default device device = "/cpu:0" global_t = 0 rewards = {'train': {}, 'eval': {}} best_model_reward = -(sys.maxsize) if args.load_pretrained_model: class_rewards = {'class_eval': {}} # setup logging info for analysis, see Section 4.2 of the paper sil_dict = { # count number of SIL updates "sil_ctr":{}, # total number of butter D sampled during SIL "sil_a3c_sampled":{}, # total number of buffer D samples (i.e., generated by A3C workers) used during SIL (i.e., passed max op) "sil_a3c_used":{}, # the return of used samples for buffer D "sil_a3c_used_return":{}, # total number of buffer R sampled during SIL "sil_rollout_sampled":{}, # total number of buffer R samples (i.e., generated by refresher worker) used during SIL (i.e., passed max op) "sil_rollout_used":{}, # the return of used samples for buffer R "sil_rollout_used_return":{}, # number of old samples still used (even after refreshing) "sil_old_used":{} } sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return = 0, 0, 0, 0 sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return = 0, 0, 0 sil_old_used = 0 rollout_dict = { # total number of rollout performed "rollout_ctr": {}, # total number of successful rollout (i.e., Gnew > G) "rollout_added_ctr":{}, # the return of Gnew "rollout_new_return":{}, # the return of G "rollout_old_return":{} } rollout_ctr, rollout_added_ctr = 0, 0 rollout_new_return = 0 # this records the total, avg = this / rollout_added_ctr rollout_old_return = 0 # this records the total, avg = this / rollout_added_ctr # setup file names reward_fname = folder / '{}-a3c-rewards.pkl'.format(GYM_ENV_NAME) sil_fname = folder / '{}-a3c-dict-sil.pkl'.format(GYM_ENV_NAME) rollout_fname = folder / '{}-a3c-dict-rollout.pkl'.format(GYM_ENV_NAME) if args.load_pretrained_model: class_reward_fname = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME) sharedmem_fname = folder / '{}-sharedmem.pkl'.format(GYM_ENV_NAME) sharedmem_params_fname = folder / '{}-sharedmem-params.pkl'.format(GYM_ENV_NAME) sharedmem_trees_fname = folder / '{}-sharedmem-trees.pkl'.format(GYM_ENV_NAME) rolloutmem_fname = folder / '{}-rolloutmem.pkl'.format(GYM_ENV_NAME) rolloutmem_params_fname = folder / '{}-rolloutmem-params.pkl'.format(GYM_ENV_NAME) rolloutmem_trees_fname = folder / '{}-rolloutmem-trees.pkl'.format(GYM_ENV_NAME) # for removing older ckpt, save mem space prev_ckpt_t = -1 stop_req = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n game_state.close() del game_state.env del game_state input_shape = (args.input_shape, args.input_shape, 4) ####################################################### # setup global A3C GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork( action_size, -1, device, padding=args.padding, in_shape=input_shape) logger.info('A3C Initial Learning Rate={}'.format(args.initial_learn_rate)) # setup pretrained model global_pretrained_model = None local_pretrained_model = None pretrain_graph = None # if use pretrained model to refresh # then must load pretrained model # otherwise, don't load model if args.use_lider and args.nstep_bc > 0: 
assert args.load_pretrained_model, "refreshing with other policies, must load a pre-trained model (TA or BC)" else: assert not args.load_pretrained_model, "refreshing with the current policy, don't load pre-trained models" if args.load_pretrained_model: pretrain_graph, global_pretrained_model = setup_pretrained_model(tf, args, action_size, input_shape, device="/gpu:0" if args.use_gpu else device) assert global_pretrained_model is not None assert pretrain_graph is not None time.sleep(2.0) # setup experience memory shared_memory = None # => this is BufferD rollout_buffer = None # => this is BufferR if args.use_sil: shared_memory = SILReplayMemory( action_size, max_len=args.memory_length, gamma=args.gamma, clip=False if args.unclipped_reward else True, height=input_shape[0], width=input_shape[1], phi_length=input_shape[2], priority=args.priority_memory, reward_constant=args.reward_constant) if args.use_lider and not args.onebuffer: rollout_buffer = SILReplayMemory( action_size, max_len=args.memory_length, gamma=args.gamma, clip=False if args.unclipped_reward else True, height=input_shape[0], width=input_shape[1], phi_length=input_shape[2], priority=args.priority_memory, reward_constant=args.reward_constant) # log memory information shared_memory.log() if args.use_lider and not args.onebuffer: rollout_buffer.log() ############## Setup Thread Workers BEGIN ################ # 17 total number of threads for all experiments assert args.parallel_size ==17, "use 17 workers for all experiments" startIndex = 0 all_workers = [] # a3c and sil learning rate and optimizer learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer( learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) setup_common_worker(CommonWorker, args, action_size) # setup SIL worker sil_worker = None if args.use_sil: _device = "/gpu:0" if args.use_gpu else device sil_network = GameACFFNetwork( action_size, startIndex, device=_device, padding=args.padding, in_shape=input_shape) sil_worker = SILTrainingThread(startIndex, global_network, sil_network, args.initial_learn_rate, learning_rate_input, grad_applier, device=_device, batch_size=args.batch_size, use_rollout=args.use_lider, one_buffer=args.onebuffer, sampleR=args.sampleR) all_workers.append(sil_worker) startIndex += 1 # setup refresh worker refresh_worker = None if args.use_lider: _device = "/gpu:0" if args.use_gpu else device refresh_network = GameACFFNetwork( action_size, startIndex, device=_device, padding=args.padding, in_shape=input_shape) refresh_local_pretrained_model = None # if refreshing with other polies if args.nstep_bc > 0: refresh_local_pretrained_model = PretrainedModelNetwork( pretrain_graph, action_size, startIndex, padding=args.padding, in_shape=input_shape, sae=False, tied_weights=False, use_denoising=False, noise_factor=0.3, loss_function='mse', use_slv=False, device=_device) refresh_worker = RefreshThread( thread_index=startIndex, action_size=action_size, env_id=args.gym_env, global_a3c=global_network, local_a3c=refresh_network, update_in_rollout=args.update_in_rollout, nstep_bc=args.nstep_bc, global_pretrained_model=global_pretrained_model, local_pretrained_model=refresh_local_pretrained_model, transformed_bellman = args.transformed_bellman, device=_device, entropy_beta=args.entropy_beta, clip_norm=args.grad_norm_clip, grad_applier=grad_applier, initial_learn_rate=args.initial_learn_rate, learning_rate_input=learning_rate_input) all_workers.append(refresh_worker) startIndex 
+= 1 # setup a3c workers setup_a3c_worker(A3CTrainingThread, args, startIndex) for i in range(startIndex, args.parallel_size): local_network = GameACFFNetwork( action_size, i, device="/cpu:0", padding=args.padding, in_shape=input_shape) a3c_worker = A3CTrainingThread( i, global_network, local_network, args.initial_learn_rate, learning_rate_input, grad_applier, device="/cpu:0", no_op_max=30) all_workers.append(a3c_worker) ############## Setup Thread Workers END ################ # setup config for tensorflow config = tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) # prepare sessions sess = tf.Session(config=config) pretrain_sess = None if global_pretrained_model: pretrain_sess = tf.Session(config=config, graph=pretrain_graph) # initial pretrained model if pretrain_sess: assert args.pretrained_model_folder is not None global_pretrained_model.load( pretrain_sess, args.pretrained_model_folder) sess.run(tf.global_variables_initializer()) if global_pretrained_model: initialize_uninitialized(tf, pretrain_sess, global_pretrained_model) if local_pretrained_model: initialize_uninitialized(tf, pretrain_sess, local_pretrained_model) # summary writer for tensorboard summ_file = args.save_to+'log/a3c/{}/'.format(GYM_ENV_NAME) + str(folder)[58:] # str(folder)[12:] summary_writer = tf.summary.FileWriter(summ_file, sess.graph) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) saver = tf.train.Saver(max_to_keep=1) best_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(str(folder)+'/model_checkpoints') if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) tmp_t = (global_t // args.eval_freq) * args.eval_freq logger.info(">>> tmp_t: {}".format(tmp_t)) # set wall time wall_t = 0. 
# set up reward files best_reward_file = folder / 'model_best/best_model_reward' with best_reward_file.open('r') as f: best_model_reward = float(f.read()) # restore rewards rewards = restore_dict(reward_fname, global_t) logger.info(">>> restored: rewards") # restore loggings sil_dict = restore_dict(sil_fname, global_t) sil_ctr = sil_dict['sil_ctr'][tmp_t] sil_a3c_sampled = sil_dict['sil_a3c_sampled'][tmp_t] sil_a3c_used = sil_dict['sil_a3c_used'][tmp_t] sil_a3c_used_return = sil_dict['sil_a3c_used_return'][tmp_t] sil_rollout_sampled = sil_dict['sil_rollout_sampled'][tmp_t] sil_rollout_used = sil_dict['sil_rollout_used'][tmp_t] sil_rollout_used_return = sil_dict['sil_rollout_used_return'][tmp_t] sil_old_used = sil_dict['sil_old_used'][tmp_t] logger.info(">>> restored: sil_dict") rollout_dict = restore_dict(rollout_fname, global_t) rollout_ctr = rollout_dict['rollout_ctr'][tmp_t] rollout_added_ctr = rollout_dict['rollout_added_ctr'][tmp_t] rollout_new_return = rollout_dict['rollout_new_return'][tmp_t] rollout_old_return = rollout_dict['rollout_old_return'][tmp_t] logger.info(">>> restored: rollout_dict") if args.load_pretrained_model: class_reward_file = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME) class_rewards = restore_dict(class_reward_file, global_t) # restore replay buffers (if saved) if args.checkpoint_buffer: # restore buffer D if args.use_sil and args.priority_memory: shared_memory = restore_buffer(sharedmem_fname, shared_memory, global_t) shared_memory = restore_buffer_trees(sharedmem_trees_fname, shared_memory, global_t) shared_memory = restore_buffer_params(sharedmem_params_fname, shared_memory, global_t) logger.info(">>> restored: shared_memory (Buffer D)") shared_memory.log() # restore buffer R if args.use_lider and not args.onebuffer: rollout_buffer = restore_buffer(rolloutmem_fname, rollout_buffer, global_t) rollout_buffer = restore_buffer_trees(rolloutmem_trees_fname, rollout_buffer, global_t) rollout_buffer = restore_buffer_params(rolloutmem_params_fname, rollout_buffer, global_t) logger.info(">>> restored: rollout_buffer (Buffer R)") rollout_buffer.log() # if all restores okay, remove old ckpt to save storage space prev_ckpt_t = global_t else: logger.warning("Could not find old checkpoint") wall_t = 0.0 prepare_dir(folder, empty=True) prepare_dir(folder / 'model_checkpoints', empty=True) prepare_dir(folder / 'model_best', empty=True) prepare_dir(folder / 'frames', empty=True) lock = threading.Lock() # next saving global_t def next_t(current_t, freq): return np.ceil((current_t + 0.00001) / freq) * freq next_global_t = next_t(global_t, args.eval_freq) next_save_t = next_t( global_t, args.eval_freq*args.checkpoint_freq) step_t = 0 def train_function(parallel_idx, th_ctr, ep_queue, net_updates): nonlocal global_t, step_t, rewards, class_rewards, lock, \ next_save_t, next_global_t, prev_ckpt_t nonlocal shared_memory, rollout_buffer nonlocal sil_dict, sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return, \ sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return, \ sil_old_used nonlocal rollout_dict, rollout_ctr, rollout_added_ctr, \ rollout_new_return, rollout_old_return parallel_worker = all_workers[parallel_idx] parallel_worker.set_summary_writer(summary_writer) with lock: # Evaluate model before training if not stop_req and global_t == 0 and step_t == 0: rewards['eval'][step_t] = parallel_worker.testing( sess, args.eval_max_steps, global_t, folder, worker=all_workers[-1]) # testing pretrained TA or BC in game if args.load_pretrained_model: assert 
pretrain_sess is not None assert global_pretrained_model is not None class_rewards['class_eval'][step_t] = \ parallel_worker.test_loaded_classifier(global_t=global_t, max_eps=50, # testing 50 episodes sess=pretrain_sess, worker=all_workers[-1], model=global_pretrained_model) # log pretrained model performance class_eval_file = pathlib.Path(args.pretrained_model_folder[:21]+\ str(GAME_NAME)+"/"+str(GAME_NAME)+'-model-eval.txt') class_std = np.std(class_rewards['class_eval'][step_t][-1]) class_mean = np.mean(class_rewards['class_eval'][step_t][-1]) with class_eval_file.open('w') as f: f.write("class_mean: \n" + str(class_mean) + "\n") f.write("class_std: \n" + str(class_std) + "\n") f.write("class_rewards: \n" + str(class_rewards['class_eval'][step_t][-1]) + "\n") checkpt_file = folder / 'model_checkpoints' checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) saver.save(sess, str(checkpt_file), global_step=global_t) save_best_model(rewards['eval'][global_t][0]) # saving worker info to dicts for analysis sil_dict['sil_ctr'][step_t] = sil_ctr sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled sil_dict['sil_a3c_used'][step_t] = sil_a3c_used sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled sil_dict['sil_rollout_used'][step_t] = sil_rollout_used sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return sil_dict['sil_old_used'][step_t] = sil_old_used rollout_dict['rollout_ctr'][step_t] = rollout_ctr rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr rollout_dict['rollout_new_return'][step_t] = rollout_new_return rollout_dict['rollout_old_return'][step_t] = rollout_old_return # dump pickle dump_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname], global_t) if args.load_pretrained_model: dump_pickle([class_rewards], [class_reward_fname], global_t) logger.info('Dump pickle at step {}'.format(global_t)) # save replay buffer (only works under priority mem) if args.checkpoint_buffer: if shared_memory is not None and args.priority_memory: params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority] trees = [shared_memory.buff._it_sum._value, shared_memory.buff._it_min._value] dump_pickle([shared_memory.buff._storage, params, trees], [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], global_t) logger.info('Saving shared_memory') if rollout_buffer is not None and args.priority_memory: params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority] trees = [rollout_buffer.buff._it_sum._value, rollout_buffer.buff._it_min._value] dump_pickle([rollout_buffer.buff._storage, params, trees], [rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], global_t) logger.info('Saving rollout_buffer') prev_ckpt_t = global_t step_t = 1 # set start_time start_time = time.time() - wall_t parallel_worker.set_start_time(start_time) if parallel_worker.is_sil_thread: sil_interval = 0 # bigger number => slower SIL updates m_repeat = 4 min_mem = args.batch_size * m_repeat sil_train_flag = len(shared_memory) >= min_mem while True: if stop_req: return if global_t >= (args.max_time_step * args.max_time_step_fraction): return if parallel_worker.is_sil_thread: # before sil starts, init local count local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 if net_updates.qsize() >= sil_interval \ and 
len(shared_memory) >= min_mem: sil_train_flag = True if sil_train_flag: sil_train_flag = False th_ctr.get() train_out = parallel_worker.sil_train( sess, global_t, shared_memory, m_repeat, rollout_buffer=rollout_buffer) local_sil_ctr, local_sil_a3c_sampled, local_sil_a3c_used, \ local_sil_a3c_used_return, \ local_sil_rollout_sampled, local_sil_rollout_used, \ local_sil_rollout_used_return, \ local_sil_old_used = train_out th_ctr.put(1) with net_updates.mutex: net_updates.queue.clear() if args.use_lider: parallel_worker.record_sil(sil_ctr=sil_ctr, total_used=(sil_a3c_used + sil_rollout_used), num_a3c_used=sil_a3c_used, a3c_used_return=sil_a3c_used_return/(sil_a3c_used+1),#add one in case divide by zero rollout_used=sil_rollout_used, rollout_used_return=sil_rollout_used_return/(sil_rollout_used+1), old_used=sil_old_used, global_t=global_t) if sil_ctr % 200 == 0 and sil_ctr > 0: rollout_buffsize = 0 if not args.onebuffer: rollout_buffsize = len(rollout_buffer) log_data = (sil_ctr, len(shared_memory), rollout_buffsize, sil_a3c_used+sil_rollout_used, args.batch_size*sil_ctr, sil_a3c_used, sil_a3c_used_return/(sil_a3c_used+1), sil_rollout_used, sil_rollout_used_return/(sil_rollout_used+1), sil_old_used) logger.info("SIL: sil_ctr={0:}" " sil_memory_size={1:}" " rollout_buffer_size={2:}" " total_sample_used={3:}/{4:}" " a3c_used={5:}" " a3c_used_return_avg={6:.2f}" " rollout_used={7:}" " rollout_used_return_avg={8:.2f}" " old_used={9:}".format(*log_data)) else: parallel_worker.record_sil(sil_ctr=sil_ctr, total_used=(sil_a3c_used + sil_rollout_used), num_a3c_used=sil_a3c_used, rollout_used=sil_rollout_used, global_t=global_t) if sil_ctr % 200 == 0 and sil_ctr > 0: log_data = (sil_ctr, sil_a3c_used+sil_rollout_used, args.batch_size*sil_ctr, sil_a3c_used, len(shared_memory)) logger.info("SIL: sil_ctr={0:}" " total_sample_used={1:}/{2:}" " a3c_used={3:}" " sil_memory_size={4:}".format(*log_data)) # Adding episodes to SIL memory is centralize to ensure # sampling and updating of priorities does not become a problem # since we add new episodes to SIL at once and during # SIL training it is guaranteed that SIL memory is untouched. 
max = args.parallel_size while not ep_queue.empty(): data = ep_queue.get() parallel_worker.episode.set_data(*data) shared_memory.extend(parallel_worker.episode) parallel_worker.episode.reset() max -= 1 if max <= 0: # This ensures that SIL has a chance to train break diff_global_t = 0 # centralized rollout counting local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 elif parallel_worker.is_refresh_thread: # before refresh starts, init local count diff_global_t = 0 local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 if len(shared_memory) >= 1: th_ctr.get() # randomly sample a state from buffer D sample = shared_memory.sample_one_random() # after sample, flip refreshed to True # TODO: fix this so that only *succesful* refresh is flipped to True # currently counting *all* refresh as True assert sample[-1] == True train_out = parallel_worker.rollout(sess, folder, pretrain_sess, global_t, sample, args.addall, args.max_ep_step, args.nstep_bc, args.update_in_rollout) diff_global_t, episode_end, part_end, local_rollout_ctr, \ local_rollout_added_ctr, add, local_rollout_new_return, \ local_rollout_old_return = train_out th_ctr.put(1) if rollout_ctr % 20 == 0 and rollout_ctr > 0: log_msg = "ROLLOUT: rollout_ctr={} added_rollout_ct={} worker={}".format( rollout_ctr, rollout_added_ctr, parallel_worker.thread_idx) logger.info(log_msg) logger.info("ROLLOUT Gnew: {}, G: {}".format(local_rollout_new_return, local_rollout_old_return)) # should always part_end, i.e., end of episode # and only add if new return is better (if not LiDER-AddAll) if part_end and add: if not args.onebuffer: # directly put into Buffer R rollout_buffer.extend(parallel_worker.episode) else: # Buffer D add sample is centralized when OneBuffer ep_queue.put(parallel_worker.episode.get_data()) parallel_worker.episode.reset() # centralized SIL counting local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 # a3c training thread worker else: th_ctr.get() train_out = parallel_worker.train(sess, global_t, rewards) diff_global_t, episode_end, part_end = train_out th_ctr.put(1) if args.use_sil: net_updates.put(1) if part_end: ep_queue.put(parallel_worker.episode.get_data()) parallel_worker.episode.reset() # centralized SIL counting local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 # centralized rollout counting local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 # ensure only one thread is updating global_t at a time with lock: global_t += diff_global_t # centralize increasing count for SIL and Rollout sil_ctr += local_sil_ctr sil_a3c_sampled += local_sil_a3c_sampled sil_a3c_used += local_sil_a3c_used sil_a3c_used_return += local_sil_a3c_used_return sil_rollout_sampled += local_sil_rollout_sampled sil_rollout_used += local_sil_rollout_used sil_rollout_used_return += local_sil_rollout_used_return sil_old_used += local_sil_old_used rollout_ctr += local_rollout_ctr rollout_added_ctr += local_rollout_added_ctr rollout_new_return += local_rollout_new_return rollout_old_return += local_rollout_old_return # if during a thread's update, global_t has reached a evaluation 
interval if global_t > next_global_t: next_global_t = next_t(global_t, args.eval_freq) step_t = int(next_global_t - args.eval_freq) # wait for all threads to be done before testing while not stop_req and th_ctr.qsize() < len(all_workers): time.sleep(0.001) step_t = int(next_global_t - args.eval_freq) # Evaluate for 125,000 steps rewards['eval'][step_t] = parallel_worker.testing( sess, args.eval_max_steps, step_t, folder, worker=all_workers[-1]) save_best_model(rewards['eval'][step_t][0]) last_reward = rewards['eval'][step_t][0] # saving worker info to dicts # SIL sil_dict['sil_ctr'][step_t] = sil_ctr sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled sil_dict['sil_a3c_used'][step_t] = sil_a3c_used sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled sil_dict['sil_rollout_used'][step_t] = sil_rollout_used sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return sil_dict['sil_old_used'][step_t] = sil_old_used # ROLLOUT rollout_dict['rollout_ctr'][step_t] = rollout_ctr rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr rollout_dict['rollout_new_return'][step_t] = rollout_new_return rollout_dict['rollout_old_return'][step_t] = rollout_old_return # save ckpt after done with eval if global_t > next_save_t: next_save_t = next_t(global_t, args.eval_freq*args.checkpoint_freq) # dump pickle dump_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname], global_t) if args.load_pretrained_model: dump_pickle([class_rewards], [class_reward_fname], global_t) logger.info('Dump pickle at step {}'.format(global_t)) # save replay buffer (only works for priority mem for now) if args.checkpoint_buffer: if shared_memory is not None and args.priority_memory: params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority] trees = [shared_memory.buff._it_sum._value, shared_memory.buff._it_min._value] dump_pickle([shared_memory.buff._storage, params, trees], [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], global_t) logger.info('Saved shared_memory') if rollout_buffer is not None and args.priority_memory: params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority] trees = [rollout_buffer.buff._it_sum._value, rollout_buffer.buff._it_min._value] dump_pickle([rollout_buffer.buff._storage, params, trees], [rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], global_t) logger.info('Saved rollout_buffer') # save a3c after saving buffer -- in case saving buffer OOM # so that at least we can revert back to the previous ckpt checkpt_file = folder / 'model_checkpoints' checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) saver.save(sess, str(checkpt_file), global_step=global_t, write_meta_graph=False) logger.info('Saved model ckpt') # if everything saves okay, clean up previous ckpt to save space remove_pickle([reward_fname, sil_fname, rollout_fname], prev_ckpt_t) if args.load_pretrained_model: remove_pickle([class_reward_fname], prev_ckpt_t) remove_pickle([sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], prev_ckpt_t) if rollout_buffer is not None and args.priority_memory: remove_pickle([rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], prev_ckpt_t) logger.info('Removed ckpt from step {}'.format(prev_ckpt_t)) prev_ckpt_t = global_t def signal_handler(signal, frame): nonlocal stop_req logger.info('You pressed Ctrl+C!') stop_req = True if stop_req and global_t == 0: sys.exit(1) def save_best_model(test_reward): 
nonlocal best_model_reward if test_reward > best_model_reward: best_model_reward = test_reward best_reward_file = folder / 'model_best/best_model_reward' with best_reward_file.open('w') as f: f.write(str(best_model_reward)) best_checkpt_file = folder / 'model_best' best_checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) best_saver.save(sess, str(best_checkpt_file)) train_threads = [] th_ctr = Queue() for i in range(args.parallel_size): th_ctr.put(1) episodes_queue = None net_updates = None if args.use_sil: episodes_queue = Queue() net_updates = Queue() for i in range(args.parallel_size): worker_thread = Thread( target=train_function, args=(i, th_ctr, episodes_queue, net_updates,)) train_threads.append(worker_thread) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) # set start time start_time = time.time() - wall_t for t in train_threads: t.start() print('Press Ctrl+C to stop') for t in train_threads: t.join() logger.info('Now saving data. Please wait') # write wall time wall_t = time.time() - start_time wall_t_fname = folder / 'wall_t.{}'.format(global_t) with wall_t_fname.open('w') as f: f.write(str(wall_t)) # save final model checkpoint_file = str(folder / '{}_checkpoint_a3c'.format(GYM_ENV_NAME)) root_saver.save(sess, checkpoint_file, global_step=global_t) dump_final_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname]) logger.info('Data saved!') # if everything saves okay & is done training (not because of pressed Ctrl+C), # clean up previous ckpt to save space if global_t >= (args.max_time_step * args.max_time_step_fraction): remove_pickle([reward_fname, sil_fname, rollout_fname], prev_ckpt_t) if args.load_pretrained_model: remove_pickle([class_reward_fname], prev_ckpt_t) remove_pickle([sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], prev_ckpt_t) if rollout_buffer is not None and args.priority_memory: remove_pickle([rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], prev_ckpt_t) logger.info('Done training, removed ckpt from step {}'.format(prev_ckpt_t)) sess.close() if pretrain_sess: pretrain_sess.close()
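# --- Illustrative sketch (not part of the original run_a3c): the th_ctr Queue above
# works as a counting semaphore. Every worker takes a token (get) before a SIL,
# rollout, or A3C update and returns it (put) afterwards, and the evaluation path
# simply waits until all tokens are back (qsize() == number of workers). The names
# below are hypothetical; they only mirror that pattern with the standard library.
import queue
import threading
import time


def _semaphore_pattern_demo(num_workers=4, steps=3):
    th_ctr = queue.Queue()
    for _ in range(num_workers):
        th_ctr.put(1)  # one token per worker

    def worker():
        for _ in range(steps):
            th_ctr.get()       # acquire a token before a "training step"
            time.sleep(0.001)  # stand-in for a gradient update
            th_ctr.put(1)      # release the token

    threads = [threading.Thread(target=worker) for _ in range(num_workers)]
    for t in threads:
        t.start()
    # Evaluation-style barrier: wait until every token has been returned.
    # (Like the loop in run_a3c, this is a best-effort barrier, not an exact one.)
    while th_ctr.qsize() < num_workers:
        time.sleep(0.001)
    for t in threads:
        t.join()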
def extract_layers(args): ''' python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --extract-transfer-layers --use-mnih-2015 ''' os.environ['CUDA_VISIBLE_DEVICES'] = '' import tensorflow as tf device = "/cpu:0" if args.model_folder is not None: model_folder = '{}_{}'.format(args.gym_env.replace('-', '_'), args.model_folder) else: model_folder = '{}_classifier'.format(args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_use_mnih' if args.use_lstm: end_str += '_use_lstm' model_folder += end_str logger.debug("Model folder:{}".format(model_folder)) if not os.path.exists(model_folder + '/transfer_model'): os.makedirs(model_folder + '/transfer_model') if not os.path.exists(model_folder + '/transfer_model/all'): os.makedirs(model_folder + '/transfer_model/all') if not os.path.exists(model_folder + '/transfer_model/nofc2'): os.makedirs(model_folder + '/transfer_model/nofc2') if not os.path.exists(model_folder + '/transfer_model/nofc1'): os.makedirs(model_folder + '/transfer_model/nofc1') if args.use_mnih_2015 and not os.path.exists(model_folder + '/transfer_model/noconv3'): os.makedirs(model_folder + '/transfer_model/noconv3') if not os.path.exists(model_folder + '/transfer_model/noconv2'): os.makedirs(model_folder + '/transfer_model/noconv2') game_state = GameState(env_id=args.gym_env) action_size = game_state.env.n_actions game_state.env.close() del game_state.env del game_state MultiClassNetwork.use_mnih_2015 = args.use_mnih_2015 network = MultiClassNetwork(action_size, -1, device) with tf.device(device): opt = tf.train.AdamOptimizer(learning_rate=0.0001, epsilon=0.001) # prepare session sess = tf.Session() init = tf.global_variables_initializer() sess.run(init) # init or load checkpoint with saver saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(model_folder) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) logger.info("Saving all layers...") transfer_params = tf.get_collection("transfer_params") transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save( sess, model_folder + '/transfer_model/all/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) logger.info("All layers saved") logger.info("Saving without fc2 layer...") # Remove fc2 weights for param in transfer_params[:]: logger.debug("\t{}".format(param.op.name)) if param.op.name == "net_-1/fc2_weights" or param.op.name == "net_-1/fc2_biases": transfer_params.remove(param) logger.debug("\t{} removed".format(param.op.name)) transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save( sess, model_folder + '/transfer_model/nofc2/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) logger.info("Without fc2 layer saved") logger.info("Saving without fc1 layer...") # Remove fc1 weights for param in transfer_params[:]: logger.debug("\t{}".format(param.op.name)) if param.op.name == "net_-1/fc1_weights" or param.op.name == "net_-1/fc1_biases": transfer_params.remove(param) logger.debug("\t{} removed".format(param.op.name)) transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save( sess, model_folder + '/transfer_model/nofc1/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) logger.info("Without fc1 layer saved") # Remove conv3 weights if args.use_mnih_2015: logger.info("Saving without conv3 layer...") for param in transfer_params[:]: logger.debug("\t{}".format(param.op.name)) if param.op.name == 
"net_-1/conv3_weights" or param.op.name == "net_-1/conv3_biases": transfer_params.remove(param) logger.debug("\t{} removed".format(param.op.name)) transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save( sess, model_folder + '/transfer_model/noconv3/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) logger.info("Without conv3 layer saved") logger.info("Saving without conv2 layer...") # Remove conv2 weights for param in transfer_params[:]: logger.debug("\t{}".format(param.op.name)) if param.op.name == "net_-1/conv2_weights" or param.op.name == "net_-1/conv2_biases": transfer_params.remove(param) logger.debug("\t{} removed".format(param.op.name)) transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save( sess, model_folder + '/transfer_model/noconv2/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) logger.info("Without conv2 layer saved") logger.info('Data saved!')
def classify_demo(args): ''' Multi-Class: python3 classification/run_experiment.py --gym-env=PongNoFrameskip-v4 --classify-demo --use-mnih-2015 --train-max-steps=150000 --batch_size=32 MTL One vs All: python3 classification/run_experiment.py --gym-env=PongNoFrameskip-v4 --classify-demo --onevsall-mtl --use-mnih-2015 --train-max-steps=150000 --batch_size=32 ''' if args.cpu_only: os.environ['CUDA_VISIBLE_DEVICES'] = '' else: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices import tensorflow as tf if args.cpu_only: device = "/cpu:0" gpu_options = None else: device = "/gpu:"+os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_fraction) config = tf.ConfigProto( gpu_options=gpu_options, allow_soft_placement=True, log_device_placement=False) if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format(args.gym_env.replace('-', '_')) if args.model_folder is not None: model_folder = '{}_{}'.format(args.gym_env.replace('-', '_'), args.model_folder) else: model_folder = 'results/pretrain_models/{}'.format(args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.onevsall_mtl: end_str += '_onevsall_mtl' if args.exclude_noop: end_str += '_exclude_noop' if args.exclude_num_demo_ep > 0: end_str += '_exclude{}demoeps'.format(args.exclude_num_demo_ep) if args.exclude_k_steps_bad_state > 0: end_str += '_exclude{}badstate'.format(args.exclude_k_steps_bad_state) if args.l2_beta > 0: end_str += '_l2beta{:.0E}'.format(args.l2_beta) if args.l1_beta > 0: end_str += '_l1beta{:.0E}'.format(args.l1_beta) if args.weighted_cross_entropy: end_str += '_weighted_loss' if args.use_batch_proportion: end_str += '_batchprop' model_folder += end_str if args.append_experiment_num is not None: model_folder += '_' + args.append_experiment_num if not os.path.exists(model_folder + '/transfer_model'): os.makedirs(model_folder + '/transfer_model') os.makedirs(model_folder + '/transfer_model/all') os.makedirs(model_folder + '/transfer_model/nofc2') os.makedirs(model_folder + '/transfer_model/nofc1') if args.use_mnih_2015: os.makedirs(model_folder + '/transfer_model/noconv3') os.makedirs(model_folder + '/transfer_model/noconv2') os.makedirs(model_folder + '/model_best') if True: from common.util import LogFormatter fh = logging.FileHandler('{}/classify.log'.format(model_folder), mode='w') fh.setLevel(logging.DEBUG) formatter = LogFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) logging.getLogger('atari_wrapper').addHandler(fh) logging.getLogger('network').addHandler(fh) logging.getLogger('deep_rl').addHandler(fh) logging.getLogger('replay_memory').addHandler(fh) game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n #game_state.env.close() #del game_state.env #del game_state if args.onevsall_mtl: from network import MTLBinaryClassNetwork MTLBinaryClassNetwork.use_mnih_2015 = args.use_mnih_2015 MTLBinaryClassNetwork.l1_beta = args.l1_beta MTLBinaryClassNetwork.l2_beta = args.l2_beta MTLBinaryClassNetwork.use_gpu = not args.cpu_only network = MTLBinaryClassNetwork(action_size, -1, device) else: from network import MultiClassNetwork MultiClassNetwork.use_mnih_2015 = args.use_mnih_2015 MultiClassNetwork.l1_beta = args.l1_beta MultiClassNetwork.l2_beta = args.l2_beta MultiClassNetwork.use_gpu = not args.cpu_only network = 
MultiClassNetwork(action_size, -1, device) logger.info("optimizer: RMSOptimizer") logger.info("\tlearning_rate: {}".format(args.learn_rate)) logger.info("\tdecay: {}".format(args.opt_alpha)) logger.info("\tepsilon: {}".format(args.opt_epsilon)) with tf.device(device): if args.onevsall_mtl: opt = [] for n_optimizer in range(action_size): opt.append(tf.train.RMSPropOptimizer( learning_rate=args.learn_rate, decay=args.opt_alpha, epsilon=args.opt_epsilon)) else: #opt = tf.train.AdamOptimizer(learning_rate=0.0001, epsilon=0.001) opt = tf.train.RMSPropOptimizer( learning_rate=args.learn_rate, decay=args.opt_alpha, epsilon=args.opt_epsilon) demo_ids = tuple(map(int, args.demo_ids.split(","))) classify_demo = ClassifyDemo( tf, network, args.gym_env, int(args.train_max_steps), args.batch_size, opt, eval_freq=args.eval_freq, demo_memory_folder=demo_memory_folder, demo_ids=demo_ids, folder=model_folder, exclude_num_demo_ep=args.exclude_num_demo_ep, use_onevsall=args.onevsall_mtl, weighted_cross_entropy=args.weighted_cross_entropy, device=device, clip_norm=args.grad_norm_clip, game_state=game_state, use_batch_proportion=args.use_batch_proportion) # prepare session sess = tf.Session(config=config, graph=network.graph) with network.graph.as_default(): init = tf.global_variables_initializer() sess.run(init) summary_op = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(model_folder + '/log', sess.graph) # init or load checkpoint with saver with network.graph.as_default(): saver = tf.train.Saver() best_saver = tf.train.Saver(max_to_keep=1) def signal_handler(signal, frame): nonlocal classify_demo logger.info('You pressed Ctrl+C!') classify_demo.stop_requested = True signal.signal(signal.SIGINT, signal_handler) print ('Press Ctrl+C to stop') if args.onevsall_mtl: classify_demo.train_onevsall(sess, summary_op, summary_writer, exclude_noop=args.exclude_noop, exclude_bad_state_k=args.exclude_k_steps_bad_state, best_saver=best_saver) else: classify_demo.train(sess, summary_op, summary_writer, exclude_bad_state_k=args.exclude_k_steps_bad_state, best_saver=best_saver) logger.info('Now saving data. 
Please wait') saver.save(sess, model_folder + '/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_'))) with network.graph.as_default(): transfer_params = tf.get_collection("transfer_params") transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save(sess, model_folder + '/transfer_model/all/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) # Remove fc2 weights for param in transfer_params[:]: if param.op.name == "net_-1/fc2_weights" or param.op.name == "net_-1/fc2_biases": transfer_params.remove(param) with network.graph.as_default(): transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save(sess, model_folder + '/transfer_model/nofc2/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) # Remove fc1 weights for param in transfer_params[:]: if param.op.name == "net_-1/fc1_weights" or param.op.name == "net_-1/fc1_biases": transfer_params.remove(param) with network.graph.as_default(): transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save(sess, model_folder + '/transfer_model/nofc1/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) # Remove conv3 weights if args.use_mnih_2015: for param in transfer_params[:]: if param.op.name == "net_-1/conv3_weights" or param.op.name == "net_-1/conv3_biases": transfer_params.remove(param) with network.graph.as_default(): transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save(sess, model_folder + '/transfer_model/noconv3/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) # Remove conv2 weights for param in transfer_params[:]: if param.op.name == "net_-1/conv2_weights" or param.op.name == "net_-1/conv2_biases": transfer_params.remove(param) with network.graph.as_default(): transfer_saver = tf.train.Saver(transfer_params) transfer_saver.save(sess, model_folder + '/transfer_model/noconv2/' + '{}_transfer_params'.format(args.gym_env.replace('-', '_'))) with open(model_folder + '/transfer_model/max_output_value', 'w') as f_max_value: f_max_value.write(str(classify_demo.max_val)) logger.info('Data saved!') sess.close()
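# --- Illustrative sketch (assumption, not from this repository): how the partial
# checkpoints written by extract_layers/classify_demo above might be restored in a
# downstream graph. It assumes the target network creates its variables under the
# same names and registers the transferable ones in the "transfer_params"
# collection, which is what the savers above are built from.
def load_transfer_params(sess, transfer_ckpt_path):
    """Restore only the variables registered in the 'transfer_params' collection."""
    import tensorflow as tf
    transfer_vars = tf.get_collection("transfer_params")
    restorer = tf.train.Saver(var_list=transfer_vars)
    restorer.restore(sess, transfer_ckpt_path)
    return transfer_vars
# e.g. load_transfer_params(sess, model_folder + '/transfer_model/nofc2/'
#                           + '{}_transfer_params'.format(args.gym_env.replace('-', '_')))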
class A3CTrainingThread(object): log_interval = 100 performance_log_interval = 1000 local_t_max = 20 demo_t_max = 20 use_lstm = False action_size = -1 entropy_beta = 0.01 demo_entropy_beta = 0.01 gamma = 0.99 use_mnih_2015 = False env_id = None reward_type = 'CLIP' # CLIP | LOG | RAW finetune_upper_layers_only = False shaping_reward = 0.001 shaping_factor = 1. shaping_gamma = 0.85 advice_confidence = 0.8 shaping_actions = -1 # -1 all actions, 0 exclude noop transformed_bellman = False clip_norm = 0.5 use_grad_cam = False def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device=None, pretrained_model=None, pretrained_model_sess=None, advice=False, reward_shaping=False): assert self.action_size != -1 self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.use_pretrained_model_as_advice = advice self.use_pretrained_model_as_reward_shaping = reward_shaping logger.info("thread_index: {}".format(self.thread_index)) logger.info("local_t_max: {}".format(self.local_t_max)) logger.info("use_lstm: {}".format( colored(self.use_lstm, "green" if self.use_lstm else "red"))) logger.info("action_size: {}".format(self.action_size)) logger.info("entropy_beta: {}".format(self.entropy_beta)) logger.info("gamma: {}".format(self.gamma)) logger.info("reward_type: {}".format(self.reward_type)) logger.info("finetune_upper_layers_only: {}".format( colored(self.finetune_upper_layers_only, "green" if self.finetune_upper_layers_only else "red"))) logger.info("use_pretrained_model_as_advice: {}".format( colored( self.use_pretrained_model_as_advice, "green" if self.use_pretrained_model_as_advice else "red"))) logger.info("use_pretrained_model_as_reward_shaping: {}".format( colored( self.use_pretrained_model_as_reward_shaping, "green" if self.use_pretrained_model_as_reward_shaping else "red"))) logger.info("transformed_bellman: {}".format( colored(self.transformed_bellman, "green" if self.transformed_bellman else "red"))) logger.info("clip_norm: {}".format(self.clip_norm)) logger.info("use_grad_cam: {}".format( colored(self.use_grad_cam, "green" if self.use_grad_cam else "red"))) if self.use_lstm: GameACLSTMNetwork.use_mnih_2015 = self.use_mnih_2015 self.local_network = GameACLSTMNetwork(self.action_size, thread_index, device) else: GameACFFNetwork.use_mnih_2015 = self.use_mnih_2015 self.local_network = GameACFFNetwork(self.action_size, thread_index, device) with tf.device(device): self.local_network.prepare_loss(entropy_beta=self.entropy_beta, critic_lr=0.5) local_vars = self.local_network.get_vars if self.finetune_upper_layers_only: local_vars = self.local_network.get_vars_upper var_refs = [v._ref() for v in local_vars()] self.gradients = tf.gradients(self.local_network.total_loss, var_refs) global_vars = global_network.get_vars if self.finetune_upper_layers_only: global_vars = global_network.get_vars_upper with tf.device(device): if self.clip_norm is not None: self.gradients, grad_norm = tf.clip_by_global_norm( self.gradients, self.clip_norm) self.gradients = list(zip(self.gradients, global_vars())) self.apply_gradients = grad_applier.apply_gradients(self.gradients) #self.apply_gradients = grad_applier.apply_gradients( # global_vars(), # self.gradients) self.sync = self.local_network.sync_from( global_network, upper_layers_only=self.finetune_upper_layers_only) self.game_state = GameState(env_id=self.env_id, display=False, no_op_max=30, human_demo=False,
episode_life=True) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 self.episode_steps = 0 # variable controlling log output self.prev_local_t = 0 self.is_demo_thread = False with tf.device(device): if self.use_grad_cam: self.action_meaning = self.game_state.env.unwrapped.get_action_meanings( ) self.local_network.build_grad_cam_grads() self.pretrained_model = pretrained_model self.pretrained_model_sess = pretrained_model_sess self.psi = 0.9 if self.use_pretrained_model_as_advice else 0.0 self.advice_ctr = 0 self.shaping_ctr = 0 self.last_rho = 0. if self.use_pretrained_model_as_advice or self.use_pretrained_model_as_reward_shaping: assert self.pretrained_model is not None def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, logits): """sample() in https://github.com/ppyht2/tf-a2c/blob/master/src/policy.py""" noise = np.random.uniform(0, 1, np.shape(logits)) return np.argmax(logits - np.log(-np.log(noise))) def choose_action_with_high_confidence(self, pi_values, exclude_noop=True): actions_confidence = [] # exclude NOOP action for action in range(1 if exclude_noop else 0, self.action_size): actions_confidence.append(pi_values[action][0][0]) max_confidence_action = np.argmax(actions_confidence) confidence = actions_confidence[max_confidence_action] return (max_confidence_action + (1 if exclude_noop else 0)), confidence def set_summary_writer(self, writer): self.writer = writer def record_summary(self, score=0, steps=0, episodes=None, global_t=0, mode='Test'): summary = tf.Summary() summary.value.add(tag='{}/score'.format(mode), simple_value=float(score)) summary.value.add(tag='{}/steps'.format(mode), simple_value=float(steps)) if episodes is not None: summary.value.add(tag='{}/episodes'.format(mode), simple_value=float(episodes)) self.writer.add_summary(summary, global_t) self.writer.flush() def set_start_time(self, start_time): self.start_time = start_time def generate_cam(self, sess, test_cam_si, global_t): cam_side_img = [] for i in range(len(test_cam_si)): # get max action per demo state readout_t = self.local_network.run_policy(sess, test_cam_si[i]) action = np.argmax(readout_t) # convert action to one-hot vector action_onehot = [0.] * self.game_state.env.action_space.n action_onehot[action] = 1. 
# compute grad cam for conv layer 3 activations, gradients = self.local_network.evaluate_grad_cam( sess, test_cam_si[i], action_onehot) cam = grad_cam(activations, gradients) cam_img = visualize_cam(cam) side_by_side = generate_image_for_cam_video( test_cam_si[i], cam_img, global_t, i, self.action_meaning[action]) cam_side_img.append(side_by_side) return cam_side_img def generate_cam_video(self, sess, time_per_step, global_t, folder, demo_memory_cam, demo_cam_human=False): # use one demonstration data to record cam # only need to make movie for demo data once cam_side_img = self.generate_cam(sess, demo_memory_cam, global_t) path = '/frames/demo-cam_side_img' if demo_cam_human: path += '_human' make_movie(cam_side_img, folder + '{}{ep:010d}'.format(path, ep=(global_t)), duration=len(cam_side_img) * time_per_step, true_image=True, salience=False) del cam_side_img def testing_model(self, sess, max_steps, global_t, folder, demo_memory_cam=None, demo_cam_human=False): logger.info("Testing model at global_t={}...".format(global_t)) # copy weights from shared to local sess.run(self.sync) if demo_memory_cam is not None: self.generate_cam_video(sess, 0.03, global_t, folder, demo_memory_cam, demo_cam_human) return else: self.game_state.reset(hard_reset=True) max_steps += 4 test_memory = ReplayMemory( 84, 84, np.random.RandomState(), max_steps=max_steps, phi_length=4, num_actions=self.game_state.env.action_space.n, wrap_memory=False, full_state_size=self.game_state.clone_full_state().shape[0]) for _ in range(4): test_memory.add(self.game_state.x_t, 0, self.game_state.reward, self.game_state.terminal, self.game_state.lives, fullstate=self.game_state.full_state) episode_buffer = [] test_memory_cam = [] total_reward = 0 total_steps = 0 episode_reward = 0 episode_steps = 0 n_episodes = 0 terminal = False while True: #pi_ = self.local_network.run_policy(sess, self.game_state.s_t) test_memory_cam.append(self.game_state.s_t) episode_buffer.append(self.game_state.get_screen_rgb()) pi_, value_, logits_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) #action = self.choose_action(logits_) action = np.argmax(pi_) # take action self.game_state.step(action) terminal = self.game_state.terminal memory_full = episode_steps == max_steps - 5 terminal_ = terminal or memory_full # store the transition to replay memory test_memory.add(self.game_state.x_t1, action, self.game_state.reward, terminal_, self.game_state.lives, fullstate=self.game_state.full_state1) # update the old values episode_reward += self.game_state.reward episode_steps += 1 # s_t = s_t1 self.game_state.update() if terminal_: if get_wrapper_by_name( self.game_state.env, 'EpisodicLifeEnv').was_real_done or memory_full: time_per_step = 0.03 images = np.array(episode_buffer) make_movie(images, folder + '/frames/image{ep:010d}'.format(ep=global_t), duration=len(images) * time_per_step, true_image=True, salience=False) break self.game_state.reset(hard_reset=False) if self.use_lstm: self.local_network.reset_state() total_reward = episode_reward total_steps = episode_steps log_data = (global_t, self.thread_index, total_reward, total_steps) logger.info( "test: global_t={} worker={} final score={} final steps={}".format( *log_data)) self.generate_cam_video(sess, 0.03, global_t, folder, np.array(test_memory_cam)) test_memory.save(name='test_cam', folder=folder, resize=True) if self.use_lstm: self.local_network.reset_state() return def testing(self, sess, max_steps, global_t, folder, demo_memory_cam=None): logger.info("Evaluate policy at 
global_t={}...".format(global_t)) # copy weights from shared to local sess.run(self.sync) if demo_memory_cam is not None and global_t % 5000000 == 0: self.generate_cam_video(sess, 0.03, global_t, folder, demo_memory_cam) episode_buffer = [] self.game_state.reset(hard_reset=True) episode_buffer.append(self.game_state.get_screen_rgb()) total_reward = 0 total_steps = 0 episode_reward = 0 episode_steps = 0 n_episodes = 0 while max_steps > 0: #pi_ = self.local_network.run_policy(sess, self.game_state.s_t) pi_, value_, logits_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) if False: action = np.random.choice(range(self.action_size), p=pi_) else: action = self.choose_action(logits_) if self.use_pretrained_model_as_advice: psi = self.psi if self.psi > 0.001 else 0.0 if psi > np.random.rand(): model_pi = self.pretrained_model.run_policy( self.pretrained_model_sess, self.game_state.s_t) model_action, confidence = self.choose_action_with_high_confidence( model_pi, exclude_noop=False) if model_action > self.shaping_actions and confidence >= self.advice_confidence: action = model_action # take action self.game_state.step(action) terminal = self.game_state.terminal if n_episodes == 0 and global_t % 5000000 == 0: episode_buffer.append(self.game_state.get_screen_rgb()) episode_reward += self.game_state.reward episode_steps += 1 max_steps -= 1 # s_t = s_t1 self.game_state.update() if terminal: if get_wrapper_by_name(self.game_state.env, 'EpisodicLifeEnv').was_real_done: if n_episodes == 0 and global_t % 5000000 == 0: time_per_step = 0.0167 images = np.array(episode_buffer) make_movie( images, folder + '/frames/image{ep:010d}'.format(ep=global_t), duration=len(images) * time_per_step, true_image=True, salience=False) episode_buffer = [] n_episodes += 1 score_str = colored("score={}".format(episode_reward), "magenta") steps_str = colored("steps={}".format(episode_steps), "blue") log_data = (global_t, self.thread_index, n_episodes, score_str, steps_str, total_steps) logger.debug( "test: global_t={} worker={} trial={} {} {} total_steps={}" .format(*log_data)) total_reward += episode_reward total_steps += episode_steps episode_reward = 0 episode_steps = 0 self.game_state.reset(hard_reset=False) if self.use_lstm: self.local_network.reset_state() if n_episodes == 0: total_reward = episode_reward total_steps = episode_steps else: # (timestep, total sum of rewards, total # of steps before terminating) total_reward = total_reward / n_episodes total_steps = total_steps // n_episodes log_data = (global_t, self.thread_index, total_reward, total_steps, n_episodes) logger.info( "test: global_t={} worker={} final score={} final steps={} # trials={}" .format(*log_data)) self.record_summary(score=total_reward, steps=total_steps, episodes=n_episodes, global_t=global_t, mode='Test') # reset variables used in training self.episode_reward = 0 self.episode_steps = 0 self.game_state.reset(hard_reset=True) self.last_rho = 0. 
if self.is_demo_thread: self.replay_mem_reset() if self.use_lstm: self.local_network.reset_state() return total_reward, total_steps, n_episodes def pretrain_init(self, demo_memory): self.demo_memory_size = len(demo_memory) self.demo_memory = demo_memory self.replay_mem_reset() def replay_mem_reset(self, demo_memory_idx=None): if demo_memory_idx is not None: self.demo_memory_idx = demo_memory_idx else: # new random episode self.demo_memory_idx = np.random.randint(0, self.demo_memory_size) self.demo_memory_count = np.random.randint( 0, len(self.demo_memory[self.demo_memory_idx]) - self.local_t_max) # if self.demo_memory_count+self.local_t_max < len(self.demo_memory[self.demo_memory_idx]): # self.demo_memory_max_count = np.random.randint(self.demo_memory_count+self.local_t_max, len(self.demo_memory[self.demo_memory_idx])) # else: # self.demo_memory_max_count = len(self.demo_memory[self.demo_memory_idx]) logger.debug( "worker={} mem_reset demo_memory_idx={} demo_memory_start={}". format(self.thread_index, self.demo_memory_idx, self.demo_memory_count)) s_t, action, reward, terminal = self.demo_memory[self.demo_memory_idx][ self.demo_memory_count] self.demo_memory_action = action self.demo_memory_reward = reward self.demo_memory_terminal = terminal if not self.demo_memory[self.demo_memory_idx].imgs_normalized: self.demo_memory_s_t = s_t * (1.0 / 255.0) else: self.demo_memory_s_t = s_t def replay_mem_process(self): self.demo_memory_count += 1 s_t, action, reward, terminal = self.demo_memory[self.demo_memory_idx][ self.demo_memory_count] self.demo_memory_next_action = action self.demo_memory_reward = reward self.demo_memory_terminal = terminal if not self.demo_memory[self.demo_memory_idx].imgs_normalized: self.demo_memory_s_t1 = s_t * (1.0 / 255.0) else: self.demo_memory_s_t1 = s_t def replay_mem_update(self): self.demo_memory_action = self.demo_memory_next_action self.demo_memory_s_t = self.demo_memory_s_t1 def demo_process(self, sess, global_t, demo_memory_idx=None): states = [] actions = [] rewards = [] values = [] demo_ended = False terminal_end = False # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t if self.use_lstm: reset_lstm_state = False start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(self.demo_t_max): pi_, value_, logits_ = self.local_network.run_policy_and_value( sess, self.demo_memory_s_t) action = self.demo_memory_action time.sleep(0.0025) states.append(self.demo_memory_s_t) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % self.log_interval == 0): log_msg = "lg={}".format( np.array_str(logits_, precision=4, suppress_small=True)) log_msg += " pi={}".format( np.array_str(pi_, precision=4, suppress_small=True)) log_msg += " V={:.4f}".format(value_) logger.debug(log_msg) # process replay memory self.replay_mem_process() # receive replay memory result reward = self.demo_memory_reward terminal = self.demo_memory_terminal self.episode_reward += reward if self.reward_type == 'LOG': reward = np.sign(reward) * np.log(1 + np.abs(reward)) elif self.reward_type == 'CLIP': # clip reward reward = np.sign(reward) rewards.append(reward) self.local_t += 1 self.episode_steps += 1 # demo_memory_s_t1 -> demo_memory_s_t self.replay_mem_update() s_t = self.demo_memory_s_t if terminal or self.demo_memory_count == len( self.demo_memory[self.demo_memory_idx]): logger.debug("worker={} score={}".format( self.thread_index, self.episode_reward)) demo_ended = True if terminal: terminal_end = True 
if self.use_lstm: self.local_network.reset_state() else: # some demo episodes don't reach a terminal state if self.use_lstm: reset_lstm_state = True self.episode_reward = 0 self.episode_steps = 0 self.replay_mem_reset(demo_memory_idx=demo_memory_idx) break cumulative_reward = 0.0 if not terminal_end: cumulative_reward = self.local_network.run_value(sess, s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_state = [] batch_action = [] batch_adv = [] batch_cumulative_reward = [] # compute and accumulate gradients for (ai, ri, si, vi) in zip(actions, rewards, states, values): cumulative_reward = ri + self.gamma * cumulative_reward advantage = cumulative_reward - vi # convert action to one-hot vector a = np.zeros([self.action_size]) a[ai] = 1 batch_state.append(si) batch_action.append(a) batch_adv.append(advantage) batch_cumulative_reward.append(cumulative_reward) cur_learning_rate = self._anneal_learning_rate(global_t) #* 0.005 if self.use_lstm: batch_state.reverse() batch_action.reverse() batch_adv.reverse() batch_cumulative_reward.reverse() sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_state, self.local_network.a: batch_action, self.local_network.advantage: batch_adv, self.local_network.cumulative_reward: batch_cumulative_reward, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size: [len(batch_action)], self.learning_rate_input: cur_learning_rate }) # some demo episodes don't reach a terminal state if reset_lstm_state: self.local_network.reset_state() reset_lstm_state = False else: sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_state, self.local_network.a: batch_action, self.local_network.advantage: batch_adv, self.local_network.cumulative_reward: batch_cumulative_reward, self.learning_rate_input: cur_learning_rate }) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= self.performance_log_interval): self.prev_local_t += self.performance_log_interval # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, demo_ended def process(self, sess, global_t, train_rewards): states = [] actions = [] rewards = [] values = [] rho = [] terminal_end = False # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t if self.use_lstm: start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(self.local_t_max): pi_, value_, logits_ = self.local_network.run_policy_and_value( sess, self.game_state.s_t) action = self.choose_action(logits_) model_pi = None confidence = 0. if self.use_pretrained_model_as_advice: self.psi = 0.9999 * ( 0.9999** global_t) if self.psi > 0.001 else 0.0 # 0.99995 works if self.psi > np.random.rand(): model_pi = self.pretrained_model.run_policy( self.pretrained_model_sess, self.game_state.s_t) model_action, confidence = self.choose_action_with_high_confidence( model_pi, exclude_noop=False) if (model_action > self.shaping_actions and confidence >= self.advice_confidence): action = model_action self.advice_ctr += 1 if self.use_pretrained_model_as_reward_shaping: #if action > 0: if model_pi is None: model_pi = self.pretrained_model.run_policy( self.pretrained_model_sess, self.game_state.s_t) confidence = model_pi[action][0][0] if (action > self.shaping_actions and confidence >= self.advice_confidence): #rho.append(round(confidence, 5)) rho.append(self.shaping_reward) self.shaping_ctr += 1 else: rho.append(0.)
#self.shaping_ctr += 1 states.append(self.game_state.s_t) actions.append(action) values.append(value_) if self.thread_index == 0 and self.local_t % self.log_interval == 0: log_msg1 = "lg={}".format( np.array_str(logits_, precision=4, suppress_small=True)) log_msg2 = "pi={}".format( np.array_str(pi_, precision=4, suppress_small=True)) log_msg3 = "V={:.4f}".format(value_) if self.use_pretrained_model_as_advice: log_msg3 += " psi={:.4f}".format(self.psi) logger.debug(log_msg1) logger.debug(log_msg2) logger.debug(log_msg3) # process game self.game_state.step(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal if self.use_pretrained_model_as_reward_shaping: if reward < 0 or reward > 0: rho[i] = 0. j = i - 1 while j > i - 5: if rewards[j] != 0: break rho[j] = 0. j -= 1 # if self.game_state.loss_life: # if self.game_state.gain_life or reward > 0: # rho[i] = 0. # j = i-1 # k = 1 # while j >= 0: # if rewards[j] != 0: # rho[j] = self.shaping_reward * (self.gamma ** -1) # break # rho[j] = self.shaping_reward / k # j -= 1 # k += 1 self.episode_reward += reward if self.reward_type == 'LOG': reward = np.sign(reward) * np.log(1 + np.abs(reward)) elif self.reward_type == 'CLIP': # clip reward reward = np.sign(reward) rewards.append(reward) self.local_t += 1 self.episode_steps += 1 global_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: if get_wrapper_by_name(self.game_state.env, 'EpisodicLifeEnv').was_real_done: log_msg = "train: worker={} global_t={}".format( self.thread_index, global_t) if self.use_pretrained_model_as_advice: log_msg += " advice_ctr={}".format(self.advice_ctr) if self.use_pretrained_model_as_reward_shaping: log_msg += " shaping_ctr={}".format(self.shaping_ctr) score_str = colored("score={}".format(self.episode_reward), "magenta") steps_str = colored("steps={}".format(self.episode_steps), "blue") log_msg += " {} {}".format(score_str, steps_str) logger.debug(log_msg) train_rewards['train'][global_t] = (self.episode_reward, self.episode_steps) self.record_summary(score=self.episode_reward, steps=self.episode_steps, episodes=None, global_t=global_t, mode='Train') self.episode_reward = 0 self.episode_steps = 0 terminal_end = True self.last_rho = 0. if self.use_lstm: self.local_network.reset_state() self.game_state.reset(hard_reset=False) break cumulative_reward = 0.0 if not terminal: cumulative_reward = self.local_network.run_value( sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_state = [] batch_action = [] batch_adv = [] batch_cumulative_reward = [] if self.use_pretrained_model_as_reward_shaping: rho.reverse() rho.append(self.last_rho) self.last_rho = rho[0] i = 0 # compute and accumulate gradients for (ai, ri, si, vi) in zip(actions, rewards, states, values): # Wiewiora et al. (2003) Principled Methods for Advising RL agents # Look-Back Advice #F = rho[i] - (self.shaping_gamma**-1) * rho[i+1] #F = rho[i] - self.shaping_gamma * rho[i+1] f = (self.shaping_gamma**-1) * rho[i] - rho[i + 1] if (i == 0 and terminal) or (f != 0 and (ri > 0 or ri < 0)): #logger.warn("averted additional F in absorbing state") f = 0. # if (F < 0. and ri > 0) or (F > 0. and ri < 0): # logger.warn("Negative reward shaping F={} ri={} rho[s]={} rhos[s-1]={}".format(F, ri, rho[i], rho[i+1])) # F = 0.
cumulative_reward = (ri + f * self.shaping_factor ) + self.gamma * cumulative_reward advantage = cumulative_reward - vi a = np.zeros([self.action_size]) a[ai] = 1 batch_state.append(si) batch_action.append(a) batch_adv.append(advantage) batch_cumulative_reward.append(cumulative_reward) i += 1 else: def h(z, eps=10**-2): return (np.sign(z) * (np.sqrt(np.abs(z) + 1.) - 1.)) + (eps * z) def h_inv(z, eps=10**-2): return np.sign(z) * (np.square( (np.sqrt(1 + 4 * eps * (np.abs(z) + 1 + eps)) - 1) / (2 * eps)) - 1) def h_log(z, eps=.6): return (np.sign(z) * np.log(1. + np.abs(z)) * eps) def h_inv_log(z, eps=.6): return np.sign(z) * (np.exp(np.abs(z) / eps) - 1) # compute and accumulate gradients for (ai, ri, si, vi) in zip(actions, rewards, states, values): if self.transformed_bellman: cumulative_reward = h(ri + self.gamma * h_inv(cumulative_reward)) else: cumulative_reward = ri + self.gamma * cumulative_reward advantage = cumulative_reward - vi # convert action to one-hot vector a = np.zeros([self.action_size]) a[ai] = 1 batch_state.append(si) batch_action.append(a) batch_adv.append(advantage) batch_cumulative_reward.append(cumulative_reward) cur_learning_rate = self._anneal_learning_rate(global_t) if self.use_lstm: batch_state.reverse() batch_action.reverse() batch_adv.reverse() batch_cumulative_reward.reverse() sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_state, self.local_network.a: batch_action, self.local_network.advantage: batch_adv, self.local_network.cumulative_reward: batch_cumulative_reward, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size: [len(batch_action)], self.learning_rate_input: cur_learning_rate }) else: sess.run(self.apply_gradients, feed_dict={ self.local_network.s: batch_state, self.local_network.a: batch_action, self.local_network.advantage: batch_adv, self.local_network.cumulative_reward: batch_cumulative_reward, self.learning_rate_input: cur_learning_rate }) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= self.performance_log_interval): self.prev_local_t += self.performance_log_interval elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time logger.info( "Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, terminal_end
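# --- Illustrative check (not part of the original classes): the h/h_inv pair in
# process() above implements the transformed Bellman operator (cf. Pohlen et al.,
# 2018). h compresses large returns so critic targets stay in a small range, h_inv
# undoes the compression when bootstrapping, and the two are exact inverses. The
# helper below only restates those formulas (same eps=1e-2) as a sanity check.
import numpy as np


def _check_transformed_bellman_inverse():
    def transform_h(z, eps=1e-2):
        return np.sign(z) * (np.sqrt(np.abs(z) + 1.) - 1.) + eps * z

    def transform_h_inv(z, eps=1e-2):
        return np.sign(z) * (np.square(
            (np.sqrt(1. + 4. * eps * (np.abs(z) + 1. + eps)) - 1.) / (2. * eps)) - 1.)

    z = np.array([-100., -1., 0., 1., 10., 1000.])
    assert np.allclose(transform_h_inv(transform_h(z)), z)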
class A3CTrainingThread(CommonWorker): """Asynchronous Actor-Critic Training Thread Class.""" log_interval = 100 perf_log_interval = 1000 local_t_max = 20 entropy_beta = 0.01 gamma = 0.99 shaping_actions = -1 # -1 all actions, 0 exclude noop transformed_bellman = False clip_norm = 0.5 use_grad_cam = False use_sil = False log_idx = 0 reward_constant = 0 def __init__(self, thread_index, global_net, local_net, initial_learning_rate, learning_rate_input, grad_applier, device=None, no_op_max=30): """Initialize A3CTrainingThread class.""" assert self.action_size != -1 self.is_sil_thread = False self.is_refresh_thread = False self.thread_idx = thread_index self.learning_rate_input = learning_rate_input self.local_net = local_net self.no_op_max = no_op_max self.override_num_noops = 0 if self.no_op_max == 0 else None logger.info("===A3C thread_index: {}===".format(self.thread_idx)) logger.info("device: {}".format(device)) logger.info("use_sil: {}".format( colored(self.use_sil, "green" if self.use_sil else "red"))) logger.info("local_t_max: {}".format(self.local_t_max)) logger.info("action_size: {}".format(self.action_size)) logger.info("entropy_beta: {}".format(self.entropy_beta)) logger.info("gamma: {}".format(self.gamma)) logger.info("reward_type: {}".format(self.reward_type)) logger.info("transformed_bellman: {}".format( colored(self.transformed_bellman, "green" if self.transformed_bellman else "red"))) logger.info("clip_norm: {}".format(self.clip_norm)) logger.info("use_grad_cam: {}".format( colored(self.use_grad_cam, "green" if self.use_grad_cam else "red"))) reward_clipped = True if self.reward_type == 'CLIP' else False local_vars = self.local_net.get_vars with tf.device(device): self.local_net.prepare_loss(entropy_beta=self.entropy_beta, critic_lr=0.5) var_refs = [v._ref() for v in local_vars()] self.gradients = tf.gradients(self.local_net.total_loss, var_refs) global_vars = global_net.get_vars with tf.device(device): if self.clip_norm is not None: self.gradients, grad_norm = tf.clip_by_global_norm( self.gradients, self.clip_norm) self.gradients = list(zip(self.gradients, global_vars())) self.apply_gradients = grad_applier.apply_gradients(self.gradients) self.sync = self.local_net.sync_from(global_net) self.game_state = GameState(env_id=self.env_id, display=False, no_op_max=self.no_op_max, human_demo=False, episode_life=True, override_num_noops=self.override_num_noops) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 self.episode_steps = 0 # variable controlling log output self.prev_local_t = 0 with tf.device(device): if self.use_grad_cam: self.action_meaning = self.game_state.env.unwrapped \ .get_action_meanings() self.local_net.build_grad_cam_grads() if self.use_sil: self.episode = SILReplayMemory( self.action_size, max_len=None, gamma=self.gamma, clip=reward_clipped, height=self.local_net.in_shape[0], width=self.local_net.in_shape[1], phi_length=self.local_net.in_shape[2], reward_constant=self.reward_constant) def train(self, sess, global_t, train_rewards): """Train A3C.""" states = [] fullstates = [] actions = [] rewards = [] values = [] rho = [] terminal_pseudo = False # loss of life terminal_end = False # real terminal # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t # t_max times loop for i in range(self.local_t_max): state = cv2.resize(self.game_state.s_t, self.local_net.in_shape[:-1], interpolation=cv2.INTER_AREA) fullstate = self.game_state.clone_full_state() pi_, value_, logits_ = 
self.local_net.run_policy_and_value( sess, state) action = self.pick_action(logits_) states.append(state) fullstates.append(fullstate) actions.append(action) values.append(value_) if self.thread_idx == self.log_idx \ and self.local_t % self.log_interval == 0: log_msg1 = "lg={}".format( np.array_str(logits_, precision=4, suppress_small=True)) log_msg2 = "pi={}".format( np.array_str(pi_, precision=4, suppress_small=True)) log_msg3 = "V={:.4f}".format(value_) logger.debug(log_msg1) logger.debug(log_msg2) logger.debug(log_msg3) # process game self.game_state.step(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward if self.use_sil: # save states in episode memory self.episode.add_item(self.game_state.s_t, fullstate, action, reward, terminal) if self.reward_type == 'CLIP': reward = np.sign(reward) rewards.append(reward) self.local_t += 1 self.episode_steps += 1 global_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_pseudo = True env = self.game_state.env name = 'EpisodicLifeEnv' if get_wrapper_by_name(env, name).was_real_done: # reduce log freq if self.thread_idx == self.log_idx: log_msg = "train: worker={} global_t={} local_t={}".format( self.thread_idx, global_t, self.local_t) score_str = colored( "score={}".format(self.episode_reward), "magenta") steps_str = colored( "steps={}".format(self.episode_steps), "blue") log_msg += " {} {}".format(score_str, steps_str) logger.debug(log_msg) train_rewards['train'][global_t] = (self.episode_reward, self.episode_steps) self.record_summary(score=self.episode_reward, steps=self.episode_steps, episodes=None, global_t=global_t, mode='Train') self.episode_reward = 0 self.episode_steps = 0 terminal_end = True self.game_state.reset(hard_reset=False) break cumsum_reward = 0.0 if not terminal: state = cv2.resize(self.game_state.s_t, self.local_net.in_shape[:-1], interpolation=cv2.INTER_AREA) cumsum_reward = self.local_net.run_value(sess, state) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_state = [] batch_action = [] batch_adv = [] batch_cumsum_reward = [] # compute and accumulate gradients for (ai, ri, si, vi) in zip(actions, rewards, states, values): if self.transformed_bellman: ri = np.sign(ri) * self.reward_constant + ri cumsum_reward = transform_h(ri + self.gamma * transform_h_inv(cumsum_reward)) else: cumsum_reward = ri + self.gamma * cumsum_reward advantage = cumsum_reward - vi # convert action to one-hot vector a = np.zeros([self.action_size]) a[ai] = 1 batch_state.append(si) batch_action.append(a) batch_adv.append(advantage) batch_cumsum_reward.append(cumsum_reward) cur_learning_rate = self._anneal_learning_rate( global_t, self.initial_learning_rate) feed_dict = { self.local_net.s: batch_state, self.local_net.a: batch_action, self.local_net.advantage: batch_adv, self.local_net.cumulative_reward: batch_cumsum_reward, self.learning_rate_input: cur_learning_rate, } sess.run(self.apply_gradients, feed_dict=feed_dict) t = self.local_t - self.prev_local_t if (self.thread_idx == self.log_idx and t >= self.perf_log_interval): self.prev_local_t += self.perf_log_interval elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time logger.info("worker-{}, log_worker-{}".format( self.thread_idx, self.log_idx)) logger.info("Performance : {} STEPS in {:.0f} sec. {:.0f}" " STEPS/sec. 
{:.2f}M STEPS/hour.".format( global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, terminal_end, terminal_pseudo
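# --- Illustrative sketch (assumption: pick_action above, like choose_action in the
# older thread class, samples with the Gumbel-max trick). Taking
# argmax(logits - log(-log(u))) with u ~ Uniform(0, 1) draws an action with
# probability softmax(logits) without normalising explicitly. A quick empirical
# check; the uniform draw is clipped away from 0 here only to avoid log(0).
import numpy as np


def _gumbel_max_sample(logits, rng):
    noise = rng.uniform(low=1e-10, high=1.0, size=np.shape(logits))
    return np.argmax(logits - np.log(-np.log(noise)))


def _check_gumbel_max_matches_softmax(n=100000):
    rng = np.random.RandomState(0)
    logits = np.array([2.0, 0.5, -1.0])
    counts = np.bincount([_gumbel_max_sample(logits, rng) for _ in range(n)],
                         minlength=len(logits))
    softmax = np.exp(logits) / np.exp(logits).sum()
    # Empirical frequencies and softmax are both approximately [0.79, 0.18, 0.04].
    assert np.allclose(counts / counts.sum(), softmax, atol=0.01)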
class RefreshThread(CommonWorker): """Rollout Thread Class.""" advice_confidence = 0.8 gamma = 0.99 def __init__(self, thread_index, action_size, env_id, global_a3c, local_a3c, update_in_rollout, nstep_bc, global_pretrained_model, local_pretrained_model, transformed_bellman=False, no_op_max=0, device='/cpu:0', entropy_beta=0.01, clip_norm=None, grad_applier=None, initial_learn_rate=0.007, learning_rate_input=None): """Initialize RolloutThread class.""" self.is_refresh_thread = True self.action_size = action_size self.thread_idx = thread_index self.transformed_bellman = transformed_bellman self.entropy_beta = entropy_beta self.clip_norm = clip_norm self.initial_learning_rate = initial_learn_rate self.learning_rate_input = learning_rate_input self.no_op_max = no_op_max self.override_num_noops = 0 if self.no_op_max == 0 else None logger.info("===REFRESH thread_index: {}===".format(self.thread_idx)) logger.info("device: {}".format(device)) logger.info("action_size: {}".format(self.action_size)) logger.info("reward_type: {}".format(self.reward_type)) logger.info("transformed_bellman: {}".format( colored(self.transformed_bellman, "green" if self.transformed_bellman else "red"))) logger.info("update in rollout: {}".format( colored(update_in_rollout, "green" if update_in_rollout else "red"))) logger.info("N-step BC: {}".format(nstep_bc)) self.reward_clipped = True if self.reward_type == 'CLIP' else False # setup local a3c self.local_a3c = local_a3c self.sync_a3c = self.local_a3c.sync_from(global_a3c) with tf.device(device): local_vars = self.local_a3c.get_vars self.local_a3c.prepare_loss( entropy_beta=self.entropy_beta, critic_lr=0.5) var_refs = [v._ref() for v in local_vars()] self.rollout_gradients = tf.gradients(self.local_a3c.total_loss, var_refs) global_vars = global_a3c.get_vars if self.clip_norm is not None: self.rollout_gradients, grad_norm = tf.clip_by_global_norm( self.rollout_gradients, self.clip_norm) self.rollout_gradients = list(zip(self.rollout_gradients, global_vars())) self.rollout_apply_gradients = grad_applier.apply_gradients(self.rollout_gradients) # setup local pretrained model self.local_pretrained = None if nstep_bc > 0: assert local_pretrained_model is not None assert global_pretrained_model is not None self.local_pretrained = local_pretrained_model self.sync_pretrained = self.local_pretrained.sync_from(global_pretrained_model) # setup env self.rolloutgame = GameState(env_id=env_id, display=False, no_op_max=0, human_demo=False, episode_life=True, override_num_noops=0) self.local_t = 0 self.episode_reward = 0 self.episode_steps = 0 self.action_meaning = self.rolloutgame.env.unwrapped.get_action_meanings() assert self.local_a3c is not None if nstep_bc > 0: assert self.local_pretrained is not None self.episode = SILReplayMemory( self.action_size, max_len=None, gamma=self.gamma, clip=self.reward_clipped, height=self.local_a3c.in_shape[0], width=self.local_a3c.in_shape[1], phi_length=self.local_a3c.in_shape[2], reward_constant=self.reward_constant) def record_rollout(self, score=0, steps=0, old_return=0, new_return=0, global_t=0, rollout_ctr=0, rollout_added_ctr=0, mode='Rollout', confidence=None, episodes=None): """Record rollout summary.""" summary = tf.Summary() summary.value.add(tag='{}/score'.format(mode), simple_value=float(score)) summary.value.add(tag='{}/old_return_from_s'.format(mode), simple_value=float(old_return)) summary.value.add(tag='{}/new_return_from_s'.format(mode), simple_value=float(new_return)) summary.value.add(tag='{}/steps'.format(mode), 
simple_value=float(steps)) summary.value.add(tag='{}/all_rollout_ctr'.format(mode), simple_value=float(rollout_ctr)) summary.value.add(tag='{}/rollout_added_ctr'.format(mode), simple_value=float(rollout_added_ctr)) if confidence is not None: summary.value.add(tag='{}/advice-confidence'.format(mode), simple_value=float(confidence)) if episodes is not None: summary.value.add(tag='{}/episodes'.format(mode), simple_value=float(episodes)) self.writer.add_summary(summary, global_t) self.writer.flush() def compute_return_for_state(self, rewards, terminal): """Compute expected return.""" length = np.shape(rewards)[0] returns = np.empty_like(rewards, dtype=np.float32) if self.reward_clipped: rewards = np.clip(rewards, -1., 1.) else: rewards = np.sign(rewards) * self.reward_constant + rewards for i in reversed(range(length)): if terminal[i]: returns[i] = rewards[i] if self.reward_clipped else transform_h(rewards[i]) else: if self.reward_clipped: returns[i] = rewards[i] + self.gamma * returns[i+1] else: # apply transformed expected return exp_r_t = self.gamma * transform_h_inv(returns[i+1]) returns[i] = transform_h(rewards[i] + exp_r_t) return returns[0] def update_a3c(self, sess, actions, states, rewards, values, global_t): cumsum_reward = 0.0 actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_state = [] batch_action = [] batch_adv = [] batch_cumsum_reward = [] # compute and accumulate gradients for(ai, ri, si, vi) in zip(actions, rewards, states, values): if self.transformed_bellman: ri = np.sign(ri) * self.reward_constant + ri cumsum_reward = transform_h( ri + self.gamma * transform_h_inv(cumsum_reward)) else: cumsum_reward = ri + self.gamma * cumsum_reward advantage = cumsum_reward - vi # convert action to one-hot vector a = np.zeros([self.action_size]) a[ai] = 1 batch_state.append(si) batch_action.append(a) batch_adv.append(advantage) batch_cumsum_reward.append(cumsum_reward) cur_learning_rate = self._anneal_learning_rate(global_t, self.initial_learning_rate ) feed_dict = { self.local_a3c.s: batch_state, self.local_a3c.a: batch_action, self.local_a3c.advantage: batch_adv, self.local_a3c.cumulative_reward: batch_cumsum_reward, self.learning_rate_input: cur_learning_rate, } sess.run(self.rollout_apply_gradients, feed_dict=feed_dict) return batch_adv def rollout(self, a3c_sess, folder, pretrain_sess, global_t, past_state, add_all_rollout, ep_max_steps, nstep_bc, update_in_rollout): """Perform one rollout until terminal.""" a3c_sess.run(self.sync_a3c) if nstep_bc > 0: pretrain_sess.run(self.sync_pretrained) _, fs, old_a, old_return, _, _ = past_state states = [] actions = [] rewards = [] values = [] terminals = [] confidences = [] rollout_ctr, rollout_added_ctr = 0, 0 rollout_new_return, rollout_old_return = 0, 0 terminal_pseudo = False # loss of life terminal_end = False # real terminal add = False self.rolloutgame.reset(hard_reset=True) self.rolloutgame.restore_full_state(fs) # check if restore successful fs_check = self.rolloutgame.clone_full_state() assert fs_check.all() == fs.all() del fs_check start_local_t = self.local_t self.rolloutgame.step(0) # prevent rollout too long, set max_ep_steps to be lower than ALE default # see https://github.com/openai/gym/blob/54f22cf4db2e43063093a1b15d968a57a32b6e90/gym/envs/__init__.py#L635 # but in all games tested, no rollout exceeds ep_max_steps while ep_max_steps > 0: state = cv2.resize(self.rolloutgame.s_t, self.local_a3c.in_shape[:-1], interpolation=cv2.INTER_AREA) fullstate = self.rolloutgame.clone_full_state() if nstep_bc > 0: 
# LiDER-TA or BC model_pi = self.local_pretrained.run_policy(pretrain_sess, state) action, confidence = self.choose_action_with_high_confidence( model_pi, exclude_noop=False) confidences.append(confidence) # not using "confidences" for anything nstep_bc -= 1 else: # LiDER, refresh with current policy pi_, _, logits_ = self.local_a3c.run_policy_and_value(a3c_sess, state) action = self.pick_action(logits_) confidences.append(pi_[action]) value_ = self.local_a3c.run_value(a3c_sess, state) values.append(value_) states.append(state) actions.append(action) self.rolloutgame.step(action) ep_max_steps -= 1 reward = self.rolloutgame.reward terminal = self.rolloutgame.terminal terminals.append(terminal) self.episode_reward += reward self.episode.add_item(self.rolloutgame.s_t, fullstate, action, reward, terminal, from_rollout=True) if self.reward_type == 'CLIP': reward = np.sign(reward) rewards.append(reward) self.local_t += 1 self.episode_steps += 1 global_t += 1 self.rolloutgame.update() if terminal: terminal_pseudo = True env = self.rolloutgame.env name = 'EpisodicLifeEnv' rollout_ctr += 1 terminal_end = get_wrapper_by_name(env, name).was_real_done new_return = self.compute_return_for_state(rewards, terminals) if not add_all_rollout: if new_return > old_return: add = True else: add = True if add: rollout_added_ctr += 1 rollout_new_return += new_return rollout_old_return += old_return # update policy immediate using a good rollout if update_in_rollout: batch_adv = self.update_a3c(a3c_sess, actions, states, rewards, values, global_t) self.episode_reward = 0 self.episode_steps = 0 self.rolloutgame.reset(hard_reset=True) break diff_local_t = self.local_t - start_local_t return diff_local_t, terminal_end, terminal_pseudo, rollout_ctr, \ rollout_added_ctr, add, rollout_new_return, rollout_old_return
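# NOTE (illustrative sketch, not part of RefreshThread): the rollout above refreshes a
# stored experience by restoring its emulator state, re-playing with the current policy
# (or the pretrained model for the first nstep_bc steps), and keeping the new trajectory
# only when its estimated return beats the stored old_return (unless add_all_rollout is
# set). The standalone helper below re-states the clipped-reward branch of
# compute_return_for_state with made-up numbers; the helper name is hypothetical.
import numpy as np

def _discounted_return_from_start(rewards, terminals, gamma=0.99):
    """Clipped-reward backup: plain discounted return of the first state.

    Assumes the trajectory ends at a terminal step, as in the rollout loop above.
    """
    rewards = np.clip(np.asarray(rewards, dtype=np.float32), -1., 1.)
    returns = np.empty_like(rewards)
    for i in reversed(range(len(rewards))):
        if terminals[i]:
            returns[i] = rewards[i]
        else:
            returns[i] = rewards[i] + gamma * returns[i + 1]
    return returns[0]

# Hypothetical refreshed rollout: two unrewarded steps, then a terminal reward of 1.
new_return = _discounted_return_from_start([0., 0., 1.], [False, False, True])  # ~0.9801
old_return = 0.5  # stored return of the sampled state (made-up value)
add = new_return > old_return  # True: this refreshed trajectory would be added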
def run_dqn(args): """ Baseline: python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --cuda-devices=0 --optimizer=Adam --lr=0.0001 --decay=0.0 --momentum=0.0 --epsilon=0.001 --gpu-fraction=0.222 python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --cuda-devices=0 --optimizer=RMS --lr=0.00025 --decay=0.95 --momentum=0.0 --epsilon=0.00001 --gpu-fraction=0.222 Transfer with Human Memory: python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --cuda-devices=0 --optimizer=Adam --lr=0.0001 --decay=0.0 --momentum=0.0 --epsilon=0.001 --observe=0 --use-transfer --load-memory python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --cuda-devices=0 --optimizer=RMS --lr=0.00025 --decay=0.95 --momentum=0.0 --epsilon=0.00001 --observe=0 --use-transfer --load-memory python3 run_experiment.py breakout --cuda-devices=0 --optimizer=RMS --lr=0.00025 --decay=0.95 --momentum=0.0 --epsilon=0.01 --observe=0 --use-transfer --load-memory --train-max-steps=20500000 Transfer with Human Advice and Human Memory: python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --cuda-devices=0 --optimizer=RMS --lr=0.00025 --decay=0.95 --momentum=0.0 --epsilon=0.00001 --observe=0 --use-transfer --load-memory --use-human-model-as-advice --advice-confidence=0. --psi=0.9999975 --train-max-steps=20500000 Human Advice only with Human Memory: python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --cuda-devices=0 --optimizer=RMS --lr=0.00025 --decay=0.95 --momentum=0.0 --epsilon=0.00001 --observe=0 --load-memory --use-human-model-as-advice --advice-confidence=0.75 --psi=0.9999975 """ from dqn_net import DqnNet from dqn_net_class import DqnNetClass from dqn_training import DQNTraining if args.cpu_only: os.environ['CUDA_VISIBLE_DEVICES'] = '' else: if args.cuda_devices != '': os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices import tensorflow as tf if not os.path.exists('results/dqn'): os.makedirs('results/dqn') if args.folder is not None: folder = 'results/dqn/{}_{}'.format(args.gym_env.replace('-', '_'), args.folder) else: folder = 'results/dqn/{}_{}'.format(args.gym_env.replace('-', '_'), args.optimizer.lower()) end_str = '' if args.unclipped_reward: end_str += '_rawreward' elif args.log_scale_reward: end_str += '_logreward' if args.transformed_bellman: end_str += '_transformedbell' if args.target_consistency: end_str += '_tcloss' if args.use_transfer: end_str += '_transfer' if args.not_transfer_conv2: end_str += '_noconv2' elif args.not_transfer_conv3 and args.use_mnih_2015: end_str += '_noconv3' elif args.not_transfer_fc1: end_str += '_nofc1' elif args.not_transfer_fc2: end_str += '_nofc2' if args.observe == 0: end_str += '_obs0' if args.init_epsilon < 1.0: end_str += '_lowinitexp' if args.use_human_model_as_advice: end_str += '_modelasadvice' if args.weight_decay is not None: end_str += '_wdecay' folder += end_str if args.append_experiment_num is not None: folder += '_' + args.append_experiment_num if args.cpu_only: device = '/cpu:0' gpu_options = None else: device = '/gpu:' + os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True, log_device_placement=False) game_state = GameState(env_id=args.gym_env, display=False, no_op_max=30, human_demo=False, episode_life=True) human_net = None sess_human = None if args.use_human_model_as_advice: if args.advice_folder is not None: advice_folder = args.advice_folder else: advice_folder = "{}_networks_classifier_{}".format( args.gym_env.replace('-', 
'_'), "adam") DqnNetClass.use_gpu = not args.cpu_only human_net = DqnNetClass(args.resized_height, args.resized_width, args.phi_len, game_state.env.action_space.n, args.gym_env, optimizer="Adam", learning_rate=0.0001, epsilon=0.001, decay=0., momentum=0., folder=advice_folder, device='/cpu:0') sess_human = tf.Session(config=config, graph=human_net.graph) human_net.initializer(sess_human) human_net.load() # prepare session sess = tf.Session(config=config) replay_memory = ReplayMemory( args.resized_width, args.resized_height, np.random.RandomState(), max_steps=args.replay_memory, phi_length=args.phi_len, num_actions=game_state.env.action_space.n, wrap_memory=True, full_state_size=game_state.clone_full_state().shape[0]) # baseline learning if not args.use_transfer: DqnNet.use_gpu = not args.cpu_only net = DqnNet(sess, args.resized_height, args.resized_width, args.phi_len, game_state.env.action_space.n, args.gym_env, gamma=args.gamma, optimizer=args.optimizer, learning_rate=args.lr, epsilon=args.epsilon, decay=args.decay, momentum=args.momentum, verbose=args.verbose, folder=folder, slow=args.use_slow, tau=args.tau, device=device, transformed_bellman=args.transformed_bellman, target_consistency_loss=args.target_consistency, clip_norm=args.grad_norm_clip, weight_decay=args.weight_decay) # transfer using existing model else: if args.transfer_folder is not None: transfer_folder = args.transfer_folder else: transfer_folder = 'results/pretrain_models/{}'.format( args.gym_env.replace('-', '_')) end_str = '' end_str += '_mnih2015' end_str += '_l2beta1E-04_batchprop' #TODO: make this an argument transfer_folder += end_str transfer_folder += '/transfer_model' DqnNet.use_gpu = not args.cpu_only net = DqnNet(sess, args.resized_height, args.resized_width, args.phi_len, game_state.env.action_space.n, args.gym_env, gamma=args.gamma, optimizer=args.optimizer, learning_rate=args.lr, epsilon=args.epsilon, decay=args.decay, momentum=args.momentum, verbose=args.verbose, folder=folder, slow=args.use_slow, tau=args.tau, transfer=True, transfer_folder=transfer_folder, not_transfer_conv2=args.not_transfer_conv2, not_transfer_conv3=args.not_transfer_conv3, not_transfer_fc1=args.not_transfer_fc1, not_transfer_fc2=args.not_transfer_fc2, device=device, transformed_bellman=args.transformed_bellman, target_consistency_loss=args.target_consistency, clip_norm=args.grad_norm_clip, weight_decay=args.weight_decay) ##added load human demonstration for testing cam demo_memory_folder = None demo_ids = None if args.load_memory or args.load_demo_cam: if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format( args.gym_env.replace('-', '_')) # demo_ids = tuple(map(int, args.demo_ids.split(","))) if args.unclipped_reward: reward_type = '' elif args.log_scale_reward: reward_type = 'LOG' else: reward_type = 'CLIP' experiment = DQNTraining(sess, net, game_state, args.resized_height, args.resized_width, args.phi_len, args.batch, args.gym_env, args.gamma, args.observe, args.explore, args.final_epsilon, args.init_epsilon, replay_memory, args.update_freq, args.save_freq, args.eval_freq, args.eval_max_steps, args.c_freq, folder, load_demo_memory=args.load_memory, demo_ids=args.demo_ids, load_demo_cam=args.load_demo_cam, demo_cam_id=args.demo_cam_id, demo_memory_folder=demo_memory_folder, train_max_steps=args.train_max_steps, human_net=human_net, confidence=args.advice_confidence, psi=args.psi, train_with_demo_steps=args.train_with_demo_steps, 
use_transfer=args.use_transfer, reward_type=reward_type) experiment.run() if args.use_human_model_as_advice: sess_human.close() sess.close()
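# NOTE (reference sketch, not part of the repository): run_dqn above derives the device
# and TensorFlow session from --cpu-only, --cuda-devices and --gpu-fraction. The helper
# below extracts that TF 1.x session-setup pattern as a self-contained sketch; the
# function name and defaults are hypothetical (0.222 mirrors the docstring examples).
import os
import tensorflow as tf  # assumes TensorFlow 1.x, as used throughout this project

def make_session(cpu_only=False, cuda_devices='0', gpu_fraction=0.222):
    """Build a tf.Session mirroring the device/GPU-fraction setup in run_dqn."""
    if cpu_only:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        gpu_options = None
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_devices
        # Cap per-process GPU memory so several workers can share one card.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
    config = tf.ConfigProto(gpu_options=gpu_options,
                            allow_soft_placement=True,
                            log_device_placement=False)
    return tf.Session(config=config)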