def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, max_global_time_step, device):
    """Set up a worker: local network, gradient ops and game instance.

    Args:
        thread_index: Worker index; forwarded to ``GameACFFNetwork``.
        global_network: Network whose variables receive the gradient
            updates and from which the local network is synced.
        initial_learning_rate: Stored as ``self.initial_learning_rate``.
        learning_rate_input: Learning rate handed to
            ``tf.train.RMSPropOptimizer``.
        max_global_time_step: Stored as ``self.max_global_time_step``.
        device: Device string used for the network and ``tf.device``.
    """
    # Plain bookkeeping attributes (no side effects, order irrelevant).
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.initial_learning_rate = initial_learning_rate
    self.learn_rate = 0

    # Worker-local copy of the network and its loss.
    network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network = network
    network.prepare_loss(ENTROPY_BETA)

    # Gradients of the local loss w.r.t. the local variables.
    with tf.device(device):
        var_refs = []
        for variable in network.get_vars():
            var_refs.append(variable._ref())
        self.gradients = tf.gradients(
            network.total_loss,
            var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    # Local gradients are paired with the *global* network's variables.
    optimizer = tf.train.RMSPropOptimizer(self.learning_rate_input)
    grads_and_vars = zip(self.gradients, global_network.get_vars())
    self.apply_gradients = optimizer.apply_gradients(grads_and_vars)

    self.sync = network.sync_from(global_network)

    self.game_state = Game()

    # Per-worker counters.
    self.local_t = 0
    self.episode_reward = 0
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step,
             device, action_size, gamma, local_t_max, entropy_beta,
             agent_type, performance_log_interval, log_level, random_seed):
    """Set up a worker with its own network, gradient ops and game state.

    Args:
        thread_index: Worker index; forwarded to the network constructor
            and multiplied with ``random_seed`` for the ``GameState``.
        global_network: Network whose variables are updated and from
            which the local network is synced.
        initial_learning_rate: Also used as the starting ``learn_rate``.
        learning_rate_input: Stored as ``self.learning_rate_input``.
        grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
            the update op.
        max_global_time_step: Stored as ``self.max_global_time_step``.
        device: Device string used for the network and ``tf.device``.
        action_size: Passed to the network and to ``GameState``.
        gamma: Stored as ``self.gamma``.
        local_t_max: Stored as ``self.local_t_max``.
        entropy_beta: Passed to ``prepare_loss``.
        agent_type: ``'LSTM'`` selects ``GameACLSTMNetwork``; any other
            value selects ``GameACFFNetwork``.
        performance_log_interval: Stored on the instance.
        log_level: Stored on the instance.
        random_seed: Seeds NumPy's global RNG and the ``GameState``.
    """
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.action_size = action_size
    self.gamma = gamma
    self.local_t_max = local_t_max
    self.agent_type = agent_type
    self.performance_log_interval = performance_log_interval
    self.log_level = log_level

    # Both network classes share the same constructor signature here.
    network_cls = (GameACLSTMNetwork if self.agent_type == 'LSTM'
                   else GameACFFNetwork)
    self.local_network = network_cls(self.action_size, thread_index, device)
    self.local_network.prepare_loss(entropy_beta)

    with tf.device(device):
        local_vars = list(self.local_network.get_vars())
        self.gradients = tf.gradients(
            self.local_network.total_loss,
            local_vars,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    # Local gradients are applied to the global network's variables.
    target_vars = global_network.get_vars()
    self.apply_gradients = grad_applier.apply_gradients(target_vars,
                                                        self.gradients)

    self.sync = self.local_network.sync_from(global_network)

    # Note: this seeds NumPy's process-wide RNG, not a per-worker one.
    np.random.seed(random_seed)
    env_seed = random_seed * thread_index
    self.game_state = GameState(env_seed, self.action_size)

    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate
    self.learn_rate = self.initial_learning_rate

    self.reset_counters()

    self.episode = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step,
             device, task_index=""):
    """Set up a worker for either threaded or distributed training.

    With a truthy ``global_network`` the worker runs in ``"threading"``
    mode: gradients are applied to the global variables and a sync op is
    created. Otherwise it runs in ``"dist_tensor"`` mode: gradients are
    applied to the local network's own variables and no ``self.sync``
    attribute is set.

    Args:
        thread_index: Worker index; forwarded to the network constructor.
        global_network: Shared network, or a falsy value for
            ``"dist_tensor"`` mode.
        initial_learning_rate: Stored as ``self.initial_learning_rate``.
        learning_rate_input: Stored as ``self.learning_rate_input``.
        grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
            the update op.
        max_global_time_step: Stored as ``self.max_global_time_step``.
        device: Device string used for the network and ``tf.device``.
        task_index: When truthy, replaces ``thread_index`` in the
            ``GameState`` argument.
    """
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    network_cls = GameACLSTMNetwork if USE_LSTM else GameACFFNetwork
    self.local_network = network_cls(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = []
        for variable in self.local_network.get_vars():
            var_refs.append(variable._ref())
        self.gradients = tf.gradients(
            self.local_network.total_loss,
            var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    if not global_network:
        # No shared network: update the local variables directly.
        self.apply_gradients = grad_applier.apply_gradients(
            self.local_network.get_vars(), self.gradients)
        self.mode = "dist_tensor"
    else:
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)
        self.sync = self.local_network.sync_from(global_network)
        self.mode = "threading"

    # NOTE(review): the default task_index is a string; if callers pass a
    # non-empty string, ``113 * task_index`` repeats it 113 times instead
    # of multiplying. Also, an integer task_index of 0 falls back to
    # thread_index. Confirm the intended type with the callers.
    seed_index = task_index if task_index else thread_index
    self.game_state = GameState(113 * seed_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step,
             device):
    """Set up a worker and (re)create the results CSV with its header.

    Args:
        thread_index: Worker index; forwarded to the network constructor
            and used in the ``GameState`` argument (``113 * thread_index``).
        global_network: Network whose variables are updated and from
            which the local network is synced.
        initial_learning_rate: Stored as ``self.initial_learning_rate``.
        learning_rate_input: Stored as ``self.learning_rate_input``.
        grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
            the update op.
        max_global_time_step: Stored as ``self.max_global_time_step``.
        device: Device string used for the network and ``tf.device``.

    Side effects:
        Truncates ``<cwd>/results/RESULTS_FILE`` and writes the CSV header
        line. The ``results`` directory must already exist.
    """
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                               device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index,
                                             device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v.ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss,
            var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    # Local gradients are applied to the global network's variables.
    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients)

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

    results_dir = os.path.join(os.getcwd(), "results")
    self.res_file = os.path.join(results_dir, RESULTS_FILE)
    # Text mode: the header is a str, which a 'wb' handle rejects on
    # Python 3. The context manager closes the handle even if the write
    # raises, and the local name no longer shadows the builtin ``file``.
    with open(self.res_file, 'w') as res_fh:
        res_fh.write('itr,mean_score,max,min,std,runs,test_steps\n')
def __init__(self, thread_index, initial_learning_rate, learning_rate_input,
             grad_applier, max_global_time_episode, device, arrived_jobs,
             condition):
    """Set up a worker bound to one machine of the job-shop environment.

    No global network is involved: gradients are applied to the local
    network's own variables and no sync op is created.

    Args:
        thread_index: Worker index, which doubles as the machine number.
        initial_learning_rate: Stored as ``self.initial_learning_rate``.
        learning_rate_input: Stored as ``self.learning_rate_input``.
        grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
            the update op.
        max_global_time_episode: Stored on the instance.
        device: Device string used for the network and ``tf.device``.
        arrived_jobs: Forwarded to ``JspEnv``.
        condition: Stored as ``self.condition``.
    """
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_episode = max_global_time_episode

    # thread_index is the machine number: fetch every operation that is
    # processed on that machine.
    self.operations = get_data_by_machine(thread_index)

    self.condition = condition
    self.is_terminal_counted = False
    self.last_episode_reward = 0

    # The first constructor argument is the action size.
    # NOTE(review): the original comment says the number of operations to
    # be processed on this machine is passed here, but the code passes the
    # ACTION_SIZE constant -- confirm which one is intended.
    network_cls = GameACLSTMNetwork if USE_LSTM else GameACFFNetwork
    self.local_network = network_cls(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = []
        for variable in self.local_network.get_vars():
            var_refs.append(variable._ref())
        self.gradients = tf.gradients(
            self.local_network.total_loss,
            var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    # Gradients update the local network itself.
    own_vars = self.local_network.get_vars()
    self.apply_gradients = grad_applier.apply_gradients(own_vars,
                                                        self.gradients)

    # Create the environment for this machine's operations.
    self.env = JspEnv(self.operations, thread_index, arrived_jobs)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step,
             device):
    """Set up a worker whose network class is chosen by ``NETWORK_TYPE``.

    Args:
        thread_index: Worker index; forwarded to the LSTM network
            constructor and used in the ``GameState`` argument.
        global_network: Network whose variables are updated and from
            which the local network is synced.
        initial_learning_rate: Stored as ``self.initial_learning_rate``.
        learning_rate_input: Stored as ``self.learning_rate_input``.
        grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
            the update op.
        max_global_time_step: Stored as ``self.max_global_time_step``.
        device: Device string passed to the network and ``AccumTrainer``.

    Raises:
        ValueError: If ``NETWORK_TYPE`` is not ``'LSTM'``, ``'DILATED'``
            or ``'CONV'``.
    """
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if NETWORK_TYPE == 'LSTM':
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index,
                                               device)
    elif NETWORK_TYPE == 'DILATED':
        self.local_network = GameACDilatedNetwork(ACTION_SIZE, device)
    elif NETWORK_TYPE == 'CONV':
        self.local_network = GameACFFNetwork(ACTION_SIZE, device)
    else:
        # Fail here with a clear message instead of leaving
        # self.local_network unset and hitting an AttributeError below.
        raise ValueError(
            "Unsupported NETWORK_TYPE: {!r}".format(NETWORK_TYPE))

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    # Accumulated local gradients are applied to the global variables.
    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step,
             device):
    """Set up a worker that accumulates gradients through ``AccumTrainer``.

    Args:
        thread_index: Worker index; forwarded to the LSTM network
            constructor and used in the ``GameState`` argument.
        global_network: Network whose variables are updated and from
            which the local network is synced.
        initial_learning_rate: Stored as ``self.initial_learning_rate``.
        learning_rate_input: Stored as ``self.learning_rate_input``.
        grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
            the update op.
        max_global_time_step: Stored as ``self.max_global_time_step``.
        device: Device string passed to the network and ``AccumTrainer``.
    """
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    # The two network classes take different constructor arguments.
    if not USE_LSTM:
        network = GameACFFNetwork(ACTION_SIZE, device)
    else:
        network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    self.local_network = network
    network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    trainer = AccumTrainer(device)
    self.trainer = trainer
    trainer.prepare_minimize(network.total_loss, network.get_vars())

    self.accum_gradients = trainer.accumulate_gradients()
    self.reset_gradients = trainer.reset_gradients()

    # watch out: this updates global_network, not the local copy
    global_vars = global_network.get_vars()
    accumulated = trainer.get_accum_grad_list()
    self.apply_gradients = grad_applier.apply_gradients(global_vars,
                                                        accumulated)

    self.sync = network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step,
             device):
    """Set up a feed-forward worker: network, gradient ops and game state.

    Args:
        thread_index: Worker index; forwarded to ``GameACFFNetwork`` and
            used in the ``GameState`` argument (``113 * thread_index``).
        global_network: Network whose variables are updated and from
            which the local network is synced.
        initial_learning_rate: Stored as ``self.initial_learning_rate``.
        learning_rate_input: Stored as ``self.learning_rate_input``.
        grad_applier: Object whose ``apply_gradients(vars, grads)`` builds
            the update op.
        max_global_time_step: Stored as ``self.max_global_time_step``.
        device: Device string used for the network and ``tf.device``.
    """
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.initial_learning_rate = initial_learning_rate

    # STATE_SIZE = 6 - 3 Landmarks + 5 (comm-size)
    network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network = network
    network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = []
        for variable in network.get_vars():
            var_refs.append(variable._ref())
        self.gradients = tf.gradients(
            network.total_loss,
            var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    # Local gradients are applied to the global network's variables.
    global_vars = global_network.get_vars()
    self.apply_gradients = grad_applier.apply_gradients(global_vars,
                                                        self.gradients)

    self.sync = network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    # Per-worker counters.
    self.local_t = 0
    self.epSteps = 0
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
from constants import (
    PARALLEL_SIZE,
    MAX_TIME_STEP,
    CHECKPOINT_DIR,
    RMSP_EPSILON,
    RMSP_ALPHA,
    GRAD_NORM_CLIP,
    USE_GPU,
    USE_LSTM,
)

# use CPU for weight visualize tool
device = "/cpu:0"

# Both network classes are built with the same arguments here; -1 is
# passed as the thread index of the global network.
_global_network_cls = GameACLSTMNetwork if USE_LSTM else GameACFFNetwork
global_network = _global_network_cls(ACTION_SIZE, -1, device)

training_threads = []

learning_rate_input = tf.placeholder(PRECISION)

grad_applier = RMSPropApplier(
    learning_rate=learning_rate_input,
    decay=RMSP_ALPHA,
    momentum=0.0,
    epsilon=RMSP_EPSILON,
    clip_norm=GRAD_NORM_CLIP,
    device=device,
)

# Create the session and initialize every global variable.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
def run_a3c(args):
    """Train an A3C agent with ``args.parallel_size`` worker threads.

    The function builds the results folder name from the flags, optionally
    loads demonstration memory and a pretrained classifier, creates the
    global network and one ``A3CTrainingThread`` per worker, restores the
    latest checkpoint if one exists, runs the workers until
    ``args.max_time_step * args.max_time_step_fraction`` global steps (or a
    SIGINT/SIGTERM), and finally saves the model, wall time and rewards.

    Example invocations::

        python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015

        python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<>

        python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> --load-pretrained-model --onevsall-mtl --pretrained-model-folder=<> --use-pretrained-model-as-advice --use-pretrained-model-as-reward-shaping

    Args:
        args: Parsed command-line namespace; every ``args.<name>`` read
            below must be present.
    """
    from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
    from a3c_training_thread import A3CTrainingThread

    # CUDA_VISIBLE_DEVICES is set before the local tensorflow import below.
    if args.use_gpu:
        assert args.cuda_devices != ''
        os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    import tensorflow as tf

    def log_uniform(lo, hi, rate):
        """Interpolate between ``lo`` and ``hi`` in log space."""
        log_lo = math.log(lo)
        log_hi = math.log(hi)
        v = log_lo * (1 - rate) + log_hi * rate
        return math.exp(v)

    if not os.path.exists('results/a3c'):
        os.makedirs('results/a3c')

    # ---- results folder name, derived from the active flags ----
    if args.folder is not None:
        folder = 'results/a3c/{}_{}'.format(
            args.gym_env.replace('-', '_'), args.folder)
    else:
        folder = 'results/a3c/{}'.format(args.gym_env.replace('-', '_'))
        end_str = ''

        if args.use_mnih_2015:
            end_str += '_mnih2015'
        if args.use_lstm:
            end_str += '_lstm'
        if args.unclipped_reward:
            end_str += '_rawreward'
        elif args.log_scale_reward:
            end_str += '_logreward'
        if args.transformed_bellman:
            end_str += '_transformedbell'

        if args.use_transfer:
            end_str += '_transfer'
            if args.not_transfer_conv2:
                end_str += '_noconv2'
            elif args.not_transfer_conv3 and args.use_mnih_2015:
                end_str += '_noconv3'
            elif args.not_transfer_fc1:
                end_str += '_nofc1'
            elif args.not_transfer_fc2:
                end_str += '_nofc2'
        if args.finetune_upper_layers_only:
            end_str += '_tune_upperlayers'
        if args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0:
            end_str += '_pretrain_ina3c'
        if args.use_demo_threads:
            end_str += '_demothreads'

        if args.load_pretrained_model:
            if args.use_pretrained_model_as_advice:
                end_str += '_modelasadvice'
            if args.use_pretrained_model_as_reward_shaping:
                end_str += '_modelasshaping'
        folder += end_str

    if args.append_experiment_num is not None:
        folder += '_' + args.append_experiment_num

    # File logging is deliberately disabled.
    if False:
        from common.util import LogFormatter
        fh = logging.FileHandler('{}/a3c.log'.format(folder), mode='w')
        fh.setLevel(logging.DEBUG)
        formatter = LogFormatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    # ---- optional demonstration data ----
    demo_memory = None
    num_demos = 0
    max_reward = 0.
    if args.load_memory or args.load_demo_cam:
        if args.demo_memory_folder is not None:
            demo_memory_folder = args.demo_memory_folder
        else:
            demo_memory_folder = 'collected_demo/{}'.format(
                args.gym_env.replace('-', '_'))

    if args.load_memory:
        # FIXME: use new load_memory function
        demo_memory, actions_ctr, max_reward = load_memory(
            args.gym_env, demo_memory_folder,
            imgs_normalized=True)  #, create_symmetry=True)
        action_freq = [
            actions_ctr[a] for a in range(demo_memory[0].num_actions)
        ]
        num_demos = len(demo_memory)

    demo_memory_cam = None
    if args.load_demo_cam:
        demo_cam, _, total_rewards_cam, _ = load_memory(
            name=None,
            demo_memory_folder=demo_memory_folder,
            demo_ids=args.demo_cam_id,
            imgs_normalized=False)

        demo_cam = demo_cam[int(args.demo_cam_id)]
        demo_memory_cam = np.zeros((len(demo_cam), demo_cam.height,
                                    demo_cam.width, demo_cam.phi_length),
                                   dtype=np.float32)
        # Keep only the first element of every stored transition.
        for i in range(len(demo_cam)):
            s0 = (demo_cam[i])[0]
            demo_memory_cam[i] = np.copy(s0)
        del demo_cam
        logger.info("loaded demo {} for testing CAM".format(args.demo_cam_id))

    # ---- device / session configuration ----
    device = "/cpu:0"
    gpu_options = None
    if args.use_gpu:
        device = "/gpu:" + os.environ["CUDA_VISIBLE_DEVICES"]
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_fraction)

    initial_learning_rate = args.initial_learn_rate
    logger.info('Initial Learning Rate={}'.format(initial_learning_rate))
    time.sleep(2)

    global_t = 0
    pretrain_global_t = 0
    pretrain_epoch = 0
    rewards = {'train': {}, 'eval': {}}
    best_model_reward = -(sys.maxsize)

    stop_requested = False

    # A throw-away environment is created only to read the action count.
    game_state = GameState(env_id=args.gym_env)
    action_size = game_state.env.action_space.n
    game_state.close()
    del game_state.env
    del game_state

    config = tf.ConfigProto(gpu_options=gpu_options,
                            log_device_placement=False,
                            allow_soft_placement=True)

    # ---- optional pretrained classifier (lives in its own graph/session) ----
    pretrained_model = None
    pretrained_model_sess = None
    if args.load_pretrained_model:
        if args.onevsall_mtl:
            from game_class_network import MTLBinaryClassNetwork as PretrainedModelNetwork
        elif args.onevsall_mtl_linear:
            from game_class_network import MTLMultivariateNetwork as PretrainedModelNetwork
        else:
            from game_class_network import MultiClassNetwork as PretrainedModelNetwork
            logger.error("Not supported yet!")
            assert False

        if args.pretrained_model_folder is not None:
            pretrained_model_folder = args.pretrained_model_folder
        else:
            pretrained_model_folder = '{}_classifier_use_mnih_onevsall_mtl'.format(
                args.gym_env.replace('-', '_'))
        PretrainedModelNetwork.use_mnih_2015 = args.use_mnih_2015
        pretrained_model = PretrainedModelNetwork(action_size, -1, device)
        pretrained_model_sess = tf.Session(config=config,
                                           graph=pretrained_model.graph)
        pretrained_model.load(
            pretrained_model_sess,
            '{}/{}_checkpoint'.format(pretrained_model_folder,
                                      args.gym_env.replace('-', '_')))

    # ---- global network and optimizer ----
    if args.use_lstm:
        GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACFFNetwork(action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr")

    grad_applier = tf.train.RMSPropOptimizer(learning_rate=learning_rate_input,
                                             decay=args.rmsp_alpha,
                                             epsilon=args.rmsp_epsilon)

    # Worker configuration is passed through class attributes.
    A3CTrainingThread.log_interval = args.log_interval
    A3CTrainingThread.performance_log_interval = args.performance_log_interval
    A3CTrainingThread.local_t_max = args.local_t_max
    A3CTrainingThread.demo_t_max = args.demo_t_max
    A3CTrainingThread.use_lstm = args.use_lstm
    A3CTrainingThread.action_size = action_size
    A3CTrainingThread.entropy_beta = args.entropy_beta
    A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta
    A3CTrainingThread.gamma = args.gamma
    A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015
    A3CTrainingThread.env_id = args.gym_env
    A3CTrainingThread.finetune_upper_layers_only = args.finetune_upper_layers_only
    A3CTrainingThread.transformed_bellman = args.transformed_bellman
    A3CTrainingThread.clip_norm = args.grad_norm_clip
    A3CTrainingThread.use_grad_cam = args.use_grad_cam

    if args.unclipped_reward:
        A3CTrainingThread.reward_type = "RAW"
    elif args.log_scale_reward:
        A3CTrainingThread.reward_type = "LOG"
    else:
        A3CTrainingThread.reward_type = "CLIP"

    # With n_shapers == parallel_size, mod is 1 and every worker gets the
    # advice / reward-shaping flags.
    n_shapers = args.parallel_size  #int(args.parallel_size * .25)
    mod = args.parallel_size // n_shapers
    for i in range(args.parallel_size):
        is_reward_shape = False
        is_advice = False
        if i % mod == 0:
            is_reward_shape = args.use_pretrained_model_as_reward_shaping
            is_advice = args.use_pretrained_model_as_advice
        training_thread = A3CTrainingThread(
            i, global_network, initial_learning_rate,
            learning_rate_input,
            grad_applier, args.max_time_step,
            device=device,
            pretrained_model=pretrained_model,
            pretrained_model_sess=pretrained_model_sess,
            advice=is_advice,
            reward_shaping=is_reward_shape)
        training_threads.append(training_thread)

    # prepare session
    sess = tf.Session(config=config)

    # ---- optional transfer of pretrained layers into the global network ----
    if args.use_transfer:
        if args.transfer_folder is not None:
            transfer_folder = args.transfer_folder
        else:
            transfer_folder = 'results/pretrain_models/{}'.format(
                args.gym_env.replace('-', '_'))
            end_str = ''
            if args.use_mnih_2015:
                end_str += '_mnih2015'
            end_str += '_l2beta1E-04_batchprop'  #TODO: make this an argument
            transfer_folder += end_str

        transfer_folder += '/transfer_model'

        # Each --not-transfer-* flag stops the transfer at that layer.
        if args.not_transfer_conv2:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1
            ]
        elif (args.not_transfer_conv3 and args.use_mnih_2015):
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2
            ]
        elif args.not_transfer_fc1:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2,
            ]
            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3, global_network.b_conv3
                ]
        elif args.not_transfer_fc2:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2,
                global_network.W_fc1, global_network.b_fc1
            ]
            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3, global_network.b_conv3
                ]
        else:
            transfer_var_list = [
                global_network.W_conv1, global_network.b_conv1,
                global_network.W_conv2, global_network.b_conv2,
                global_network.W_fc1, global_network.b_fc1,
                global_network.W_fc2, global_network.b_fc2
            ]
            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3, global_network.b_conv3
                ]

        global_network.load_transfer_model(
            sess, folder=transfer_folder,
            not_transfer_fc2=args.not_transfer_fc2,
            not_transfer_fc1=args.not_transfer_fc1,
            not_transfer_conv3=(args.not_transfer_conv3
                                and args.use_mnih_2015),
            not_transfer_conv2=args.not_transfer_conv2,
            var_list=transfer_var_list)

    def initialize_uninitialized(sess):
        """Run the initializer only for variables not yet initialized."""
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f
        ]

        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    if args.use_transfer:
        # Transferred variables must keep their loaded values.
        initialize_uninitialized(sess)
    else:
        sess.run(tf.global_variables_initializer())

    # summary writer for tensorboard
    summary_op = tf.summary.merge_all()
    # folder[12:] drops the leading 'results/a3c/' prefix (12 characters).
    summary_writer = tf.summary.FileWriter(
        'results/log/a3c/{}/'.format(args.gym_env.replace('-', '_'))
        + folder[12:], sess.graph)

    # init or load checkpoint with saver
    root_saver = tf.train.Saver(max_to_keep=1)
    saver = tf.train.Saver(max_to_keep=6)
    best_saver = tf.train.Saver(max_to_keep=1)
    checkpoint = tf.train.get_checkpoint_state(folder)
    if checkpoint and checkpoint.model_checkpoint_path:
        root_saver.restore(sess, checkpoint.model_checkpoint_path)
        logger.info("checkpoint loaded:{}".format(
            checkpoint.model_checkpoint_path))
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        global_t = int(tokens[-1])
        logger.info(">>> global step set: {}".format(global_t))
        # set wall time
        wall_t_fname = folder + '/' + 'wall_t.' + str(global_t)
        with open(wall_t_fname, 'r') as f:
            wall_t = float(f.read())
        with open(folder + '/pretrain_global_t', 'r') as f:
            pretrain_global_t = int(f.read())
        with open(folder + '/model_best/best_model_reward',
                  'r') as f_best_model_reward:
            best_model_reward = float(f_best_model_reward.read())
        # NOTE: pickle is only safe on files this program wrote itself.
        rewards_fname = (folder + '/' + args.gym_env.replace('-', '_')
                         + '-a3c-rewards.pkl')
        with open(rewards_fname, 'rb') as f_rewards:
            rewards = pickle.load(f_rewards)
    else:
        logger.warning("Could not find old checkpoint")
        # set wall time
        wall_t = 0.0
        prepare_dir(folder, empty=True)
        prepare_dir(folder + '/model_checkpoints', empty=True)
        prepare_dir(folder + '/model_best', empty=True)
        prepare_dir(folder + '/frames', empty=True)

    # ``lock`` serializes evaluation; ``test_lock`` is a flag that workers
    # poll so they pause while an evaluation is in progress.
    lock = threading.Lock()
    test_lock = False
    if global_t == 0:
        test_lock = True

    last_temp_global_t = global_t
    ispretrain_markers = [False] * args.parallel_size
    num_demo_thread = 0
    ctr_demo_thread = 0

    def train_function(parallel_index):
        """Body of one worker thread: pretrain, initial eval, then train."""
        nonlocal global_t, pretrain_global_t, pretrain_epoch, \
            rewards, test_lock, lock, \
            last_temp_global_t, ispretrain_markers, num_demo_thread, \
            ctr_demo_thread
        training_thread = training_threads[parallel_index]
        training_thread.set_summary_writer(summary_writer)

        # set all threads as demo threads
        training_thread.is_demo_thread = args.load_memory and args.use_demo_threads
        if training_thread.is_demo_thread or args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs:
            training_thread.pretrain_init(demo_memory)

        # Only the first two workers take part in pretraining.
        if global_t == 0 and (
                args.train_with_demo_num_steps > 0
                or args.train_with_demo_num_epochs > 0) and parallel_index < 2:
            ispretrain_markers[parallel_index] = True
            training_thread.replay_mem_reset()

            # Pretraining with demo memory
            logger.info("t_idx={} pretrain starting".format(parallel_index))
            while ispretrain_markers[parallel_index]:
                if stop_requested:
                    return
                if pretrain_global_t > args.train_with_demo_num_steps and pretrain_epoch > args.train_with_demo_num_epochs:
                    # At end of pretraining, reset state
                    training_thread.replay_mem_reset()
                    training_thread.episode_reward = 0
                    training_thread.local_t = 0
                    if args.use_lstm:
                        training_thread.local_network.reset_state()
                    ispretrain_markers[parallel_index] = False
                    logger.info(
                        "t_idx={} pretrain ended".format(parallel_index))
                    break

                diff_pretrain_global_t, _ = training_thread.demo_process(
                    sess, pretrain_global_t)
                for _ in range(diff_pretrain_global_t):
                    pretrain_global_t += 1
                    if pretrain_global_t % 10000 == 0:
                        logger.debug(
                            "pretrain_global_t={}".format(pretrain_global_t))

                pretrain_epoch += 1
                if pretrain_epoch % 1000 == 0:
                    logger.debug("pretrain_epoch={}".format(pretrain_epoch))

        # Waits for all threads to finish pretraining
        while not stop_requested and any(ispretrain_markers):
            time.sleep(0.01)

        # Evaluate model before training
        if not stop_requested and global_t == 0:
            with lock:
                if parallel_index == 0:
                    test_reward, test_steps, test_episodes = training_threads[
                        0].testing(sess, args.eval_max_steps, global_t,
                                   folder, demo_memory_cam=demo_memory_cam)
                    rewards['eval'][global_t] = (test_reward, test_steps,
                                                 test_episodes)
                    saver.save(
                        sess, folder + '/model_checkpoints/'
                        + '{}_checkpoint'.format(
                            args.gym_env.replace('-', '_')),
                        global_step=global_t)
                    save_best_model(test_reward)
                    test_lock = False
            # all threads wait until evaluation finishes
            while not stop_requested and test_lock:
                time.sleep(0.01)

        # set start_time
        start_time = time.time() - wall_t
        training_thread.set_start_time(start_time)
        episode_end = True
        use_demo_thread = False
        while True:
            if stop_requested:
                return
            if global_t >= (args.max_time_step * args.max_time_step_fraction):
                return

            if args.use_demo_threads and global_t < args.max_steps_threads_as_demo and episode_end and num_demo_thread < 16:
                #if num_demo_thread < 2:
                # Demo probability decays linearly with global_t and is
                # floored at 0.0333.
                demo_rate = 1.0 * (args.max_steps_threads_as_demo
                                   - global_t) / args.max_steps_threads_as_demo
                if demo_rate < 0.0333:
                    demo_rate = 0.0333

                if np.random.random() <= demo_rate and num_demo_thread < 16:
                    ctr_demo_thread += 1
                    training_thread.replay_mem_reset(
                        D_idx=ctr_demo_thread % num_demos)
                    num_demo_thread += 1
                    logger.info(
                        "idx={} as demo thread started ({}/16) rate={}".format(
                            parallel_index, num_demo_thread, demo_rate))
                    use_demo_thread = True

            if use_demo_thread:
                diff_global_t, episode_end = training_thread.demo_process(
                    sess, global_t)
                if episode_end:
                    num_demo_thread -= 1
                    use_demo_thread = False
                    logger.info("idx={} demo thread concluded ({}/16)".format(
                        parallel_index, num_demo_thread))
            else:
                diff_global_t, episode_end = training_thread.process(
                    sess, global_t, rewards)

            # Advance one step at a time so no eval/checkpoint boundary
            # is skipped.
            for _ in range(diff_global_t):
                global_t += 1
                if global_t % args.eval_freq == 0:
                    temp_global_t = global_t
                    # ``with`` releases the lock on every exit path,
                    # including the ``continue`` below.
                    with lock:
                        # catch multiple threads getting in at the same time
                        if last_temp_global_t == temp_global_t:
                            logger.info("Threading race problem averted!")
                            continue
                        test_lock = True
                        test_reward, test_steps, n_episodes = training_thread.testing(
                            sess, args.eval_max_steps, temp_global_t, folder,
                            demo_memory_cam=demo_memory_cam)
                        rewards['eval'][temp_global_t] = (test_reward,
                                                          test_steps,
                                                          n_episodes)
                        if temp_global_t % (
                                (args.max_time_step
                                 * args.max_time_step_fraction) // 5) == 0:
                            saver.save(
                                sess, folder + '/model_checkpoints/'
                                + '{}_checkpoint'.format(
                                    args.gym_env.replace('-', '_')),
                                global_step=temp_global_t,
                                write_meta_graph=False)
                        if test_reward > best_model_reward:
                            save_best_model(test_reward)
                        test_lock = False
                        last_temp_global_t = temp_global_t
                if global_t % (
                        (args.max_time_step
                         * args.max_time_step_fraction) // 5) == 0:
                    saver.save(
                        sess, folder + '/model_checkpoints/'
                        + '{}_checkpoint'.format(
                            args.gym_env.replace('-', '_')),
                        global_step=global_t, write_meta_graph=False)
                # all threads wait until evaluation finishes
                while not stop_requested and test_lock:
                    time.sleep(0.01)

    def signal_handler(signum, frame):
        """Request a cooperative stop; exit at once if training never began."""
        nonlocal stop_requested
        logger.info('You pressed Ctrl+C!')
        stop_requested = True
        if stop_requested and global_t == 0:
            sys.exit(1)

    def save_best_model(test_reward):
        """Record ``test_reward`` as the best reward and save the model."""
        nonlocal best_model_reward
        best_model_reward = test_reward
        with open(folder + '/model_best/best_model_reward',
                  'w') as f_best_model_reward:
            f_best_model_reward.write(str(best_model_reward))
        best_saver.save(
            sess, folder + '/model_best/'
            + '{}_checkpoint'.format(args.gym_env.replace('-', '_')))

    train_threads = []
    for i in range(args.parallel_size):
        train_threads.append(
            threading.Thread(target=train_function, args=(i, )))

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # set start time
    start_time = time.time() - wall_t

    for t in train_threads:
        t.start()

    print('Press Ctrl+C to stop')

    for t in train_threads:
        t.join()

    logger.info('Now saving data. Please wait')

    # write wall time
    wall_t = time.time() - start_time
    wall_t_fname = folder + '/' + 'wall_t.' + str(global_t)
    with open(wall_t_fname, 'w') as f:
        f.write(str(wall_t))
    with open(folder + '/pretrain_global_t', 'w') as f:
        f.write(str(pretrain_global_t))

    root_saver.save(
        sess,
        folder + '/{}_checkpoint_a3c'.format(args.gym_env.replace('-', '_')),
        global_step=global_t)

    # The context manager flushes and closes the file before 'Data saved!'.
    rewards_fname = (folder + '/' + args.gym_env.replace('-', '_')
                     + '-a3c-rewards.pkl')
    with open(rewards_fname, 'wb') as f_rewards:
        pickle.dump(rewards, f_rewards, pickle.HIGHEST_PROTOCOL)
    logger.info('Data saved!')

    sess.close()
import matplotlib.pyplot as plt

from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
from a3c_training_thread import A3CTrainingThread
from rmsprop_applier import RMSPropApplier

import options
# Rebind the name from the module to its ``options`` attribute.
options = options.options

# use CPU for weight visualize tool
device = "/cpu:0"

# The two network classes take different constructor arguments: only the
# LSTM variant receives the thread index (-1 for the global network).
if not options.use_lstm:
    global_network = GameACFFNetwork(options.action_size, device)
else:
    global_network = GameACLSTMNetwork(options.action_size, -1, device)

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(
    learning_rate=learning_rate_input,
    decay=options.rmsp_alpha,
    momentum=0.0,
    epsilon=options.rmsp_epsilon,
    clip_norm=options.grad_norm_clip,
    device=device,
)

# Create the session and initialize every global variable.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
def run_a3c_test(args):
    """Run A3C testing.

    Builds a global and a local actor-critic network, optionally loads
    transferred weights, restores the checkpoint reported by
    ``tf.train.get_checkpoint_state`` for the results folder (if any), and
    then calls ``testing_thread.testing_model`` once on a worker thread.
    SIGINT/SIGTERM handlers are installed for the duration of the run.

    Args:
        args: namespace of run options. Attributes read here include
            ``gym_env``, ``use_gpu``, ``cuda_devices``, ``gpu_fraction``,
            ``folder``, ``padding``, ``use_lstm``, ``use_mnih_2015``,
            ``use_transfer``, ``transfer_folder``, ``load_demo_cam`` and
            ``eval_max_steps``, plus the flags that only contribute a
            suffix to the results folder name.

    Side effects:
        Sets ``CUDA_VISIBLE_DEVICES``, creates ``results/a3c`` if missing,
        assigns configuration onto ``A3CTrainingThread`` class attributes,
        prepares ``results/test_model/a3c/<name>`` and replaces the
        process SIGINT/SIGTERM handlers.
    """
    GYM_ENV_NAME = args.gym_env.replace('-', '_')

    # CUDA_VISIBLE_DEVICES is set before TensorFlow is imported below.
    if args.use_gpu:
        assert args.cuda_devices != ''
        os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    import tensorflow as tf

    if not os.path.exists('results/a3c'):
        os.makedirs('results/a3c')

    # Results folder: either given explicitly, or derived from the
    # environment name plus one suffix per enabled option.
    if args.folder is not None:
        folder = args.folder
    else:
        folder = 'results/a3c/{}'.format(GYM_ENV_NAME)
        end_str = ''

        if args.use_mnih_2015:
            end_str += '_mnih2015'
        if args.use_lstm:
            end_str += '_lstm'
        if args.unclipped_reward:
            end_str += '_rawreward'
        elif args.log_scale_reward:
            end_str += '_logreward'
        if args.transformed_bellman:
            end_str += '_transformedbell'

        if args.use_transfer:
            end_str += '_transfer'
            if args.not_transfer_conv2:
                end_str += '_noconv2'
            elif args.not_transfer_conv3 and args.use_mnih_2015:
                end_str += '_noconv3'
            elif args.not_transfer_fc1:
                end_str += '_nofc1'
            elif args.not_transfer_fc2:
                end_str += '_nofc2'
        if args.finetune_upper_layers_only:
            end_str += '_tune_upperlayers'
        if args.train_with_demo_num_steps > 0 \
           or args.train_with_demo_num_epochs > 0:
            end_str += '_pretrain_ina3c'
        if args.use_demo_threads:
            end_str += '_demothreads'

        if args.load_pretrained_model:
            if args.use_pretrained_model_as_advice:
                end_str += '_modelasadvice'
            if args.use_pretrained_model_as_reward_shaping:
                end_str += '_modelasshaping'

        if args.padding == 'SAME':
            end_str += '_same'

        folder += end_str

    folder = pathlib.Path(folder)

    # Optional demonstration frames, later handed to testing_model as
    # `demo_memory_cam`.
    demo_memory_cam = None
    demo_cam_human = False
    if args.load_demo_cam:
        if args.demo_memory_folder is not None:
            demo_memory_folder = args.demo_memory_folder
        else:
            demo_memory_folder = 'collected_demo/{}'.format(GYM_ENV_NAME)

        demo_memory_folder = pathlib.Path(demo_memory_folder)

        if args.demo_cam_id is not None:
            # A demo id was given: load that demo through load_memory.
            demo_cam_human = True
            demo_cam, _, total_rewards_cam, _ = load_memory(
                name=None,
                demo_memory_folder=demo_memory_folder,
                demo_ids=args.demo_cam_id,
                imgs_normalized=False)

            demo_cam = demo_cam[int(args.demo_cam_id)]
            logger.info("loaded demo {} for testing CAM".format(
                args.demo_cam_id))

        else:
            # No id: load the replay memory named 'test_cam' from
            # args.demo_cam_folder.
            demo_cam_folder = pathlib.Path(args.demo_cam_folder)
            demo_cam = ReplayMemory()
            demo_cam.load(name='test_cam', folder=demo_cam_folder)
            logger.info("loaded demo {} for testing CAM".format(
                str(demo_cam_folder / 'test_cam')))

        # Copy the first element (s0) of every entry into one float32 array
        # of shape (len, height, width, phi_length).
        demo_memory_cam = np.zeros(
            (len(demo_cam),
             demo_cam.height,
             demo_cam.width,
             demo_cam.phi_length),
            dtype=np.float32)

        for i in range(len(demo_cam)):
            s0, _, _, _, _, _, t1, _ = demo_cam[i]
            demo_memory_cam[i] = np.copy(s0)

        del demo_cam

    device = "/cpu:0"
    gpu_options = None
    if args.use_gpu:
        # NOTE(review): the device string is built from the raw
        # CUDA_VISIBLE_DEVICES value; confirm behaviour when that value
        # lists more than one device.
        device = "/gpu:"+os.environ["CUDA_VISIBLE_DEVICES"]
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_fraction)

    initial_learning_rate = args.initial_learn_rate
    logger.info('Initial Learning Rate={}'.format(initial_learning_rate))
    time.sleep(2)

    global_t = 0
    stop_requested = False

    # A GameState is created to read the size of the action space.
    game_state = GameState(env_id=args.gym_env)
    action_size = game_state.env.action_space.n
    config = tf.ConfigProto(
        gpu_options=gpu_options,
        log_device_placement=False,
        allow_soft_placement=True)

    # 'VALID' padding uses 84x84x4 input; anything else uses 88x88x4.
    input_shape = (84, 84, 4) if args.padding == 'VALID' else (88, 88, 4)
    if args.use_lstm:
        GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015
        global_network = GameACFFNetwork(
            action_size, -1, device, padding=args.padding,
            in_shape=input_shape)

    learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr")

    grad_applier = tf.train.RMSPropOptimizer(
        learning_rate=learning_rate_input,
        decay=args.rmsp_alpha,
        epsilon=args.rmsp_epsilon)

    # Configuration is set on the A3CTrainingThread class itself, before
    # the single testing instance is constructed below.
    A3CTrainingThread.log_interval = args.log_interval
    A3CTrainingThread.performance_log_interval = args.performance_log_interval
    A3CTrainingThread.local_t_max = args.local_t_max
    A3CTrainingThread.demo_t_max = args.demo_t_max
    A3CTrainingThread.use_lstm = args.use_lstm
    A3CTrainingThread.action_size = action_size
    A3CTrainingThread.entropy_beta = args.entropy_beta
    A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta
    A3CTrainingThread.gamma = args.gamma
    A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015
    A3CTrainingThread.env_id = args.gym_env
    A3CTrainingThread.finetune_upper_layers_only = \
        args.finetune_upper_layers_only
    A3CTrainingThread.transformed_bellman = args.transformed_bellman
    A3CTrainingThread.clip_norm = args.grad_norm_clip
    A3CTrainingThread.use_grad_cam = args.use_grad_cam

    if args.unclipped_reward:
        A3CTrainingThread.reward_type = "RAW"
    elif args.log_scale_reward:
        A3CTrainingThread.reward_type = "LOG"
    else:
        A3CTrainingThread.reward_type = "CLIP"

    if args.use_lstm:
        local_network = GameACLSTMNetwork(action_size, 0, device)
    else:
        local_network = GameACFFNetwork(
            action_size, 0, device, padding=args.padding,
            in_shape=input_shape)

    testing_thread = A3CTrainingThread(
        0, global_network, local_network, initial_learning_rate,
        learning_rate_input,
        grad_applier, 0,
        device=device)

    # prepare session
    sess = tf.Session(config=config)

    if args.use_transfer:
        # Locate the transfer model: explicit folder, or the default
        # pretrain folder with its naming suffixes.
        if args.transfer_folder is not None:
            transfer_folder = args.transfer_folder
        else:
            transfer_folder = 'results/pretrain_models/{}'.format(GYM_ENV_NAME)
            end_str = ''
            if args.use_mnih_2015:
                end_str += '_mnih2015'
            end_str += '_l2beta1E-04_batchprop'  # TODO: make this an argument
            transfer_folder += end_str

        transfer_folder = pathlib.Path(transfer_folder)
        transfer_folder /= 'transfer_model'

        # Choose which global-network variables are handed to
        # load_transfer_model; each not_transfer_* flag shortens the list.
        if args.not_transfer_conv2:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
            ]

        elif (args.not_transfer_conv3 and args.use_mnih_2015):
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
            ]

        elif args.not_transfer_fc1:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
            ]

            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3,
                    global_network.b_conv3,
                ]

        elif args.not_transfer_fc2:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                global_network.W_fc1,
                global_network.b_fc1,
            ]

            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3,
                    global_network.b_conv3,
                ]

        else:
            transfer_var_list = [
                global_network.W_conv1,
                global_network.b_conv1,
                global_network.W_conv2,
                global_network.b_conv2,
                global_network.W_fc1,
                global_network.b_fc1,
                global_network.W_fc2,
                global_network.b_fc2,
            ]

            if args.use_mnih_2015:
                transfer_var_list += [
                    global_network.W_conv3,
                    global_network.b_conv3,
                ]

        global_network.load_transfer_model(
            sess, folder=transfer_folder,
            not_transfer_fc2=args.not_transfer_fc2,
            not_transfer_fc1=args.not_transfer_fc1,
            not_transfer_conv3=(args.not_transfer_conv3
                                and args.use_mnih_2015),
            not_transfer_conv2=args.not_transfer_conv2,
            var_list=transfer_var_list,
        )

    def initialize_uninitialized(sess):
        """Run the initializer only for variables not yet initialized."""
        global_vars = tf.global_variables()
        # Despite its name, this holds True for variables that ARE
        # initialized; the `not f` filter below selects the rest.
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f]

        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    # With transfer, only the still-uninitialized variables are initialized;
    # otherwise everything is initialized.
    if args.use_transfer:
        initialize_uninitialized(sess)
    else:
        sess.run(tf.global_variables_initializer())

    # init or load checkpoint with saver
    root_saver = tf.train.Saver(max_to_keep=1)
    checkpoint = tf.train.get_checkpoint_state(str(folder))
    if checkpoint and checkpoint.model_checkpoint_path:
        root_saver.restore(sess, checkpoint.model_checkpoint_path)
        logger.info("checkpoint loaded:{}".format(
            checkpoint.model_checkpoint_path))
        # The global step is parsed from the text after the last '-' in the
        # checkpoint path.
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        global_t = int(tokens[-1])
        logger.info(">>> global step set: {}".format(global_t))
    else:
        logger.warning("Could not find old checkpoint")

    def test_function():
        """Prepare the output folders and run one evaluation."""
        nonlocal global_t

        # Name the output folder after the transfer model's parent
        # directory, or after the results folder itself.
        if args.use_transfer:
            from_folder = str(transfer_folder).split('/')[-2]
        else:
            from_folder = str(folder).split('/')[-1]

        from_folder = pathlib.Path(from_folder)
        # str / Path yields a Path (pathlib implements the reflected '/').
        save_folder = 'results/test_model/a3c' / from_folder
        prepare_dir(str(save_folder), empty=False)
        prepare_dir(str(save_folder / 'frames'), empty=False)

        # Run the evaluation unless a stop was requested beforehand.
        if not stop_requested:
            testing_thread.testing_model(
                sess, args.eval_max_steps, global_t, save_folder,
                demo_memory_cam=demo_memory_cam, demo_cam_human=demo_cam_human)

    def signal_handler(signal, frame):
        """Record the stop request; exit at once if global_t is still 0."""
        nonlocal stop_requested
        logger.info('You pressed Ctrl+C!')
        stop_requested = True

        if stop_requested and global_t == 0:
            sys.exit(1)

    test_thread = threading.Thread(target=test_function, args=())

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    test_thread.start()
    print('Press Ctrl+C to stop')
    test_thread.join()

    sess.close()
def display(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip,
            agent_type, action_size, rand_seed, checkpoint_dir,
            display_time_sleep, display_episodes, display_log_level,
            display_save_log, show_max):
    """Play `display_episodes` episodes with the restored global network.

    Builds the global network, restores the checkpoint found in
    `checkpoint_dir` (if any), plays the requested number of episodes and
    prints per-episode results followed by min / average / max statistics.

    Args:
        experiment_name: used, with `agent_type`, to name the log file
            ``log_<experiment_name>-<agent_type>.txt``.
        rmsp_alpha: `decay` passed to `RMSPropApplier`.
        rmsp_epsilon: `epsilon` passed to `RMSPropApplier`.
        grad_norm_clip: `clip_norm` passed to `RMSPropApplier`.
        agent_type: ``'LSTM'`` selects `GameACLSTMNetwork`; anything else
            selects `GameACFFNetwork`.
        action_size: number of actions, passed to the network and GameState.
        rand_seed: seed passed to every `GameState`.
        checkpoint_dir: directory searched for a checkpoint.
        display_time_sleep: value passed to `time.sleep` after every step.
        display_episodes: number of episodes to play.
        display_log_level: ``'FULL'`` prints every step, ``'NONE'``
            suppresses the per-episode line, any other value prints one
            line per episode.
        display_save_log: when true, append one CSV line per episode to the
            log file.
        show_max: when true, keep `game_state.full_frame` from the episode
            that passed the most obstacles and show it with matplotlib at
            the end.
    """
    # use CPU for display tool
    device = "/cpu:0"

    LOG_FILE = 'log_{}-{}.txt'.format(experiment_name, agent_type)

    if agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        global_network = GameACFFNetwork(action_size, -1, device)

    # NOTE(review): neither object below is used afterwards in this
    # function; they are kept because it is not visible from here whether
    # constructing them affects the graph the Saver restores.
    learning_rate_input = tf.placeholder("float")

    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=rmsp_epsilon,
                                  clip_norm=grad_norm_clip,
                                  device=device)

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        # Single-argument print so the output is the same on Python 2 and 3.
        print("checkpoint loaded: {}".format(
            checkpoint.model_checkpoint_path))
    else:
        print("Could not find old checkpoint")

    episode = 0

    episode_rewards = []
    episode_steps = []
    episode_passed_obsts = []

    # Best frame across ALL episodes. It must live outside the episode loop,
    # otherwise every new episode discards the best frame found so far.
    full_frame = None

    print(' ')
    print('DISPLAYING {} EPISODES'.format(display_episodes))
    print('--------------------------------------------------- ')

    while not episode == display_episodes:
        episode_reward = 0
        episode_passed_obst = 0

        game_state = GameState(rand_seed, action_size, show_score=True)

        if display_log_level == 'FULL':
            print('EPISODE {}'.format(episode))

        while True:
            pi_values, value = global_network.run_policy_and_value(
                sess, game_state.s_t)
            action = choose_action(pi_values)
            game_state.process(action)
            terminal = game_state.terminal
            episode_step = game_state.steps
            reward = game_state.reward
            passed_obst = game_state.passed_obst

            # Only touch np.max when show_max is on AND there is at least
            # one finished episode: np.max of an empty list raises.
            if show_max:
                if not episode_passed_obsts:
                    if passed_obst > 0:
                        full_frame = game_state.full_frame
                elif episode_passed_obst > np.max(episode_passed_obsts):
                    full_frame = game_state.full_frame

            episode_reward += reward
            episode_passed_obst = passed_obst

            if display_log_level == 'FULL':
                print('step / pi_values: {} / value: {} / action: {} / reward: {} / passed_obst: {}'.format(
                    pi_values, value, action, reward, passed_obst))

            time.sleep(display_time_sleep)

            if not terminal:
                game_state.update()
            else:
                break

        episode_rewards.append(episode_reward)
        episode_steps.append(episode_step)
        episode_passed_obsts.append(episode_passed_obst)

        # Computed unconditionally: the log file needs it even when the
        # console line is suppressed with log level 'NONE'.
        reward_steps = format(
            float(episode_reward) / float(episode_step), '.4f')

        if not display_log_level == 'NONE':
            print("EPISODE: {} / STEPS: {} / PASSED OBST: {} / REWARD: {} / REWARD/STEP: {}".format(
                episode, episode_step, episode_passed_obst, episode_reward,
                reward_steps))

        if display_save_log:
            with open(LOG_FILE, "a") as text_file:
                text_file.write('{},{},{},{},{}\n'.format(
                    episode, episode_step, episode_passed_obst,
                    episode_reward, reward_steps))

        episode += 1

    print('--------------------------------------------------- ')
    print('DISPLAY SESSION FINISHED')
    print('TOTAL EPISODES: {}'.format(display_episodes))

    # np.min / np.max raise on empty input, so skip the statistics when no
    # episode was played.
    if episode_rewards:
        print(' ')
        print('MIN')
        print('REWARD: {} / STEPS: {} / PASSED OBST: {}'.format(
            np.min(episode_rewards), np.min(episode_steps),
            np.min(episode_passed_obsts)))
        print(' ')
        print('AVERAGE')
        print('REWARD: {} / STEPS: {} / PASSED OBST: {}'.format(
            np.average(episode_rewards), np.average(episode_steps),
            np.average(episode_passed_obsts)))
        print(' ')
        print('MAX')
        print('REWARD: {} / STEPS: {} / PASSED OBST: {}'.format(
            np.max(episode_rewards), np.max(episode_steps),
            np.max(episode_passed_obsts)))

    # Identity test: `== None` on an array compares element by element.
    if show_max and full_frame is not None:
        plt.imshow(full_frame, origin='lower')
        plt.show()
if not settings.mode == 'display' and not settings.mode == 'visualize': device = "/cpu:0" if settings.use_gpu: device = "/gpu:0" initial_learning_rates = log_uniform(settings.initial_alpha_low, settings.initial_alpha_high, settings.parallel_agent_size) global_t = 0 stop_requested = False if settings.agent_type == 'LSTM': global_network = GameACLSTMNetwork(settings.action_size, -1, device) else: global_network = GameACFFNetwork(settings.action_size, -1, device) training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=settings.rmsp_alpha, momentum=0.0, epsilon=settings.rmsp_epsilon, clip_norm=settings.grad_norm_clip, device=device) for i in range(settings.parallel_agent_size): training_thread = A3CTrainingThread( i, global_network, initial_learning_rates[i], learning_rate_input,
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step,
             device, options):
    """Set up one training thread: local network, trainer ops and game.

    Args:
        thread_index: index of this thread; also selects the per-thread
            entry of `options.tes_list` when that list is set.
        global_network: shared network whose variables receive the
            accumulated gradients and from which the local one syncs.
        initial_learning_rate: stored as `self.initial_learning_rate`.
        learning_rate_input: stored as `self.learning_rate_input`.
        grad_applier: object whose `apply_gradients` builds the update op.
        max_global_time_step: stored as `self.max_global_time_step`.
        device: device string handed to the network and the trainer.
        options: option object; stored as `self.options`.
    """
    opts = options

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = opts

    # Local network: LSTM or feed-forward, depending on the options.
    if opts.use_lstm:
        local_net = GameACLSTMNetwork(opts.action_size, thread_index, device)
    else:
        local_net = GameACFFNetwork(opts.action_size, device)
    self.local_network = local_net
    local_net.prepare_loss(opts.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    trainer = AccumTrainer(device)
    self.trainer = trainer
    trainer.prepare_minimize(local_net.total_loss, local_net.get_vars())

    self.accum_gradients = trainer.accumulate_gradients()
    self.reset_gradients = trainer.reset_gradients()

    # Accumulated local gradients are applied to the GLOBAL variables.
    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        trainer.get_accum_grad_list())

    self.sync = local_net.sync_from(global_network)

    seed = random.randint(0, 2**16)
    self.game_state = GameState(seed, opts, thread_index = thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    self.indent = " |" * thread_index
    self.steps = 0
    self.no_reward_steps = 0

    # Thread 0 never terminates on a lost life, whatever the option says.
    self.terminate_on_lives_lost = (
        opts.terminate_on_lives_lost and (thread_index != 0))

    if opts.train_episode_steps > 0:
        # Per-episode history buffers, only needed in this mode.
        self.max_reward = 0.0
        self.max_episode_reward = 0.0
        self.episode_states = []
        self.episode_actions = []
        self.episode_rewards = []
        self.episode_values = []
        self.episode_liveses = []
        self.episode_scores = Episode_scores(opts)

        self.tes = opts.train_episode_steps
        if opts.tes_list is not None:
            # A per-thread value overrides the common one.
            self.tes = opts.tes_list[thread_index]
            print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))

        self.initial_lives = self.game_state.initial_lives
        self.max_history = int(self.tes * opts.tes_extend_ratio * 2.1)

    # Both recording options behave the same way: thread 0 creates the
    # directory if it is missing, and the screen buffer is reset.
    for record_dir in (opts.record_new_record_dir, opts.record_new_room_dir):
        if record_dir is None:
            continue
        if thread_index == 0:
            if not os.path.exists(record_dir):
                os.makedirs(record_dir)
        self.episode_screens = []

    self.greediness = opts.greediness
    self.repeat_action_ratio = opts.repeat_action_ratio
    self.prev_action = 0
def visualize(experiment_name, rmsp_alpha, rmsp_epsilon, grad_norm_clip,
              agent_type, action_size, rand_seed, checkpoint_dir):
    """Plot the first two convolution layers of the restored network.

    Restores the checkpoint found in `checkpoint_dir` (if any), shows the
    `W_conv1` and `W_conv2` filters as image grids, then rebuilds both
    convolution layers from the fetched weights and passes them to
    `getActivations` together with the current game state.

    Args:
        experiment_name: not used inside this function.
        rmsp_alpha: `decay` passed to `RMSPropApplier`.
        rmsp_epsilon: `epsilon` passed to `RMSPropApplier`.
        grad_norm_clip: `clip_norm` passed to `RMSPropApplier`.
        agent_type: ``'LSTM'`` selects `GameACLSTMNetwork`; anything else
            selects `GameACFFNetwork`.
        action_size: number of actions, passed to the network and GameState.
        rand_seed: seed passed to `GameState`.
        checkpoint_dir: directory searched for a checkpoint.
    """

    def _show_filter_grid(weights, n_rows, n_cols, fig_size):
        # One subplot per (inch, outch) pair: the row index is `inch`, the
        # column index is `outch`, matching weights[:, :, inch, outch].
        fig, axes = plt.subplots(n_rows, n_cols, figsize=fig_size,
                                 subplot_kw={'xticks': [], 'yticks': []})
        fig.subplots_adjust(hspace=0.1, wspace=0.1)
        for idx, ax in enumerate(axes.flat):
            inch, outch = divmod(idx, n_cols)
            ax.imshow(weights[:, :, inch, outch],
                      cmap=plt.cm.gray, interpolation='nearest')
            ax.set_title(str(inch) + "," + str(outch))
        plt.show()

    # use CPU for weight visualize tool
    device = "/cpu:0"

    if agent_type == 'LSTM':
        global_network = GameACLSTMNetwork(action_size, -1, device)
    else:
        global_network = GameACFFNetwork(action_size, -1, device)

    training_threads = []

    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=rmsp_epsilon,
                                  clip_norm=grad_norm_clip,
                                  device=device)

    # Advance the game by one step with action 0 and draw its frame.
    game = GameState(rand_seed, action_size)
    game.process(0)
    plt.imshow(game.x_t, interpolation="nearest", cmap=plt.cm.gray)

    sess = tf.Session()
    sess.run(tf.initialize_all_variables())

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")

    # show graph of W_conv1 (4 x 16 grid)
    W_conv1 = sess.run(global_network.W_conv1)
    _show_filter_grid(W_conv1, 4, 16, (12, 6))

    # show graph of W_conv2 (2 x 32 grid, i.e. only inch 0 and 1)
    W_conv2 = sess.run(global_network.W_conv2)
    _show_filter_grid(W_conv2, 2, 32, (27, 6))

    # The fetched values are not used afterwards; the fetch is preserved.
    arr = sess.run(global_network.get_vars())

    # Rebuild conv1 / conv2 from the fetched weights and biases so their
    # activations can be evaluated for an arbitrary input batch.
    s = tf.placeholder("float", [None, 84, 84, 4])

    b_conv1 = sess.run(global_network.b_conv1)
    b_conv2 = sess.run(global_network.b_conv2)

    h_conv1 = tf.nn.relu(
        tf.nn.conv2d(s, W_conv1, strides=[1, 4, 4, 1], padding="VALID")
        + b_conv1)
    h_conv2 = tf.nn.relu(
        tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding="VALID")
        + b_conv2)

    s_t = game.s_t

    getActivations(sess, s, h_conv1, s_t, 16)
    getActivations(sess, s, h_conv2, s_t, 32)
def run_a3c(args): """Run A3C experiment.""" GYM_ENV_NAME = args.gym_env.replace('-', '_') GAME_NAME = args.gym_env.replace('NoFrameskip-v4','') # setup folder name and path to folder folder = pathlib.Path(setup_folder(args, GYM_ENV_NAME)) # setup GPU (if applicable) import tensorflow as tf gpu_options = setup_gpu(tf, args.use_gpu, args.gpu_fraction) ###################################################### # setup default device device = "/cpu:0" global_t = 0 rewards = {'train': {}, 'eval': {}} best_model_reward = -(sys.maxsize) if args.load_pretrained_model: class_rewards = {'class_eval': {}} # setup logging info for analysis, see Section 4.2 of the paper sil_dict = { # count number of SIL updates "sil_ctr":{}, # total number of butter D sampled during SIL "sil_a3c_sampled":{}, # total number of buffer D samples (i.e., generated by A3C workers) used during SIL (i.e., passed max op) "sil_a3c_used":{}, # the return of used samples for buffer D "sil_a3c_used_return":{}, # total number of buffer R sampled during SIL "sil_rollout_sampled":{}, # total number of buffer R samples (i.e., generated by refresher worker) used during SIL (i.e., passed max op) "sil_rollout_used":{}, # the return of used samples for buffer R "sil_rollout_used_return":{}, # number of old samples still used (even after refreshing) "sil_old_used":{} } sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return = 0, 0, 0, 0 sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return = 0, 0, 0 sil_old_used = 0 rollout_dict = { # total number of rollout performed "rollout_ctr": {}, # total number of successful rollout (i.e., Gnew > G) "rollout_added_ctr":{}, # the return of Gnew "rollout_new_return":{}, # the return of G "rollout_old_return":{} } rollout_ctr, rollout_added_ctr = 0, 0 rollout_new_return = 0 # this records the total, avg = this / rollout_added_ctr rollout_old_return = 0 # this records the total, avg = this / rollout_added_ctr # setup file names reward_fname = folder / 
'{}-a3c-rewards.pkl'.format(GYM_ENV_NAME) sil_fname = folder / '{}-a3c-dict-sil.pkl'.format(GYM_ENV_NAME) rollout_fname = folder / '{}-a3c-dict-rollout.pkl'.format(GYM_ENV_NAME) if args.load_pretrained_model: class_reward_fname = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME) sharedmem_fname = folder / '{}-sharedmem.pkl'.format(GYM_ENV_NAME) sharedmem_params_fname = folder / '{}-sharedmem-params.pkl'.format(GYM_ENV_NAME) sharedmem_trees_fname = folder / '{}-sharedmem-trees.pkl'.format(GYM_ENV_NAME) rolloutmem_fname = folder / '{}-rolloutmem.pkl'.format(GYM_ENV_NAME) rolloutmem_params_fname = folder / '{}-rolloutmem-params.pkl'.format(GYM_ENV_NAME) rolloutmem_trees_fname = folder / '{}-rolloutmem-trees.pkl'.format(GYM_ENV_NAME) # for removing older ckpt, save mem space prev_ckpt_t = -1 stop_req = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n game_state.close() del game_state.env del game_state input_shape = (args.input_shape, args.input_shape, 4) ####################################################### # setup global A3C GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork( action_size, -1, device, padding=args.padding, in_shape=input_shape) logger.info('A3C Initial Learning Rate={}'.format(args.initial_learn_rate)) # setup pretrained model global_pretrained_model = None local_pretrained_model = None pretrain_graph = None # if use pretrained model to refresh # then must load pretrained model # otherwise, don't load model if args.use_lider and args.nstep_bc > 0: assert args.load_pretrained_model, "refreshing with other policies, must load a pre-trained model (TA or BC)" else: assert not args.load_pretrained_model, "refreshing with the current policy, don't load pre-trained models" if args.load_pretrained_model: pretrain_graph, global_pretrained_model = setup_pretrained_model(tf, args, action_size, input_shape, device="/gpu:0" if args.use_gpu else device) assert global_pretrained_model 
is not None assert pretrain_graph is not None time.sleep(2.0) # setup experience memory shared_memory = None # => this is BufferD rollout_buffer = None # => this is BufferR if args.use_sil: shared_memory = SILReplayMemory( action_size, max_len=args.memory_length, gamma=args.gamma, clip=False if args.unclipped_reward else True, height=input_shape[0], width=input_shape[1], phi_length=input_shape[2], priority=args.priority_memory, reward_constant=args.reward_constant) if args.use_lider and not args.onebuffer: rollout_buffer = SILReplayMemory( action_size, max_len=args.memory_length, gamma=args.gamma, clip=False if args.unclipped_reward else True, height=input_shape[0], width=input_shape[1], phi_length=input_shape[2], priority=args.priority_memory, reward_constant=args.reward_constant) # log memory information shared_memory.log() if args.use_lider and not args.onebuffer: rollout_buffer.log() ############## Setup Thread Workers BEGIN ################ # 17 total number of threads for all experiments assert args.parallel_size ==17, "use 17 workers for all experiments" startIndex = 0 all_workers = [] # a3c and sil learning rate and optimizer learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer( learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) setup_common_worker(CommonWorker, args, action_size) # setup SIL worker sil_worker = None if args.use_sil: _device = "/gpu:0" if args.use_gpu else device sil_network = GameACFFNetwork( action_size, startIndex, device=_device, padding=args.padding, in_shape=input_shape) sil_worker = SILTrainingThread(startIndex, global_network, sil_network, args.initial_learn_rate, learning_rate_input, grad_applier, device=_device, batch_size=args.batch_size, use_rollout=args.use_lider, one_buffer=args.onebuffer, sampleR=args.sampleR) all_workers.append(sil_worker) startIndex += 1 # setup refresh worker refresh_worker = None if args.use_lider: _device = 
"/gpu:0" if args.use_gpu else device refresh_network = GameACFFNetwork( action_size, startIndex, device=_device, padding=args.padding, in_shape=input_shape) refresh_local_pretrained_model = None # if refreshing with other polies if args.nstep_bc > 0: refresh_local_pretrained_model = PretrainedModelNetwork( pretrain_graph, action_size, startIndex, padding=args.padding, in_shape=input_shape, sae=False, tied_weights=False, use_denoising=False, noise_factor=0.3, loss_function='mse', use_slv=False, device=_device) refresh_worker = RefreshThread( thread_index=startIndex, action_size=action_size, env_id=args.gym_env, global_a3c=global_network, local_a3c=refresh_network, update_in_rollout=args.update_in_rollout, nstep_bc=args.nstep_bc, global_pretrained_model=global_pretrained_model, local_pretrained_model=refresh_local_pretrained_model, transformed_bellman = args.transformed_bellman, device=_device, entropy_beta=args.entropy_beta, clip_norm=args.grad_norm_clip, grad_applier=grad_applier, initial_learn_rate=args.initial_learn_rate, learning_rate_input=learning_rate_input) all_workers.append(refresh_worker) startIndex += 1 # setup a3c workers setup_a3c_worker(A3CTrainingThread, args, startIndex) for i in range(startIndex, args.parallel_size): local_network = GameACFFNetwork( action_size, i, device="/cpu:0", padding=args.padding, in_shape=input_shape) a3c_worker = A3CTrainingThread( i, global_network, local_network, args.initial_learn_rate, learning_rate_input, grad_applier, device="/cpu:0", no_op_max=30) all_workers.append(a3c_worker) ############## Setup Thread Workers END ################ # setup config for tensorflow config = tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) # prepare sessions sess = tf.Session(config=config) pretrain_sess = None if global_pretrained_model: pretrain_sess = tf.Session(config=config, graph=pretrain_graph) # initial pretrained model if pretrain_sess: assert args.pretrained_model_folder is not 
None global_pretrained_model.load( pretrain_sess, args.pretrained_model_folder) sess.run(tf.global_variables_initializer()) if global_pretrained_model: initialize_uninitialized(tf, pretrain_sess, global_pretrained_model) if local_pretrained_model: initialize_uninitialized(tf, pretrain_sess, local_pretrained_model) # summary writer for tensorboard summ_file = args.save_to+'log/a3c/{}/'.format(GYM_ENV_NAME) + str(folder)[58:] # str(folder)[12:] summary_writer = tf.summary.FileWriter(summ_file, sess.graph) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) saver = tf.train.Saver(max_to_keep=1) best_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(str(folder)+'/model_checkpoints') if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) tmp_t = (global_t // args.eval_freq) * args.eval_freq logger.info(">>> tmp_t: {}".format(tmp_t)) # set wall time wall_t = 0. 
# set up reward files best_reward_file = folder / 'model_best/best_model_reward' with best_reward_file.open('r') as f: best_model_reward = float(f.read()) # restore rewards rewards = restore_dict(reward_fname, global_t) logger.info(">>> restored: rewards") # restore loggings sil_dict = restore_dict(sil_fname, global_t) sil_ctr = sil_dict['sil_ctr'][tmp_t] sil_a3c_sampled = sil_dict['sil_a3c_sampled'][tmp_t] sil_a3c_used = sil_dict['sil_a3c_used'][tmp_t] sil_a3c_used_return = sil_dict['sil_a3c_used_return'][tmp_t] sil_rollout_sampled = sil_dict['sil_rollout_sampled'][tmp_t] sil_rollout_used = sil_dict['sil_rollout_used'][tmp_t] sil_rollout_used_return = sil_dict['sil_rollout_used_return'][tmp_t] sil_old_used = sil_dict['sil_old_used'][tmp_t] logger.info(">>> restored: sil_dict") rollout_dict = restore_dict(rollout_fname, global_t) rollout_ctr = rollout_dict['rollout_ctr'][tmp_t] rollout_added_ctr = rollout_dict['rollout_added_ctr'][tmp_t] rollout_new_return = rollout_dict['rollout_new_return'][tmp_t] rollout_old_return = rollout_dict['rollout_old_return'][tmp_t] logger.info(">>> restored: rollout_dict") if args.load_pretrained_model: class_reward_file = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME) class_rewards = restore_dict(class_reward_file, global_t) # restore replay buffers (if saved) if args.checkpoint_buffer: # restore buffer D if args.use_sil and args.priority_memory: shared_memory = restore_buffer(sharedmem_fname, shared_memory, global_t) shared_memory = restore_buffer_trees(sharedmem_trees_fname, shared_memory, global_t) shared_memory = restore_buffer_params(sharedmem_params_fname, shared_memory, global_t) logger.info(">>> restored: shared_memory (Buffer D)") shared_memory.log() # restore buffer R if args.use_lider and not args.onebuffer: rollout_buffer = restore_buffer(rolloutmem_fname, rollout_buffer, global_t) rollout_buffer = restore_buffer_trees(rolloutmem_trees_fname, rollout_buffer, global_t) rollout_buffer = 
restore_buffer_params(rolloutmem_params_fname, rollout_buffer, global_t) logger.info(">>> restored: rollout_buffer (Buffer R)") rollout_buffer.log() # if all restores okay, remove old ckpt to save storage space prev_ckpt_t = global_t else: logger.warning("Could not find old checkpoint") wall_t = 0.0 prepare_dir(folder, empty=True) prepare_dir(folder / 'model_checkpoints', empty=True) prepare_dir(folder / 'model_best', empty=True) prepare_dir(folder / 'frames', empty=True) lock = threading.Lock() # next saving global_t def next_t(current_t, freq): return np.ceil((current_t + 0.00001) / freq) * freq next_global_t = next_t(global_t, args.eval_freq) next_save_t = next_t( global_t, args.eval_freq*args.checkpoint_freq) step_t = 0 def train_function(parallel_idx, th_ctr, ep_queue, net_updates): nonlocal global_t, step_t, rewards, class_rewards, lock, \ next_save_t, next_global_t, prev_ckpt_t nonlocal shared_memory, rollout_buffer nonlocal sil_dict, sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return, \ sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return, \ sil_old_used nonlocal rollout_dict, rollout_ctr, rollout_added_ctr, \ rollout_new_return, rollout_old_return parallel_worker = all_workers[parallel_idx] parallel_worker.set_summary_writer(summary_writer) with lock: # Evaluate model before training if not stop_req and global_t == 0 and step_t == 0: rewards['eval'][step_t] = parallel_worker.testing( sess, args.eval_max_steps, global_t, folder, worker=all_workers[-1]) # testing pretrained TA or BC in game if args.load_pretrained_model: assert pretrain_sess is not None assert global_pretrained_model is not None class_rewards['class_eval'][step_t] = \ parallel_worker.test_loaded_classifier(global_t=global_t, max_eps=50, # testing 50 episodes sess=pretrain_sess, worker=all_workers[-1], model=global_pretrained_model) # log pretrained model performance class_eval_file = pathlib.Path(args.pretrained_model_folder[:21]+\ 
str(GAME_NAME)+"/"+str(GAME_NAME)+'-model-eval.txt') class_std = np.std(class_rewards['class_eval'][step_t][-1]) class_mean = np.mean(class_rewards['class_eval'][step_t][-1]) with class_eval_file.open('w') as f: f.write("class_mean: \n" + str(class_mean) + "\n") f.write("class_std: \n" + str(class_std) + "\n") f.write("class_rewards: \n" + str(class_rewards['class_eval'][step_t][-1]) + "\n") checkpt_file = folder / 'model_checkpoints' checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) saver.save(sess, str(checkpt_file), global_step=global_t) save_best_model(rewards['eval'][global_t][0]) # saving worker info to dicts for analysis sil_dict['sil_ctr'][step_t] = sil_ctr sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled sil_dict['sil_a3c_used'][step_t] = sil_a3c_used sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled sil_dict['sil_rollout_used'][step_t] = sil_rollout_used sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return sil_dict['sil_old_used'][step_t] = sil_old_used rollout_dict['rollout_ctr'][step_t] = rollout_ctr rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr rollout_dict['rollout_new_return'][step_t] = rollout_new_return rollout_dict['rollout_old_return'][step_t] = rollout_old_return # dump pickle dump_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname], global_t) if args.load_pretrained_model: dump_pickle([class_rewards], [class_reward_fname], global_t) logger.info('Dump pickle at step {}'.format(global_t)) # save replay buffer (only works under priority mem) if args.checkpoint_buffer: if shared_memory is not None and args.priority_memory: params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority] trees = [shared_memory.buff._it_sum._value, shared_memory.buff._it_min._value] dump_pickle([shared_memory.buff._storage, params, trees], [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], global_t) 
logger.info('Saving shared_memory') if rollout_buffer is not None and args.priority_memory: params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority] trees = [rollout_buffer.buff._it_sum._value, rollout_buffer.buff._it_min._value] dump_pickle([rollout_buffer.buff._storage, params, trees], [rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], global_t) logger.info('Saving rollout_buffer') prev_ckpt_t = global_t step_t = 1 # set start_time start_time = time.time() - wall_t parallel_worker.set_start_time(start_time) if parallel_worker.is_sil_thread: sil_interval = 0 # bigger number => slower SIL updates m_repeat = 4 min_mem = args.batch_size * m_repeat sil_train_flag = len(shared_memory) >= min_mem while True: if stop_req: return if global_t >= (args.max_time_step * args.max_time_step_fraction): return if parallel_worker.is_sil_thread: # before sil starts, init local count local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 if net_updates.qsize() >= sil_interval \ and len(shared_memory) >= min_mem: sil_train_flag = True if sil_train_flag: sil_train_flag = False th_ctr.get() train_out = parallel_worker.sil_train( sess, global_t, shared_memory, m_repeat, rollout_buffer=rollout_buffer) local_sil_ctr, local_sil_a3c_sampled, local_sil_a3c_used, \ local_sil_a3c_used_return, \ local_sil_rollout_sampled, local_sil_rollout_used, \ local_sil_rollout_used_return, \ local_sil_old_used = train_out th_ctr.put(1) with net_updates.mutex: net_updates.queue.clear() if args.use_lider: parallel_worker.record_sil(sil_ctr=sil_ctr, total_used=(sil_a3c_used + sil_rollout_used), num_a3c_used=sil_a3c_used, a3c_used_return=sil_a3c_used_return/(sil_a3c_used+1),#add one in case divide by zero rollout_used=sil_rollout_used, rollout_used_return=sil_rollout_used_return/(sil_rollout_used+1), 
old_used=sil_old_used, global_t=global_t) if sil_ctr % 200 == 0 and sil_ctr > 0: rollout_buffsize = 0 if not args.onebuffer: rollout_buffsize = len(rollout_buffer) log_data = (sil_ctr, len(shared_memory), rollout_buffsize, sil_a3c_used+sil_rollout_used, args.batch_size*sil_ctr, sil_a3c_used, sil_a3c_used_return/(sil_a3c_used+1), sil_rollout_used, sil_rollout_used_return/(sil_rollout_used+1), sil_old_used) logger.info("SIL: sil_ctr={0:}" " sil_memory_size={1:}" " rollout_buffer_size={2:}" " total_sample_used={3:}/{4:}" " a3c_used={5:}" " a3c_used_return_avg={6:.2f}" " rollout_used={7:}" " rollout_used_return_avg={8:.2f}" " old_used={9:}".format(*log_data)) else: parallel_worker.record_sil(sil_ctr=sil_ctr, total_used=(sil_a3c_used + sil_rollout_used), num_a3c_used=sil_a3c_used, rollout_used=sil_rollout_used, global_t=global_t) if sil_ctr % 200 == 0 and sil_ctr > 0: log_data = (sil_ctr, sil_a3c_used+sil_rollout_used, args.batch_size*sil_ctr, sil_a3c_used, len(shared_memory)) logger.info("SIL: sil_ctr={0:}" " total_sample_used={1:}/{2:}" " a3c_used={3:}" " sil_memory_size={4:}".format(*log_data)) # Adding episodes to SIL memory is centralize to ensure # sampling and updating of priorities does not become a problem # since we add new episodes to SIL at once and during # SIL training it is guaranteed that SIL memory is untouched. 
max = args.parallel_size while not ep_queue.empty(): data = ep_queue.get() parallel_worker.episode.set_data(*data) shared_memory.extend(parallel_worker.episode) parallel_worker.episode.reset() max -= 1 if max <= 0: # This ensures that SIL has a chance to train break diff_global_t = 0 # centralized rollout counting local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 elif parallel_worker.is_refresh_thread: # before refresh starts, init local count diff_global_t = 0 local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 if len(shared_memory) >= 1: th_ctr.get() # randomly sample a state from buffer D sample = shared_memory.sample_one_random() # after sample, flip refreshed to True # TODO: fix this so that only *succesful* refresh is flipped to True # currently counting *all* refresh as True assert sample[-1] == True train_out = parallel_worker.rollout(sess, folder, pretrain_sess, global_t, sample, args.addall, args.max_ep_step, args.nstep_bc, args.update_in_rollout) diff_global_t, episode_end, part_end, local_rollout_ctr, \ local_rollout_added_ctr, add, local_rollout_new_return, \ local_rollout_old_return = train_out th_ctr.put(1) if rollout_ctr % 20 == 0 and rollout_ctr > 0: log_msg = "ROLLOUT: rollout_ctr={} added_rollout_ct={} worker={}".format( rollout_ctr, rollout_added_ctr, parallel_worker.thread_idx) logger.info(log_msg) logger.info("ROLLOUT Gnew: {}, G: {}".format(local_rollout_new_return, local_rollout_old_return)) # should always part_end, i.e., end of episode # and only add if new return is better (if not LiDER-AddAll) if part_end and add: if not args.onebuffer: # directly put into Buffer R rollout_buffer.extend(parallel_worker.episode) else: # Buffer D add sample is centralized when OneBuffer ep_queue.put(parallel_worker.episode.get_data()) parallel_worker.episode.reset() # centralized SIL counting local_sil_ctr = 0 local_sil_a3c_sampled, 
local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 # a3c training thread worker else: th_ctr.get() train_out = parallel_worker.train(sess, global_t, rewards) diff_global_t, episode_end, part_end = train_out th_ctr.put(1) if args.use_sil: net_updates.put(1) if part_end: ep_queue.put(parallel_worker.episode.get_data()) parallel_worker.episode.reset() # centralized SIL counting local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 # centralized rollout counting local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 # ensure only one thread is updating global_t at a time with lock: global_t += diff_global_t # centralize increasing count for SIL and Rollout sil_ctr += local_sil_ctr sil_a3c_sampled += local_sil_a3c_sampled sil_a3c_used += local_sil_a3c_used sil_a3c_used_return += local_sil_a3c_used_return sil_rollout_sampled += local_sil_rollout_sampled sil_rollout_used += local_sil_rollout_used sil_rollout_used_return += local_sil_rollout_used_return sil_old_used += local_sil_old_used rollout_ctr += local_rollout_ctr rollout_added_ctr += local_rollout_added_ctr rollout_new_return += local_rollout_new_return rollout_old_return += local_rollout_old_return # if during a thread's update, global_t has reached a evaluation interval if global_t > next_global_t: next_global_t = next_t(global_t, args.eval_freq) step_t = int(next_global_t - args.eval_freq) # wait for all threads to be done before testing while not stop_req and th_ctr.qsize() < len(all_workers): time.sleep(0.001) step_t = int(next_global_t - args.eval_freq) # Evaluate for 125,000 steps rewards['eval'][step_t] = parallel_worker.testing( sess, args.eval_max_steps, step_t, folder, 
worker=all_workers[-1]) save_best_model(rewards['eval'][step_t][0]) last_reward = rewards['eval'][step_t][0] # saving worker info to dicts # SIL sil_dict['sil_ctr'][step_t] = sil_ctr sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled sil_dict['sil_a3c_used'][step_t] = sil_a3c_used sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled sil_dict['sil_rollout_used'][step_t] = sil_rollout_used sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return sil_dict['sil_old_used'][step_t] = sil_old_used # ROLLOUT rollout_dict['rollout_ctr'][step_t] = rollout_ctr rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr rollout_dict['rollout_new_return'][step_t] = rollout_new_return rollout_dict['rollout_old_return'][step_t] = rollout_old_return # save ckpt after done with eval if global_t > next_save_t: next_save_t = next_t(global_t, args.eval_freq*args.checkpoint_freq) # dump pickle dump_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname], global_t) if args.load_pretrained_model: dump_pickle([class_rewards], [class_reward_fname], global_t) logger.info('Dump pickle at step {}'.format(global_t)) # save replay buffer (only works for priority mem for now) if args.checkpoint_buffer: if shared_memory is not None and args.priority_memory: params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority] trees = [shared_memory.buff._it_sum._value, shared_memory.buff._it_min._value] dump_pickle([shared_memory.buff._storage, params, trees], [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], global_t) logger.info('Saved shared_memory') if rollout_buffer is not None and args.priority_memory: params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority] trees = [rollout_buffer.buff._it_sum._value, rollout_buffer.buff._it_min._value] dump_pickle([rollout_buffer.buff._storage, params, trees], [rolloutmem_fname, rolloutmem_params_fname, 
rolloutmem_trees_fname], global_t) logger.info('Saved rollout_buffer') # save a3c after saving buffer -- in case saving buffer OOM # so that at least we can revert back to the previous ckpt checkpt_file = folder / 'model_checkpoints' checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) saver.save(sess, str(checkpt_file), global_step=global_t, write_meta_graph=False) logger.info('Saved model ckpt') # if everything saves okay, clean up previous ckpt to save space remove_pickle([reward_fname, sil_fname, rollout_fname], prev_ckpt_t) if args.load_pretrained_model: remove_pickle([class_reward_fname], prev_ckpt_t) remove_pickle([sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], prev_ckpt_t) if rollout_buffer is not None and args.priority_memory: remove_pickle([rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], prev_ckpt_t) logger.info('Removed ckpt from step {}'.format(prev_ckpt_t)) prev_ckpt_t = global_t def signal_handler(signal, frame): nonlocal stop_req logger.info('You pressed Ctrl+C!') stop_req = True if stop_req and global_t == 0: sys.exit(1) def save_best_model(test_reward): nonlocal best_model_reward if test_reward > best_model_reward: best_model_reward = test_reward best_reward_file = folder / 'model_best/best_model_reward' with best_reward_file.open('w') as f: f.write(str(best_model_reward)) best_checkpt_file = folder / 'model_best' best_checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) best_saver.save(sess, str(best_checkpt_file)) train_threads = [] th_ctr = Queue() for i in range(args.parallel_size): th_ctr.put(1) episodes_queue = None net_updates = None if args.use_sil: episodes_queue = Queue() net_updates = Queue() for i in range(args.parallel_size): worker_thread = Thread( target=train_function, args=(i, th_ctr, episodes_queue, net_updates,)) train_threads.append(worker_thread) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) # set start time start_time = time.time() - wall_t for 
t in train_threads: t.start() print('Press Ctrl+C to stop') for t in train_threads: t.join() logger.info('Now saving data. Please wait') # write wall time wall_t = time.time() - start_time wall_t_fname = folder / 'wall_t.{}'.format(global_t) with wall_t_fname.open('w') as f: f.write(str(wall_t)) # save final model checkpoint_file = str(folder / '{}_checkpoint_a3c'.format(GYM_ENV_NAME)) root_saver.save(sess, checkpoint_file, global_step=global_t) dump_final_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname]) logger.info('Data saved!') # if everything saves okay & is done training (not because of pressed Ctrl+C), # clean up previous ckpt to save space if global_t >= (args.max_time_step * args.max_time_step_fraction): remove_pickle([reward_fname, sil_fname, rollout_fname], prev_ckpt_t) if args.load_pretrained_model: remove_pickle([class_reward_fname], prev_ckpt_t) remove_pickle([sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], prev_ckpt_t) if rollout_buffer is not None and args.priority_memory: remove_pickle([rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], prev_ckpt_t) logger.info('Done training, removed ckpt from step {}'.format(prev_ckpt_t)) sess.close() if pretrain_sess: pretrain_sess.close()
def __init__(self,
             thread_index,
             global_network,
             initial_learning_rate,
             learning_rate_input,
             grad_applier,
             max_global_time_step,
             device,
             action_size,
             gamma,
             local_t_max,
             entropy_beta,
             agent_type,
             performance_log_interval,
             log_level,
             random_seed):
    """Set up one worker: local network, gradient ops, sync op and game.

    Args:
        thread_index: Index of this worker. Passed to the network
            constructor and multiplied into the game seed.
        global_network: Network whose variables receive the gradients
            and from which the local network is synced.
        initial_learning_rate: Stored, and also used as the starting
            value of ``self.learn_rate``.
        learning_rate_input: Stored unchanged (original author's note:
            differs per worker).
        grad_applier: Object whose ``apply_gradients(vars, grads)``
            returns the update op.
        max_global_time_step: Stored unchanged (original author's note:
            40 million steps).
        device: Device string handed to the network constructor and to
            ``tf.device``.
        action_size: Stored unchanged (original author's note: 2).
        gamma: Stored unchanged (original author's note: 0.99).
        local_t_max: Stored unchanged (original author's note: 256).
        entropy_beta: Forwarded to ``local_network.prepare_loss``.
        agent_type: ``'LSTM'`` selects ``GameACLSTMNetwork``; any other
            value selects ``GameACFFNetwork`` (original author's note: FF).
        performance_log_interval: Stored unchanged.
        log_level: Stored unchanged.
        random_seed: Seeds NumPy's global RNG; ``random_seed *
            thread_index`` is passed to ``GameState``.
    """
    # Plain configuration copied onto the instance.
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.action_size = action_size
    self.gamma = gamma
    self.local_t_max = local_t_max
    self.agent_type = agent_type
    self.performance_log_interval = performance_log_interval
    self.log_level = log_level

    # Initialise this worker's own network.
    network_cls = (GameACLSTMNetwork if self.agent_type == 'LSTM'
                   else GameACFFNetwork)
    self.local_network = network_cls(self.action_size, thread_index, device)

    # Create the loss-related variables/ops on the local network.
    self.local_network.prepare_loss(entropy_beta)

    with tf.device(device):
        # Parameters of the worker network. Original author's note lists
        # them as: W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1,
        # W_fc2, b_fc2, W_fc3, b_fc3.
        var_refs = list(self.local_network.get_vars())

        # Gradients of the total loss w.r.t. the local parameters.
        self.gradients = tf.gradients(
            self.local_network.total_loss,
            var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    # Op that applies the locally computed gradients to the global network.
    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(), self.gradients)

    # Op that pulls the global network's parameters into the local one.
    self.sync = self.local_network.sync_from(global_network)

    # Initialise the game environment.
    np.random.seed(random_seed)
    self.game_state = GameState(random_seed * thread_index, self.action_size)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.learn_rate = self.initial_learning_rate

    # Reset the worker's counters.
    self.reset_counters()
    self.episode = 0

    # Variable controlling log output.
    self.prev_local_t = 0
def make_network():
    """Construct the actor-critic network selected by ``USE_LSTM``.

    Returns:
        A ``GameACLSTMNetwork`` when ``USE_LSTM`` is truthy, otherwise a
        ``GameACFFNetwork``. Both are built from the module-level
        ``ACTION_SIZE`` and ``device``.
    """
    if not USE_LSTM:
        # NOTE(review): unlike the LSTM constructor below, this call passes
        # no -1 index argument -- confirm against GameACFFNetwork's signature.
        return GameACFFNetwork(ACTION_SIZE, device)
    return GameACLSTMNetwork(ACTION_SIZE, -1, device)
def __init__(self,
             thread_index,
             global_network,
             pinitial_learning_rate,
             plearning_rate_input,
             pgrad_applier,
             vinitial_learning_rate,
             vlearning_rate_input,
             vgrad_applier,
             max_global_time_step,
             device,
             task_index=""):
    """Set up a worker with separate policy and value gradient ops.

    Args:
        thread_index: Index of this worker; stored and passed to the
            network constructor.
        global_network: Accepted but not referenced in this constructor.
        pinitial_learning_rate: Stored unchanged (policy side).
        plearning_rate_input: Stored unchanged (policy side).
        pgrad_applier: Object whose ``apply_gradients(vars, grads)``
            returns the policy update op.
        vinitial_learning_rate: Stored unchanged (value side).
        vlearning_rate_input: Stored unchanged (value side).
        vgrad_applier: Object whose ``apply_gradients(vars, grads)``
            returns the value update op.
        max_global_time_step: Stored unchanged.
        device: Device string handed to the network constructor and to
            ``tf.device``.
        task_index: Accepted but not referenced in this constructor.
    """
    self.thread_index = thread_index
    self.plearning_rate_input = plearning_rate_input
    self.vlearning_rate_input = vlearning_rate_input
    self.max_global_time_step = max_global_time_step

    # Create the environment and reset it once so its size attributes
    # can be read below.
    game = GameState()
    self.game_state = game
    first_state = game.reset()
    game.reset_gs(first_state)

    self.action_size = game.action_size
    self.state_size = game.state_size
    self.local_max_iter = game.local_max_iter

    # Both network flavours take the same constructor arguments.
    network_cls = GameACLSTMNetwork if USE_LSTM else GameACFFNetwork
    self.local_network = network_cls(
        self.action_size,
        self.state_size,
        game.action_low,
        game.action_high,
        thread_index,
        device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # Options shared by the policy and value gradient computations.
    grad_options = dict(gate_gradients=False,
                        aggregation_method=None,
                        colocate_gradients_with_ops=False)

    with tf.device(device):
        policy_refs = [var._ref() for var in self.local_network.get_pvars()]
        self.policy_gradients = tf.gradients(
            self.local_network.policy_loss, policy_refs, **grad_options)

        value_refs = [var._ref() for var in self.local_network.get_vvars()]
        self.value_gradients = tf.gradients(
            self.local_network.value_loss, value_refs, **grad_options)

    # NOTE(review): the update ops target the *local* network's variables,
    # not those of global_network -- confirm this is intended.
    self.apply_policy_gradients = pgrad_applier.apply_gradients(
        self.local_network.get_pvars(), self.policy_gradients)
    self.apply_value_gradients = vgrad_applier.apply_gradients(
        self.local_network.get_vvars(), self.value_gradients)

    self.local_t = 0
    self.pinitial_learning_rate = pinitial_learning_rate
    self.vinitial_learning_rate = vinitial_learning_rate
    self.episode_reward = 0

    # Variable controlling log output.
    self.prev_local_t = 0
def __init__(self,
             thread_index,
             global_network,
             initial_learning_rate,
             learning_rate_input,
             grad_applier,
             max_global_time_step,
             device=None,
             pretrained_model=None,
             pretrained_model_sess=None,
             advice=False,
             reward_shaping=False):
    """Set up a worker, optionally backed by a pretrained model.

    Settings such as ``action_size``, ``use_lstm``, ``entropy_beta``,
    ``clip_norm`` and ``env_id`` are read from ``self`` and are expected
    to exist before this constructor runs.

    Args:
        thread_index: Index of this worker; stored and passed to the
            network constructor.
        global_network: Network whose variables are paired with the
            gradients and from which the local network is synced.
        initial_learning_rate: Stored unchanged.
        learning_rate_input: Stored unchanged.
        grad_applier: Object whose ``apply_gradients(grads_and_vars)``
            returns the update op.
        max_global_time_step: Stored unchanged.
        device: Device string handed to the network constructor and to
            ``tf.device``.
        pretrained_model: Stored unchanged; must not be ``None`` when
            ``advice`` or ``reward_shaping`` is set.
        pretrained_model_sess: Stored unchanged.
        advice: Stored as ``use_pretrained_model_as_advice``; also makes
            ``psi`` 0.9 instead of 0.0.
        reward_shaping: Stored as
            ``use_pretrained_model_as_reward_shaping``.

    Raises:
        AssertionError: If ``self.action_size`` is -1, or if a
            pretrained-model mode is enabled without ``pretrained_model``.
    """
    assert self.action_size != -1

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.use_pretrained_model_as_advice = advice
    self.use_pretrained_model_as_reward_shaping = reward_shaping

    def flag(value):
        # Colour a setting green when truthy and red otherwise.
        return colored(value, "green" if value else "red")

    # Log the effective configuration of this worker.
    logger.info("thread_index: {}".format(self.thread_index))
    logger.info("local_t_max: {}".format(self.local_t_max))
    logger.info("use_lstm: {}".format(flag(self.use_lstm)))
    logger.info("action_size: {}".format(self.action_size))
    logger.info("entropy_beta: {}".format(self.entropy_beta))
    logger.info("gamma: {}".format(self.gamma))
    logger.info("reward_type: {}".format(self.reward_type))
    logger.info("finetune_upper_layers_only: {}".format(
        flag(self.finetune_upper_layers_only)))
    logger.info("use_pretrained_model_as_advice: {}".format(
        flag(self.use_pretrained_model_as_advice)))
    logger.info("use_pretrained_model_as_reward_shaping: {}".format(
        flag(self.use_pretrained_model_as_reward_shaping)))
    logger.info("transformed_bellman: {}".format(
        flag(self.transformed_bellman)))
    logger.info("clip_norm: {}".format(self.clip_norm))
    logger.info("use_grad_cam: {}".format(flag(self.use_grad_cam)))

    # The chosen network class gets its class-level use_mnih_2015 switch
    # set before it is instantiated.
    network_cls = GameACLSTMNetwork if self.use_lstm else GameACFFNetwork
    network_cls.use_mnih_2015 = self.use_mnih_2015
    self.local_network = network_cls(self.action_size, thread_index, device)

    upper_only = self.finetune_upper_layers_only

    with tf.device(device):
        self.local_network.prepare_loss(
            entropy_beta=self.entropy_beta, critic_lr=0.5)
        # Differentiate either all local variables or only the upper ones.
        if upper_only:
            local_vars = self.local_network.get_vars_upper
        else:
            local_vars = self.local_network.get_vars
        var_refs = [var._ref() for var in local_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs)

    # The matching variable getter on the global network.
    if upper_only:
        global_vars = global_network.get_vars_upper
    else:
        global_vars = global_network.get_vars

    with tf.device(device):
        if self.clip_norm is not None:
            # The returned global norm is not used.
            self.gradients, _ = tf.clip_by_global_norm(
                self.gradients, self.clip_norm)
        # Pair each local gradient with the global variable it updates.
        self.gradients = list(zip(self.gradients, global_vars()))
        self.apply_gradients = grad_applier.apply_gradients(self.gradients)
        # Alternative call form kept from the original source:
        # grad_applier.apply_gradients(global_vars(), self.gradients)

    self.sync = self.local_network.sync_from(
        global_network, upper_layers_only=upper_only)

    self.game_state = GameState(
        env_id=self.env_id,
        display=False,
        no_op_max=30,
        human_demo=False,
        episode_life=True)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    self.episode_steps = 0

    # Variable controlling log output.
    self.prev_local_t = 0

    self.is_demo_thread = False

    with tf.device(device):
        if self.use_grad_cam:
            unwrapped_env = self.game_state.env.unwrapped
            self.action_meaning = unwrapped_env.get_action_meanings()
            self.local_network.build_grad_cam_grads()

    self.pretrained_model = pretrained_model
    self.pretrained_model_sess = pretrained_model_sess

    if self.use_pretrained_model_as_advice:
        self.psi = 0.9
    else:
        self.psi = 0.0
    self.advice_ctr = 0
    self.shaping_ctr = 0
    self.last_rho = 0.

    needs_pretrained = (self.use_pretrained_model_as_advice
                        or self.use_pretrained_model_as_reward_shaping)
    if needs_pretrained:
        assert self.pretrained_model is not None
if USE_GPU: device = "/gpu:0" initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) global_t = 0 stop_requested = False global_game = DoomGameState(scenario_path="scenarios/cig.cfg") if USE_LSTM: global_network = GameACLSTMNetwork(global_game.get_action_size(), -1, device) else: global_network = GameACFFNetwork(global_game.get_action_size(), -1, device) del global_game training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON, clip_norm=GRAD_NORM_CLIP, device=device) for i in range(PARALLEL_SIZE): game = DoomGameState(scenario_path="scenarios/cig.cfg")