def main():
    """Train a Rainbow DQN agent on the environment returned by make_env().

    Builds a single batched, 4-frame-stacked environment, creates the
    online/target Rainbow networks and runs the anyrl training loop.
    """
    wrapped_env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[wrapped_env]]),
                            num_images=4,
                            concat=False)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        networks = rainbow_models(
            sess,
            env.action_space.n,
            gym_space_vectorizer(env.observation_space),
            min_val=-421,
            max_val=421)
        dqn = DQN(*networks)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)

        # Variables must be initialised after the optimizer op is built so
        # that the optimizer's own variables are covered too.
        sess.run(tf.global_variables_initializer())

        train_kwargs = dict(
            num_steps=2000000,
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=64,
            batch_size=32,
            min_buffer_size=25000,
        )
        dqn.train(**train_kwargs)
def main():
    """Run DQN until the environment throws an exception."""
    sonic_env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    batched_env = BatchedGymEnv([[sonic_env]])
    env = BatchedFrameStack(batched_env, num_images=4, concat=False)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101

    with tf.Session(config=tf_config) as sess:
        vectorizer = gym_space_vectorizer(env.observation_space)
        networks = rainbow_models(sess,
                                  env.action_space.n,
                                  vectorizer,
                                  min_val=-200,
                                  max_val=200)
        dqn = DQN(*networks)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        # Project-specific prioritized replay buffer variant.
        buffer = StochasticMaxStochasticDeltaDeletionPRB(500000, 0.5, 0.4,
                                                         epsilon=0.1)
        dqn.train(
            # Large enough that an exception arrives before we stop.
            num_steps=2000000,
            player=player,
            replay_buffer=buffer,
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
def main():
    """Run DQN until the environment throws an exception."""
    # Alternative (disabled) way of building the environment by hand:
    # env = make(game='SonicAndKnuckles3-Genesis', state='AngelIslandZone.Act1')
    # env = SonicDiscretizer(env)
    # env = WarpFrame(env)
    # env = AllowBacktracking(env)
    single_env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[single_env]]),
                            num_images=4,
                            concat=False)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True  # pylint: disable=E1101

    with tf.Session(config=session_config) as sess:
        models = rainbow_models(sess,
                                env.action_space.n,
                                gym_space_vectorizer(env.observation_space),
                                min_val=-200,
                                max_val=200)
        dqn = DQN(*models)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 4)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        replay = PrioritizedReplayBuffer(500000, 0.7, 0.6, epsilon=0.2)
        dqn.train(
            # Large enough that an exception arrives before we stop.
            num_steps=2000000,
            player=player,
            replay_buffer=replay,
            optimize_op=optimize,
            train_interval=1,
            target_interval=16384,
            batch_size=64,
            min_buffer_size=20000)
def main():
    """Run DQN until the environment throws an exception.

    Trains on a locally created Sonic environment and, once training
    returns, prints the trainable variables and dumps them to disk with
    both ``utils.save_state`` and ``joblib``.  ``num_steps`` is read from
    the enclosing (module) scope.
    """
    retro_env = make(game='SonicTheHedgehog-Genesis',
                     state='GreenHillZone.Act1')
    local_env = make_local_env(retro_env, stack=False, scale_rew=False)
    env = BatchedFrameStack(BatchedGymEnv([[AllowBacktracking(local_env)]]),
                            num_images=4,
                            concat=False)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True  # pylint: disable=E1101

    with tf.Session(config=session_config) as sess:
        models = rainbow_models(sess,
                                env.action_space.n,
                                gym_space_vectorizer(env.observation_space),
                                min_val=-200,
                                max_val=200)
        dqn = DQN(*models)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        replay = PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)
        dqn.train(
            # Large enough that an exception arrives before we stop.
            num_steps=num_steps,
            player=player,
            replay_buffer=replay,
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)

        # Persist the learned parameters in two formats.
        print(tf.trainable_variables())
        save_path = '/home/noob/retro-noob/rainbow/params/params'
        utils.save_state(save_path + '_tf_saver')
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
            param_values = sess.run(params)
            joblib.dump(param_values, save_path + '_joblib')
def main():
    """Run DQN until the environment throws an exception."""
    inner_env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[inner_env]]),
                            num_images=4,
                            concat=False)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True  # pylint: disable=E1101

    with tf.Session(config=session_config) as sess:
        models = rainbow_models(sess,
                                env.action_space.n,
                                gym_space_vectorizer(env.observation_space),
                                min_val=-200,
                                max_val=200)
        dqn = DQN(*models)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)

        # dqn.optimize: create a TF Op that optimizes the objective.
        #
        # Args:
        #   learning_rate: the Adam learning rate.
        #   epsilon: the Adam epsilon.
        optimize = dqn.optimize(learning_rate=6.25e-5, epsilon=1.5e-4)
        sess.run(tf.global_variables_initializer())

        # dqn.train: run an automated training loop.
        #
        # This is meant to provide a convenient way to run a standard
        # training loop without any modifications. You may get more
        # flexibility by writing your own training loop.
        #
        # Args:
        #   num_steps: the number of timesteps to run.
        #   player: the Player for gathering experience.
        #   replay_buffer: the ReplayBuffer for experience.
        #   optimize_op: a TF Op to optimize the model.
        #   train_interval: timesteps per training step.
        #   target_interval: number of timesteps between target network
        #     updates.
        #   batch_size: the size of experience mini-batches.
        #   min_buffer_size: minimum replay buffer size before training is
        #     performed.
        #   tf_schedules: a sequence of TFSchedules that are updated with
        #     the number of steps taken.
        #   handle_ep: called with information about every completed
        #     episode.
        #   timeout: if set, this is a number of seconds after which the
        #     training loop should exit.
        replay = PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)
        dqn.train(
            # Large enough that an exception arrives before we stop.
            num_steps=1000000,
            player=player,
            replay_buffer=replay,
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
def _thunk():
    """Create and wrap one environment for the enclosing factory.

    Reads ``env_id``, ``seed``, ``rank``, ``log_dir`` and ``add_timestep``
    from the enclosing scope and returns the fully wrapped environment.
    """
    # Pick the backend from the id prefix.
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    elif env_id.startswith("Sonic"):
        env = AllowBacktracking(make_sonic_env())
    else:
        env = gym.make(env_id)

    is_atari = False
    if hasattr(gym.envs, 'atari'):
        is_atari = isinstance(env.unwrapped,
                              gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)

    # Give every worker its own seed.
    env.seed(seed + rank)

    shape = env.observation_space.shape
    if add_timestep and len(shape) == 1 and 'TimeLimit' in str(env):
        env = AddTimestep(env)

    if log_dir is not None:
        monitor_path = os.path.join(log_dir, str(rank))
        env = bench.Monitor(env, monitor_path)

    if is_atari:
        env = wrap_deepmind(env)

    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    shape = env.observation_space.shape
    if len(shape) == 3 and shape[2] in (1, 3):
        env = WrapPyTorch(env)

    return env
def main():
    """Run DQN until the environment throws an exception.

    Restores weights from ``./pretrained_model`` (via the ``restore_path``
    argument of this project's ``dqn.train``) and reports a running mean of
    episode rewards through ``_handle_ep``.
    """
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)

        # Other exploration schedules (disabled). To use one of them, define
        # eps_decay_sched here AND pass tf_schedules=[eps_decay_sched] to
        # dqn.train below.
        #eps_decay_sched = LinearTFSchedule(50000, 1.0, 0.01)
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, 0.1)), 3)
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)
        #player = NStepPlayer(BatchedPlayer(env, SonicEpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)

        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            """Record one finished episode and occasionally print progress.

            ``env_rewards`` is accepted for the callback signature but is
            not used here.
            """
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            # Only prints when the cumulative step count is a multiple of 10.
            if total_steps % 10 == 0:
                recent = reward_hist[-100:]
                print('%d episodes, %d steps: mean of last 100 episodes=%f' %
                      (len(reward_hist), total_steps,
                       sum(recent) / len(recent)))

        # The original call passed tf_schedules=[eps_decay_sched], but the
        # only definition of eps_decay_sched is commented out above, so the
        # call raised NameError before training started.  No schedule is in
        # use with the plain online_net player, so the argument is omitted.
        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep,
            restore_path='./pretrained_model',
            save_interval=None,
        )
def main():
    """Run DQN until the environment throws an exception.

    Resumes from a saved TensorFlow checkpoint and a pickled replay buffer
    under ``/root/compo``, overrides the buffer's ``alpha``, ``beta`` and
    ``capacity`` attributes, calls ``restore_ppo2_weights`` and then
    continues training.
    """
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)

        # Initialise everything first, then overwrite with the checkpoint.
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, "/root/compo/model.ckpt")
        #print('model restored')

        # Use a context manager so the gzip handle is closed once the buffer
        # has been loaded (the original left it open).
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # replay buffers that come from a trusted source.
        with gzip.open('/root/compo/replay_buffer.p.gz', 'rb') as buffer_file:
            replay_buffer = pickle.load(buffer_file)
        replay_buffer.alpha = 0.2
        replay_buffer.beta = 0.4
        replay_buffer.capacity = 100000

        restore_ppo2_weights(sess)

        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            # Previously: PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)
            replay_buffer=replay_buffer,
            optimize_op=optimize,
            train_interval=4,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
def train(rank, args, shared_model, optimizer, env_conf):
    """Run one A3C training worker forever.

    Each iteration copies the shared weights into a local model, collects up
    to ``args.num_steps`` transitions, computes the policy/value losses with
    generalized advantage estimation and pushes the gradients to
    ``shared_model``.

    Args:
        rank: worker index; selects the GPU and offsets the random seed.
        args: namespace providing ``gpu_ids``, ``seed``, ``optimizer``,
            ``lr``, ``amsgrad``, ``num_steps``, ``gamma`` and ``tau``.
        shared_model: the model whose parameters are shared between workers.
        optimizer: optimizer to step; when ``None`` one is built from
            ``args.optimizer`` (``'RMSprop'`` or ``'Adam'``).
        env_conf: mapping with the ``'game'`` and ``'level'`` to load.

    Raises:
        ValueError: if ``optimizer`` is ``None`` and ``args.optimizer`` is
            not one of the supported names.
    """
    ptitle('Training Agent: {}'.format(rank))
    print("prank:", rank, "os.pid:", os.getpid())
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = AllowBacktracking(
        make_local_env(env_conf['game'],
                       env_conf['level'],
                       stack=False,
                       scale_rew=False))
    print("Got a local env; obs space:", env.observation_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        elif args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
        else:
            # Previously this fell through with optimizer still None and only
            # crashed at optimizer.step() after a whole rollout.
            raise ValueError(
                'Unsupported optimizer: {!r}'.format(args.optimizer))
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    print("player.state.shape:", player.state.shape)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        # Sync the local model with the latest shared weights.
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        # Fresh LSTM state at episode start; otherwise detach the old state
        # so gradients do not flow into the previous rollout.
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            # if player.info['ale.lives'] == 0 or player.max_length:
            #     player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        # Bootstrap value: zero for a finished episode, otherwise the
        # critic's estimate for the current state.
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        # NOTE: clip_grad_norm is the pre-0.4 PyTorch spelling; kept because
        # this function still targets the Variable-based API.
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 100.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def test(args, shared_model, env_conf):
    """Evaluate the shared model forever, logging every finished episode.

    After each episode the function logs the reward, episode length and the
    running mean reward, optionally saves the weights when the episode
    reward is at least the best seen so far (``args.save_max``), sleeps ten
    seconds and reloads the shared weights for the next episode.
    """
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]

    log_name = '{}_log'.format(args.env)
    setup_logger(log_name, r'{0}{1}_log'.format(args.log_dir, args.env))
    logger = logging.getLogger(log_name)
    # Record the full configuration at the top of the log.
    for key, value in vars(args).items():
        logger.info('{0}: {1}'.format(key, value))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    print("test proc:")
    env = AllowBacktracking(
        make_local_env(env_conf['game'],
                       env_conf['level'],
                       stack=False,
                       scale_rew=False))
    print("test got env:", env.observation_space)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()

    reload_weights = True
    max_score = 0
    while True:
        if reload_weights:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            reload_weights = False

        player.action_test()
        reward_sum += player.reward

        # Disabled Atari-specific handling of a lost life:
        # if player.done and player.info['ale.lives'] > 0 and not player.max_length:
        #     state = player.env.reset()
        #     player.eps_len += 2
        #     player.state = torch.from_numpy(state).float()
        #     if gpu_id >= 0:
        #         with torch.cuda.device(gpu_id):
        #             player.state = player.state.cuda()

        if not (player.done or player.max_length):
            continue

        # Episode finished: update statistics and report.
        reload_weights = True
        num_tests += 1
        reward_total_sum += reward_sum
        reward_mean = reward_total_sum / num_tests
        elapsed = time.strftime("%Hh %Mm %Ss",
                                time.gmtime(time.time() - start_time))
        logger.info(
            "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}".
            format(elapsed, reward_sum, player.eps_len, reward_mean))

        if args.save_max and reward_sum >= max_score:
            max_score = reward_sum
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    torch.save(
                        player.model.state_dict(),
                        '{0}{1}.dat'.format(args.save_model_dir, args.env))
            else:
                torch.save(
                    player.model.state_dict(),
                    '{0}{1}.dat'.format(args.save_model_dir, args.env))

        # Start the next evaluation episode.
        reward_sum = 0
        player.eps_len = 0
        state = player.env.reset()
        player.eps_len += 2
        time.sleep(10)
        player.state = torch.from_numpy(state).float()
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.state = player.state.cuda()
def main():
    """Run DQN until the environment throws an exception.

    Command-line flags:
        --restore: load the latest checkpoint from ``./results`` before
            training.
        --record: forwarded to ``make_env`` (help text: record bk2 movies).

    A checkpoint is written and the episode reward is logged as a
    TensorFlow summary after every completed episode; the mean reward is
    logged every ``REWARD_HISTORY`` episodes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore',
                        '-restore',
                        action='store_true',
                        help='restore from checkpoint file')
    parser.add_argument('--record',
                        '-record',
                        action='store_true',
                        help='record bk2 movies')
    args = parser.parse_args()

    env = AllowBacktracking(
        make_env(stack=False, scale_rew=False, record=args.record))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)

    checkpoint_dir = os.path.join(os.getcwd(), 'results')
    results_dir = os.path.join(os.getcwd(), 'results',
                               time.strftime("%d-%m-%Y_%H-%M-%S"))
    os.makedirs(results_dir, exist_ok=True)
    summary_writer = tf.summary.FileWriter(results_dir)

    # TODO
    # env = wrappers.Monitor(env, results_dir, force=True)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))

        # Created before dqn.optimize(), so this saver only covers the
        # variables that exist at this point.
        saver = tf.train.Saver()

        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)

        sess.run(tf.global_variables_initializer())

        # Restore AFTER the global initializer.  The original restored first
        # and then ran the initializer, which overwrote the restored weights
        # with fresh initial values and made --restore a no-op.
        if args.restore:
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            if latest_checkpoint:
                print("Loading model checkpoint {} ...\n".format(
                    latest_checkpoint))
                saver.restore(sess, latest_checkpoint)
            else:
                print("Checkpoint not found")

        reward_hist = []
        total_steps = 0

        # runs with every completed episode
        def _handle_ep(steps, rew):
            """Log the episode reward, checkpoint, and report the mean."""
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)

            summary_reward = tf.Summary()
            summary_reward.value.add(tag='global/reward', simple_value=rew)
            summary_writer.add_summary(summary_reward, global_step=total_steps)

            print('save model')
            saver.save(sess=sess,
                       save_path=checkpoint_dir + '/model',
                       global_step=total_steps)

            if len(reward_hist) == REWARD_HISTORY:
                mean_reward = sum(reward_hist) / len(reward_hist)
                print('%d steps: mean=%f' % (total_steps, mean_reward))
                summary_meanreward = tf.Summary()
                summary_meanreward.value.add(tag='global/mean_reward',
                                             simple_value=mean_reward)
                summary_writer.add_summary(summary_meanreward,
                                           global_step=total_steps)
                reward_hist.clear()

        dqn.train(
            num_steps=7000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep)
def main():
    """Run DQN until the environment throws an exception."""
    # Short timed training run followed by a checkpoint save; progress is
    # reported with print() at every stage.
    print('creating env')
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101
    print('starting tf session')
    with tf.Session(config=config) as sess:
        print('creating agent')
        # Keep explicit handles on both networks instead of splatting the
        # rainbow_models() result straight into DQN.
        online_net, target_net = rainbow_models(sess, env.action_space.n, gym_space_vectorizer( env.observation_space), min_val=-200, max_val=200)
        dqn = DQN(online_net, target_net)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        train_steps = 5000
        print('training steps:', train_steps)
        # Single timed training pass (range(1)).
        for j in range(1):
            print(j)
            start = time.time()
            # NOTE(review): train_steps (5000) is below min_buffer_size
            # (10000); if min_buffer_size gates optimisation as its name
            # suggests, no gradient step runs in this call — confirm.
            dqn.train( num_steps= train_steps, # Make sure an exception arrives before we stop.
                       player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=10000)
            end = time.time()
            # Wall-clock seconds spent in dqn.train.
            print(end - start)
        print('done training')
        print('save nn')
        save_path = saver.save(sess, "saved_models/rainbow5.ckpt")
        print("Model saved in path: %s" % save_path)
        # Fetch the current values of all trainable variables (only used by
        # the disabled debug prints below).
        tvars = tf.trainable_variables()
        tvars_vals = sess.run(tvars)
        #for var, val in zip(tvars, tvars_vals):
        # print(var.name, val[0])
        #print(tvars_vals[0][-5:])
        #print('stepping')
        #obs = env.reset()
        #online_net.step(obs, obs)
'''
def main():
    """Run DQN until the environment throws an exception.

    Every MPI rank runs one environment from ``create_eval_envs()``.  After
    each finished episode the rank prints its own mean reward over the last
    100 episodes, the means are summed across ranks with ``Allreduce`` and
    rank 0 prints the average.
    """
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101

    comm = MPI.COMM_WORLD  # Use MPI for parallel evaluation
    rank = comm.Get_rank()
    size = comm.Get_size()

    env_fns, env_names = create_eval_envs()
    rank_env = AllowBacktracking(env_fns[rank](stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[rank_env]]),
                            num_images=4,
                            concat=False)

    with tf.Session(config=tf_config) as sess:
        models = rainbow_models(sess,
                                env.action_space.n,
                                gym_space_vectorizer(env.observation_space),
                                min_val=-200,
                                max_val=200)
        dqn = DQN(*models)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            """Report local and global mean reward for a finished episode."""
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 1 != 0:
                return

            recent = reward_hist[-100:]
            avg_score = sum(recent) / len(recent)

            # Global Score
            global_score = np.zeros(1)
            local_score = np.array(avg_score)
            episode_count = str(len(reward_hist))
            print("Local Score for " + env_names[rank] + " at episode " +
                  episode_count + " with timesteps: " + str(total_steps) +
                  ": " + str(local_score))

            # Sum the per-rank means, then divide by the number of ranks.
            comm.Allreduce(local_score, global_score, op=MPI.SUM)
            global_score /= size
            if rank == 0:
                print("Global Average Score at episode: " + episode_count +
                      ": " + str(global_score))

        replay = PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)
        dqn.train(
            # Large enough that an exception arrives before we stop.
            num_steps=2000000,
            player=player,
            replay_buffer=replay,
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep,
            save_interval=None,
            # Model to be evaluated
            restore_path='./checkpoints_rainbow/model-10')
# Imports for the Rainbow DQN script below.
from anyrl.algos import DQN
from anyrl.envs import BatchedGymEnv
from anyrl.envs.wrappers import BatchedFrameStack
from anyrl.models import rainbow_models
from anyrl.rollouts import BatchedPlayer, PrioritizedReplayBuffer, NStepPlayer
from anyrl.spaces import gym_space_vectorizer, StackedBoxSpace
import gym_remote.exceptions as gre
from sonic_util import AllowBacktracking, make_env
import numpy as np

# Build one environment, wrap it for backtracking, then batch it and stack
# four frames per observation.
print('creating env')
#z = StackedBoxSpace(np.zeros((84,84,1)), 4)
env = AllowBacktracking(make_env(stack=False, scale_rew=False))
env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
#print(env.action_space.n)
#StackedBox(84,84,1)

# NOTE(review): `tf` is used from here on but no tensorflow import is visible
# in this excerpt — confirm it is imported above.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
print('starting tf session')
with tf.Session(config=config) as sess:
    print('creating agent')
def main():
    """Run DQN until the environment throws an exception.

    Trains one shared Rainbow model with one player per environment from
    ``make_envs()``.  Only the convolutional layers of the online and
    target networks are restored from ``/root/compo/model`` and saved to
    ``/root/compo/out/model`` after every 16384-step training round.
    """
    envs = make_envs(stack=False, scale_rew=False)
    for idx, raw_env in enumerate(envs):
        backtracking_env = AllowBacktracking(raw_env)
        envs[idx] = BatchedFrameStack(BatchedGymEnv([[backtracking_env]]),
                                      num_images=4,
                                      concat=False)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True  # pylint: disable=E1101

    with tf.Session(config=session_config) as sess:
        online_model, target_model = rainbow_models(
            sess,
            envs[0].action_space.n,
            gym_space_vectorizer(envs[0].observation_space),
            min_val=-200,
            max_val=200)
        replay_buffer = PrioritizedReplayBuffer(400000, 0.5, 0.4, epsilon=0.1)
        dqn = DQN(online_model, target_model)

        players = [
            NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) for env in envs
        ]

        optimize = dqn.optimize(learning_rate=1e-4)

        # Kernel and bias of conv layers 1-3, online network first, then
        # target network.
        conv_var_names = [
            '{}/layer_{}/conv2d/{}'.format(net, layer, part)
            for net in ('online', 'target')
            for layer in (1, 2, 3)
            for part in ('kernel', 'bias')
        ]

        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            saver = tf.train.Saver(
                [tf.get_variable(name) for name in conv_var_names])

            # or (disabled alternative): instead of the global initializer
            # below, initialise only the variables the saver does not
            # restore, i.e. sess.run(tf.variables_initializer([...])) over
            # tf.get_variable(name) for
            #   - {online,target}/noisy_layer{,_1,_2}/
            #       {weight_mu,bias_mu,weight_sigma,bias_sigma}
            #   - beta1_power, beta2_power
            #   - the Adam and Adam_1 slots of online/layer_{1,2,3}/conv2d/
            #       {kernel,bias}
            #   - the Adam and Adam_1 slots of online/noisy_layer{,_1,_2}/
            #       {weight_mu,bias_mu,weight_sigma,bias_sigma}
            #sess.run( tf.initialize_variables( list( tf.get_variable(name) for name in sess.run( tf.report_uninitialized_variables( tf.all_variables( ) ) ) ) ) )
            sess.run(tf.global_variables_initializer())

            # either
            saver.restore(sess, '/root/compo/model')
            # end either

        for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
            print(variable.name)

        # Train in rounds and checkpoint after each one.
        while True:
            dqn.train(num_steps=16384,
                      players=players,
                      replay_buffer=replay_buffer,
                      optimize_op=optimize,
                      train_interval=1,
                      target_interval=8192,
                      batch_size=32,
                      min_buffer_size=20000)
            saver.save(sess, '/root/compo/out/model')