def init():
    global RENDER, S, R, S_, env, state_dim, action_dim, action_bound, actor, critic
    env = Env(np.ones([39, ]) * 1e-5)
    state_dim = env.observation_space()
    action_dim = env.action_space()
    action_bound = env.action_bound()

    # Placeholders for state, reward and next state
    with tf.name_scope('S'):
        S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
    with tf.name_scope('R'):
        R = tf.placeholder(tf.float32, [None, 1], name='r')
    with tf.name_scope('S_'):
        S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_')

    # Build the actor-critic graph: the critic uses the actor's outputs (a, a_),
    # and the actor is trained with the critic's action gradients
    sess = tf.Session()
    actor = Actor(sess, action_dim, action_bound, LR_A, REPLACE_ITER_A)
    critic = Critic(sess, state_dim, action_dim, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
    actor.add_grad_to_graph(critic.a_grads)
    sess.run(tf.global_variables_initializer())
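# init() relies on module-level imports and hyperparameter constants that are not part
# of this excerpt. A minimal sketch of what they could look like; the values below are
# illustrative assumptions, not the project's actual settings. The Env, Actor and Critic
# classes come from the surrounding project and are not reproduced here.
import numpy as np
import tensorflow as tf  # TF1-style graph API (tf.placeholder / tf.Session)

LR_A = 1e-4            # actor learning rate (illustrative)
LR_C = 1e-3            # critic learning rate (illustrative)
GAMMA = 0.99           # reward discount (illustrative)
REPLACE_ITER_A = 1100  # actor target-network update interval (illustrative)
REPLACE_ITER_C = 1000  # critic target-network update interval (illustrative)
RENDER = False         # whether to render the environment during training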
def main():
    global args
    args = parse_args()

    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # create the output directory before attaching the file handler, otherwise
    # logging.FileHandler fails when args.save does not exist yet
    if not os.path.exists(args.save):
        os.makedirs(args.save)
    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname) + '.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    args.device = torch.device("cuda" if args.cuda else "cpu")
    logger.debug(args)

    # seed the Python RNG first so the torch seed derived from it is reproducible
    random.seed(args.seed)
    torch.manual_seed(random.randint(1, 10000))
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.enabled = False  # Disable nondeterministic ops (not sure if critical but better safe than sorry)

    # load a cached dataset if one exists, otherwise build and cache it
    split_files = os.path.join(args.data, args.split_file)
    dataset_file = os.path.join(args.data, 'dataset.pth')
    if os.path.isfile(dataset_file):
        train_dataset = torch.load(dataset_file)
    else:
        train_dataset = Dataset(split_files, args.in_dim)
        torch.save(train_dataset, dataset_file)
    logger.debug('==> Size of train data: %d' % len(train_dataset))

    # initialize environment, agent, memory
    env = Env(args, train_dataset)
    action_space = env.action_space()
    dqn = Agent(args, action_space)
    mem = ReplayMemory(args, args.memory_capacity)

    # create trainer object for training and testing
    trainer = Trainer(args, env, dqn, mem)
    if args.evaluate:
        # Evaluation step
        dqn.eval()
        for _ in range(args.evaluation_episodes):
            trainer.evaluate_one_step(train_dataset)
    else:
        # Training step
        trainer.train(train_dataset, logger)

    logger.debug('==> Checkpointing everything now...')
    dqn.save(os.path.join(args.save, args.expname))
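# main() assumes a parse_args() helper that is not shown here. A minimal sketch whose
# flag names mirror the attributes accessed above (args.save, args.expname, args.cuda,
# args.seed, args.data, ...); the defaults and help strings are illustrative
# assumptions, not the project's actual values.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='DQN training/evaluation')
    parser.add_argument('--data', default='data/', help='dataset directory')
    parser.add_argument('--save', default='checkpoints/', help='directory for logs and checkpoints')
    parser.add_argument('--expname', default='experiment', help='experiment name')
    parser.add_argument('--split-file', default='split.json', help='train/val split file inside --data')
    parser.add_argument('--in-dim', type=int, default=300, help='input feature dimension')
    parser.add_argument('--seed', type=int, default=123, help='random seed')
    parser.add_argument('--cuda', action='store_true', help='use CUDA if available')
    parser.add_argument('--evaluate', action='store_true', help='run evaluation instead of training')
    parser.add_argument('--evaluation-episodes', type=int, default=10, help='number of evaluation episodes')
    parser.add_argument('--memory-capacity', type=int, default=100000, help='replay memory capacity')
    return parser.parse_args()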
if torch.cuda.is_available() and not args.disable_cuda:  # inferred guard: the original fragment begins inside this branch; args.device is assumed to be set by the argument parser
    torch.cuda.set_device(args.device)
    torch.cuda.manual_seed(random.randint(1, 10000))
    torch.backends.cudnn.enabled = False  # Disable nondeterministic ops (not sure if critical but better safe than sorry)
else:
    args.device = torch.device('cpu')


# Simple ISO 8601 timestamped logger
def log(s):
    print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s)


# Environment
env = Env(args)
env.train()
action_space = env.action_space()

# Agent
dqn = Agent(args, env)
mem = ReplayMemory(args, args.memory_capacity)
priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

# Construct validation memory by following a uniform-random policy
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size:
    if done:
        state, done = env.reset(), False
    next_state, _, done = env.step(random.randint(0, action_space - 1))
    val_mem.append(state, None, None, done)
    state = next_state
    T += 1
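# The fragment above uses names whose imports fall outside the excerpt. A minimal sketch
# of the imports it appears to rely on; the local module paths (env, agent, memory) are
# assumptions based on the class names used, and argument parsing is not shown here.
import random
from datetime import datetime

import torch

from env import Env
from agent import Agent
from memory import ReplayMemory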
def main():
    args = parse_arguments()
    results_dir = os.path.join('results', args.id)
    os.makedirs(results_dir, exist_ok=True)
    logger = Logger(results_dir)
    metrics = {'steps': [], 'rewards': [], 'Qs': [], 'best_avg_reward': -float('inf')}
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        torch.backends.cudnn.enabled = args.enable_cudnn
    else:
        args.device = torch.device('cpu')

    if args.tensorboard_dir is None:
        writer = SummaryWriter(os.path.join(results_dir, 'tensorboard', args.game, args.architecture))
    else:
        writer = SummaryWriter(os.path.join(args.tensorboard_dir, args.game, args.architecture))

    # Environment
    env = Env(args)
    env.train()
    action_space = env.action_space()

    # Agent
    dqn = Agent(args, env)

    # If a model is provided and evaluate is false, presumably we want to resume, so try to load memory
    if args.model is not None and not args.evaluate:
        if not args.memory:
            raise ValueError('Cannot resume training without memory save path. Aborting...')
        elif not os.path.exists(args.memory):
            raise ValueError('Could not find memory file at {path}. Aborting...'.format(path=args.memory))
        mem = load_memory(args.memory, args.disable_bzip_memory)
    else:
        mem = ReplayMemory(args, args.memory_capacity)
    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

    # Construct validation memory
    val_mem = ReplayMemory(args, args.evaluation_size)
    T, done = 0, True
    while T < args.evaluation_size:
        if done:
            state, done = env.reset(), False
        next_state, _, done = env.step(np.random.randint(0, action_space))
        val_mem.append(state, None, None, done)
        state = next_state
        T += 1

    if args.evaluate:
        dqn.eval()  # Set DQN (online network) to evaluation mode
        avg_reward, avg_Q = test(args, 0, dqn, val_mem, metrics, results_dir, evaluate=True)  # Test
        logger.info('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
    else:
        # Training loop
        dqn.train()
        T, done = 0, True
        accumulate_reward = 0
        for T in trange(1, args.T_max + 1):
            if done:
                state, done = env.reset(), False
                writer.add_scalar('Train/Reward', accumulate_reward, T)
                accumulate_reward = 0

            if T % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            next_state, reward, done = env.step(action)  # Step
            accumulate_reward += reward
            if args.reward_clip > 0:
                reward = max(min(reward, args.reward_clip), -args.reward_clip)  # Clip rewards
            mem.append(state, action, reward, done)  # Append transition to memory

            # Train and test
            if T >= args.learn_start:
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)  # Anneal importance sampling weight β to 1

                if T % args.replay_frequency == 0:
                    dqn.learn(mem)  # Train with n-step distributional double-Q learning

                if T % args.evaluation_interval == 0:
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args, T, dqn, val_mem, metrics, results_dir)  # Test
                    writer.add_scalar('Eval/Reward', avg_reward, T)
                    writer.add_scalar('Eval/Q', avg_Q, T)
                    logger.info('T = ' + str(T) + ' / ' + str(args.T_max) + ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
                    dqn.train()  # Set DQN (online network) back to training mode

                    # If memory path provided, save it
                    if args.memory is not None:
                        save_memory(mem, args.memory, args.disable_bzip_memory)

                # Update target network
                if T % args.target_update == 0:
                    dqn.update_target_net()

                # Checkpoint the network
                if (args.checkpoint_interval != 0) and (T % args.checkpoint_interval == 0):
                    dqn.save(results_dir, 'checkpoint.pth')

            state = next_state

    env.close()
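# The training script above assumes a set of imports and an entry point that are not
# part of this excerpt. A minimal sketch, inferred from the calls it makes (trange from
# tqdm, SummaryWriter from torch.utils.tensorboard); the local module paths (env, agent,
# memory, test) are assumptions based on the names used, and the remaining helpers
# (Logger, parse_arguments, load_memory, save_memory) come from the surrounding project.
import os
import numpy as np
import torch
from tqdm import trange
from torch.utils.tensorboard import SummaryWriter

from env import Env
from agent import Agent
from memory import ReplayMemory
from test import test

if __name__ == '__main__':
    main()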
import sys
import random

import numpy as np

from env import Env

games = ['atlantis', 'breakout', 'pong', 'space_invaders', 'kung_fu_master',
         'boxing', 'seaquest', 'chopper_command']

# Estimate the mean episode return of a uniform-random policy for each game
for game in games:
    env = Env(game, 1234, 'cuda', 600, 4, False)
    acts = env.action_space() - 1
    all_rews = []
    for i in range(100):
        env.reset()
        done = False
        rew = 0
        while not done:
            _, r, done, _ = env.step(random.randint(0, acts))
            rew += r
        all_rews.append(rew)
        print('Ep: ', i, end='\r')
    print(game, np.mean(all_rews))
def ensemble_test(args, T, dqn, val_mem, metrics, results_dir, num_ensemble, evaluate=False):
    env = Env(args)
    env.eval()
    metrics['steps'].append(T)
    T_rewards, T_Qs = [], []
    action_space = env.action_space()

    # Test performance over several episodes
    done = True
    for _ in range(args.evaluation_episodes):
        while True:
            if done:
                state, reward_sum, done = env.reset(), 0, False

            q_tot = 0
            for en_index in range(num_ensemble):
                if en_index == 0:
                    q_tot = dqn[en_index].ensemble_q(state)
                else:
                    q_tot += dqn[en_index].ensemble_q(state)
            action = q_tot.argmax(1).item()

            state, reward, done = env.step(action)  # Step
            reward_sum += reward
            if args.render:
                env.render()
            if done:
                T_rewards.append(reward_sum)
                break
    env.close()

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        for en_index in range(num_ensemble):
            T_Qs.append(dqn[en_index].evaluate_q(state))

    avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)
    if not evaluate:
        # Save model parameters if improved
        if avg_reward > metrics['best_avg_reward']:
            metrics['best_avg_reward'] = avg_reward
            for en_index in range(num_ensemble):
                dqn[en_index].save(results_dir, name='%dth_model.pth' % (en_index))

        # Append to results and save metrics
        metrics['rewards'].append(T_rewards)
        metrics['Qs'].append(T_Qs)
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Plot
        _plot_line(metrics['steps'], metrics['rewards'], 'Reward', path=results_dir)
        _plot_line(metrics['steps'], metrics['Qs'], 'Q', path=results_dir)

    # Return average reward and Q-value
    return avg_reward, avg_Q
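# ensemble_test() assumes a _plot_line(xs, ys_population, title, path='') helper that is
# not shown here. A minimal matplotlib-based sketch of what such a helper could look like
# (the project's actual implementation may differ, e.g. it could use plotly): it plots the
# per-step mean of a population of values with a min/max band and writes a PNG to `path`.
import os
import numpy as np
import matplotlib
matplotlib.use('Agg')  # Headless backend so plotting works without a display
import matplotlib.pyplot as plt


def _plot_line(xs, ys_population, title, path=''):
    ys = np.asarray(ys_population, dtype=np.float32)  # Shape: (len(xs), population size)
    ys_mean, ys_min, ys_max = ys.mean(1), ys.min(1), ys.max(1)
    plt.figure()
    plt.fill_between(xs, ys_min, ys_max, alpha=0.3, label='min/max')
    plt.plot(xs, ys_mean, label='mean')
    plt.xlabel('Step')
    plt.ylabel(title)
    plt.legend()
    plt.savefig(os.path.join(path, title + '.png'))
    plt.close()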