def run(args):
    logger.configure(
        f'logs/{args["dataset"]}/pam/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)
    pool = mp.Pool(mp.cpu_count())
    pam_arg = args.copy()
    if 'margin' not in pam_arg.keys():
        best_margin = pool.map(find_best_margin, make_arg_list(pam_arg))
        best_margin = np.mean(best_margin, 0)
        if 'verbose' in pam_arg.keys() and pam_arg['verbose']:
            for i in range(len(best_margin)):
                logger.record_tabular(f'[PAM] margin = {MARGINS[i]}', best_margin[i])
            logger.dump_tabular()
        best_margin = MARGINS[best_margin.argmax()]
        logger.record_tabular('[PAM] best margin', best_margin)
        pam_arg['margin'] = best_margin
    results_pam = pool.map(run_pam, make_arg_list(pam_arg))
    logger.record_tabular('[PAM] accuracy mean', np.mean(results_pam))
    logger.record_tabular('[PAM] accuracy max', np.max(results_pam))
    logger.record_tabular('[PAM] accuracy min', np.min(results_pam))
    logger.record_tabular('[PAM] accuracy std', np.std(results_pam))
    logger.dump_tabular()
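# The runner above (and the C-SVM runner below) is a parallel grid search:
# each candidate hyperparameter is scored across several worker runs via
# pool.map, the per-candidate scores are averaged, and the argmax is kept.
# A minimal self-contained sketch of that pattern, with a hypothetical
# score() function standing in for the repo's find_best_margin:
import multiprocessing as mp

import numpy as np

MARGINS = [0.1, 0.5, 1.0, 2.0]

def score(margin_seed):
    margin, seed = margin_seed
    rng = np.random.RandomState(seed)
    # stand-in for a cross-validated accuracy at this margin
    return -abs(margin - 1.0) + 0.01 * rng.randn()

if __name__ == '__main__':
    jobs = [(m, s) for m in MARGINS for s in range(8)]
    with mp.Pool() as pool:
        scores = np.array(pool.map(score, jobs)).reshape(len(MARGINS), -1)
    best_margin = MARGINS[int(scores.mean(axis=1).argmax())]
    print('best margin:', best_margin)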
def main():
    parser = atari_arg_parser()
    parser.add_argument('--policy', help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
    parser.add_argument('--lrschedule', help='Learning rate schedule',
                        choices=['constant', 'linear'], default='constant')
    args = parser.parse_args()
    logger.configure()
    if args.policy == 'cnn':
        policy_fn = Convnet
    elif args.policy == 'lstm':
        policy_fn = Lstm
    elif args.policy == 'lnlstm':
        policy_fn = LnLstm
    num_env = 16
    env = VecFrameStack(make_atari_env(args.env, num_env, args.seed), 4)
    fit(policy_fn, env, args.seed,
        total_timesteps=int(args.num_timesteps * 1.1),
        lrschedule=args.lrschedule)
    env.close()
def configure_log_info(env_name, seed):
    """Configure log information."""
    cwd = os.path.join(os.getcwd(), 'log')
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    # create a unique log directory per run
    run_name = "{0}-{1}-{2}".format(env_name, seed, now)
    cwd = os.path.join(cwd, run_name)
    logger.configure(dir=cwd)
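# Illustrative call (values hypothetical): configure_log_info('Hopper-v2', seed=0)
# writes logs under ./log/Hopper-v2-0-<Mon-DD_HH:MM:SS>. Note this relies on
# `from datetime import datetime`; with a plain `import datetime` the call would
# have to be datetime.datetime.utcnow().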
def main():
    parser = atari_arg_parser()
    parser.add_argument('--policy', help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
    parser.add_argument('--lrschedule', help='Learning rate schedule',
                        choices=['constant', 'linear'], default='constant')
    parser.add_argument('--logdir', help='Directory for logging')
    args = parser.parse_args()
    logger.configure(args.logdir)
    num_cpu = 16
    env = make_atari_env(args.env, num_cpu, args.seed)
    if args.policy == 'cnn':
        policy_fn = AcerConvnet
    elif args.policy == 'lstm':
        policy_fn = AcerLstm
    else:
        print("Policy {} not implemented".format(args.policy))
        return
    fit(
        policy_fn, env, args.seed,
        total_timesteps=int(args.num_timesteps * 1.1),
        lrschedule=args.lrschedule
    )
    env.close()
def run(args):
    logger.configure(
        f'logs/{args["dataset"]}/svm/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)
    pool = mp.Pool(mp.cpu_count())
    svm_arg = args.copy()
    if 'C1' not in svm_arg.keys():
        best_c1 = pool.map(find_best_c1, make_arg_list(svm_arg))
        best_c1 = np.mean(best_c1, 0)
        if 'verbose' in svm_arg.keys() and svm_arg['verbose']:
            for i in range(len(best_c1)):
                logger.record_tabular(f'[C-SVM] C1 = {CLASS_WEIGHTS[i]}', best_c1[i])
            logger.dump_tabular()
        best_c1 = CLASS_WEIGHTS[best_c1.argmax()]
        logger.record_tabular('[C-SVM] best C1', best_c1)
        svm_arg['C1'] = best_c1
    results_svm = pool.map(run_c_svm, make_arg_list(svm_arg))
    logger.record_tabular('[C-SVM] accuracy mean', np.mean(results_svm))
    logger.record_tabular('[C-SVM] accuracy max', np.max(results_svm))
    logger.record_tabular('[C-SVM] accuracy min', np.min(results_svm))
    logger.record_tabular('[C-SVM] accuracy std', np.std(results_svm))
    logger.dump_tabular()
def main(_):
    # Configure parameters for each run and perform training/testing
    if params:
        for key, values in params.items():
            for param in values:
                try:
                    # Set parameter
                    setattr(FLAGS, key, param)
                    # Set log directory
                    dir = DIR_LOG + '_' + ALGO + '_' + key + '_' + param + '_' + FLAGS.level_name
                    # Configure loggers
                    logger.configure(dir=dir)
                    setattr(FLAGS, 'logdir', dir)
                    write(FLAGS.logdir)
                    # Run algorithm inside a virtual framebuffer
                    with Xvfb(width=1400, height=900, colordepth=24) as xvfb:
                        run()
                except Exception as e:
                    print(e)
    else:
        run()
def make_mujoco_env(env_id, seed):
    """Create a wrapped, monitored gym.Env for MuJoCo."""
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    logger.configure()
    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
    env.seed(seed)
    return env
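# Usage sketch (assumed, not from the source): under mpirun, each rank builds
# its own monitored env with a rank-offset seed, e.g.
#
#     env = make_mujoco_env('Hopper-v2', seed=0)
#     ob = env.reset()
#
# Calling logger.configure() inside the factory reconfigures the global logger
# on every call; most baselines-style code configures it once in main() instead.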
def __init__(self, encoder, decoder, datasets, optimizer, logdir):
    self.encoder = encoder
    self.decoder = decoder
    self.datasets = datasets
    self.optimizer = optimizer
    self._create_datasets()
    self._create_loss()
    self._create_optimizer()
    self._create_summary()
    self._create_evaluation(encoder, decoder)
    self._create_session(logdir)
    logger.configure(logdir, format_strs=['stdout', 'log'])
def main():
    args = pybullet_arg_parser().parse_args()
    logger.configure(
        format_strs=['stdout', 'log', 'csv'],
        log_suffix="EAC-{}-Seed_{}-sr_{}-se_{}-nbt_{}-START-".format(
            args.env, args.seed, args.scale_reward, args.scale_entropy,
            args.num_of_train))
    logger.log("Algorithm: EAC")
    logger.log("Environment: {}".format(args.env))
    logger.log("Seed: {}".format(args.seed))
    logger.log("scale-reward: {}".format(args.scale_reward))
    logger.log("numberOfTrain: {}".format(args.num_of_train))
    run_experiment(env=args.env,
                   seed=args.seed,
                   scale_reward=args.scale_reward,
                   scale_entropy=args.scale_entropy,
                   num_of_train=args.num_of_train)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)
    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = Monitor(env, logger.get_dir())
    env = wrap_deepmind(env)
    model = cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )
    fit(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )
    env.close()
    sess = tf.get_default_session()
    del sess
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import ppo1.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    env = wrap_deepmind(env)
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear')
    env.close()
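# The rank guard above is the standard MPI logging idiom: rank 0 keeps the
# default output formats, every other worker passes format_strs=[] so its log
# calls become no-ops. A minimal sketch of the same guard, assuming mpi4py and
# the baselines logger:
from mpi4py import MPI
from baselines import logger

def configure_mpi_logging():
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()                 # rank 0 writes the usual formats
    else:
        logger.configure(format_strs=[])   # other ranks stay silent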
def main():
    parser = arg_parser()
    parser.add_argument('--platform', help='environment choice',
                        choices=['atari', 'mujoco', 'humanoid', 'robotics'],
                        default='atari')
    platform_args, environ_args = parser.parse_known_args()
    platform = platform_args.platform
    logger.configure()

    # atari
    if platform == 'atari':
        parser = atari_arg_parser()
        parser.add_argument('--policy', help='Policy architecture',
                            choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn')
        args = parser.parse_known_args()[0]
        # fit(
        #     args.env,
        #     num_timesteps=args.num_timesteps,
        #     seed=args.seed,
        #     policy=args.policy
        # )
        sess = Agent().init_session().__enter__()
        env = VecFrameStack(make_atari_env(args.env, 8, args.seed), 4)
        policy = {'cnn': Convnet,
                  'lstm': Lstm,
                  'lnlstm': LnLstm,
                  'mlp': Mlp}[args.policy]
        fit(
            policy=policy,
            env=env,
            nsteps=128,
            nminibatches=8,
            lam=0.95,
            gamma=0.99,
            noptepochs=4,
            log_interval=1,
            ent_coef=.01,
            lr=lambda f: f * 2.5e-4,
            cliprange=lambda f: f * 0.1,
            total_timesteps=int(args.num_timesteps * 1.1)
        )
        sess.close()
        env.close()
        del sess

    # mujoco
    if platform == 'mujoco':
        args = mujoco_arg_parser().parse_known_args()[0]
        sess = Agent().init_session().__enter__()
        from utils.monitor import Monitor

        def make_env():
            env = make_mujoco_env(args.env, args.seed)
            # env = gym.make(env_id)
            env = Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)
        model = fit(
            policy=Mlp,
            env=env,
            nsteps=2048,
            nminibatches=32,
            lam=0.95,
            gamma=0.99,
            noptepochs=10,
            log_interval=1,
            ent_coef=0.0,
            lr=3e-4,
            cliprange=0.2,
            total_timesteps=args.num_timesteps
        )
        # return model, env
        if args.play:
            logger.log("Running trained model")
            obs = np.zeros((env.num_envs,) + env.observation_space.shape)
            obs[:] = env.reset()
            while True:
                actions = model.step(obs)[0]
                obs[:] = env.step(actions)[0]
                env.render()
        sess.close()
        env.close()
        del sess
def main(_):
    # create visualizer
    # visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    # log_dir = monitor.log_dir
    # visualizer.initialize(log_dir, None)
    saved_mean_reward = None

    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])

    # initialize env
    atari_env = AtariEnv(monitor)
    # screen_shot_subgoal(atari_env)

    # follow the DeepMind-style env: stack 4 frames and scale to float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)

    # get default tf_session
    sess = U.get_session()

    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space, env.action_space.n,
                                   controller_optimizer, scope='controller')
    controller = Controller(controller_network, env.action_space.n)

    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space, num_goals,
                                       metacontroller_optimizer, scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)

    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(
        schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
        initial_p=1.0,
        final_p=EXPLORATION_FINAL_EPS)

    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)

    # initialize critic
    critic = Critic(env.unwrapped)

    total_extrinsic_reward = []
    # for success rate
    total_goal_reached = np.zeros(num_goals, dtype=np.int32)
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()

    U.initialize()
    # initialize target networks in both controller and meta-controller
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)

    # load ckpt if present
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE:  # count number of episodes
        # init environment game play variables
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'], (1,) + init_ob['observation'].shape)
        desired_goal = metacontroller.sample_act(sess, observation, update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1

        # given the predicted goal, encode its bounding mask into the observation array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'], desired_goal)

        # NOTE: the code below verifies the mask was added correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:, :, i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)

        done = False
        reached_goal = False

        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']
            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(
                    total_goal_epsilon, total_goal_reached, total_goal_sampled,
                    desired_goal, total_step, EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1,) + ob_with_g.shape)
                primitive_action_t = controller.sample_act(
                    sess, ob_with_g_reshaped,
                    update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(ob_tp1['observation'], desired_goal)
                intrinsic_reward_t = critic.criticize(desired_goal, reached_goal,
                                                      primitive_action_t, done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t,
                                             intrinsic_reward_t, ob_with_g_tp1, done_t)

                # sample from replay_buffer1 to train the controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t = \
                    controller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as the 'supervised' target
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1, (1,) + ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t, primitive_actions_t,
                                            intrinsic_rewards_t, obs_with_g_tp1,
                                            dones_t, weights, q_tp1)

                # joint training: sample from replay_buffer2 to train the meta-controller
                if total_step >= WARMUP_STEPS:
                    L.log('joint training has started ----- step %d' % total_step)
                    init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t = \
                        metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = np.ones_like(extrinsic_rewards_t), None
                    # get q estimate for tp1 as the 'supervised' target
                    obs_terminate_in_g_reshaped = np.reshape(
                        obs_terminate_in_g, (1,) + obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(sess, init_obs, goals_t,
                                                    extrinsic_rewards_t,
                                                    obs_terminate_in_g, dones_t,
                                                    weights, q_tp1)

                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    # L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d' % total_step)
                    sess.run(controller.network.update_target_op)
                    # this is fine: the meta-controller DQN is not really trained
                    # until after WARMUP_STEPS
                    sess.run(metacontroller.network.update_target_op)

                extrinsic_rewards += extrinsic_reward_t
                ob_with_g = ob_with_g_tp1
                done = done_t
                total_step += 1

            # we are done or reached the goal: store the transition
            # (init_ob, goal, summed extrinsic rewards, current ob) in D2,
            # using the clean observation without the goal encoded
            # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards))
            metacontroller_replay_buffer.add(init_ob['observation'], desired_goal,
                                             extrinsic_rewards, ob_tp1['observation'], done)

            # if we are here then we have reached the desired goal
            if not done:
                # print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards))
                exploration_ep = 1.0
                total_goal_reached[env.unwrapped.achieved_goal] += 1
                if total_step >= WARMUP_STEPS:
                    t = total_step - WARMUP_STEPS
                    exploration_ep = exploration2.value(t)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1,) + ob_with_g.shape)
                # resample until the new goal differs from the one just achieved
                while env.unwrapped.achieved_goal == desired_goal:
                    desired_goal = metacontroller.sample_act(
                        sess, ob_with_g_reshaped, update_eps=exploration_ep)[0]
                env.unwrapped.desired_goal = desired_goal
                total_goal_sampled[desired_goal] += 1
                L.log('ep %d : achieved goal was %d ----- new goal --- %d'
                      % (ep, env.unwrapped.achieved_goal, desired_goal))
                # start again
                reached_goal = False

        # finish an episode
        total_extrinsic_reward.append(extrinsic_rewards)
        ep += 1
        mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1)

        if ep % monitor.print_freq == 0:
            L.record_tabular("steps", total_step)
            L.record_tabular("episodes", ep)
            L.record_tabular("mean 100 episode reward", mean_100ep_reward)
            L.dump_tabular()

        if total_step % monitor.ckpt_freq == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".format(
                    saved_mean_reward, mean_100ep_reward))
                U.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward

    # after training, restore the best saved model
    if model_saved:
        L.log('restored model with mean reward: %d' % saved_mean_reward)
        U.load_variables(model_file)
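# exploration2.value(t) above anneals epsilon linearly. What baselines'
# LinearSchedule computes is equivalent to this sketch:
def linear_schedule_value(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    # interpolate from initial_p to final_p, then hold final_p
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)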
def run(args):
    prefix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    logger.configure(f'logs/{args["dataset"]}/nn/{prefix}')
    logger.info(args)
    pool = mp.Pool(mp.cpu_count())
    nn_arg = args.copy()
    nn_arg.update(find_best_params(nn_arg))
    nn_arg.update(find_best_alpha_val(nn_arg))
    logger.record_tabular('[PEER] batchsize', nn_arg['batchsize'])
    logger.record_tabular('[PEER] learning rate', nn_arg['lr'])
    logger.record_tabular('[PEER] hidsize', nn_arg['hidsize'])
    logger.record_tabular('[PEER] alpha', nn_arg['alpha'])
    logger.dump_tabular()
    nn_arg['seed'] = 1
    run_nn_dmi(nn_arg)
    results_dmi = pool.map(run_nn_dmi, make_arg_list(nn_arg))
    results_surr = pool.map(run_nn_surr, make_arg_list(nn_arg))
    results_nn = pool.map(run_nn, make_arg_list(nn_arg))
    results_peer = pool.map(run_nn_peer, make_arg_list(nn_arg))
    results_symm = pool.map(run_nn_symm, make_arg_list(nn_arg))
    pool.close()
    pool.join()

    test_acc_bce = [res['val_acc'] for res in results_nn]
    test_acc_peer = [res['val_acc'] for res in results_peer]
    test_acc_surr = [res['val_acc'] for res in results_surr]
    test_acc_symm = [res['val_acc'] for res in results_symm]
    test_acc_dmi = [res['val_acc'] for res in results_dmi]
    plot([test_acc_bce, test_acc_peer, test_acc_surr, test_acc_symm, test_acc_dmi],
         ['cross entropy loss', 'peer loss', 'surrogate loss', 'symmetric loss', 'dmi loss'],
         title='Accuracy During Testing',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    train_acc_bce = [res['train_acc'] for res in results_nn]
    train_acc_peer = [res['train_acc'] for res in results_peer]
    train_acc_surr = [res['train_acc'] for res in results_surr]
    train_acc_symm = [res['train_acc'] for res in results_symm]
    train_acc_dmi = [res['train_acc'] for res in results_dmi]
    plot([train_acc_bce, train_acc_peer, train_acc_surr, train_acc_symm, train_acc_dmi],
         ['cross entropy loss', 'peer loss', 'surrogate loss', 'symmetric loss', 'dmi loss'],
         title='Accuracy During Training',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    loss_acc_surr = [res['loss'] for res in results_surr]
    loss_acc_bce = [res['loss'] for res in results_nn]
    loss_acc_peer = [res['loss'] for res in results_peer]
    loss_acc_symm = [res['loss'] for res in results_symm]
    loss_acc_dmi = [res['loss'] for res in results_dmi]
    plot([loss_acc_bce, loss_acc_peer, loss_acc_surr, loss_acc_symm, loss_acc_dmi],
         ['cross entropy loss', 'peer loss', 'surrogate loss', 'symmetric loss', 'dmi loss'],
         title='Loss',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    logger.record_tabular('[NN] with peer loss', np.mean(test_acc_peer, 0)[-1])
    logger.record_tabular('[NN] with surrogate loss', np.mean(test_acc_surr, 0)[-1])
    logger.record_tabular('[NN] with symmetric loss', np.mean(test_acc_symm, 0)[-1])
    logger.record_tabular('[NN] with dmi loss', np.mean(test_acc_dmi, 0)[-1])
    logger.record_tabular(f'[NN] with {args["loss"]} loss', np.mean(test_acc_bce, 0)[-1])
    logger.dump_tabular()
def main():
    # Parse input parameters
    args, unknown_args = parser.parse_known_args()
    args.num_steps = int(args.num_steps)
    unknown_args = parse_cmdline_kwargs(unknown_args)

    # Load config file
    load_yaml_config(args, 'learner')

    # Expose socket to actor(s)
    context = zmq.Context()
    weights_socket = context.socket(zmq.PUB)
    weights_socket.bind(f'tcp://*:{args.param_port}')

    _, agent = init_components(args, unknown_args)

    # Configure experiment directory
    create_experiment_dir(args, 'LEARNER-')
    save_yaml_config(args.exp_path / 'config.yaml', args, 'learner', agent)
    args.log_path = args.exp_path / 'log'
    args.ckpt_path = args.exp_path / 'ckpt'
    args.ckpt_path.mkdir()
    args.log_path.mkdir()
    logger.configure(str(args.log_path))

    # Record commit hash
    with open(args.exp_path / 'hash', 'w') as f:
        f.write(str(subprocess.run('git rev-parse HEAD'.split(),
                                   stdout=subprocess.PIPE).stdout.decode('utf-8')))

    # Variables to control the frequency of training
    receiving_condition = multiprocessing.Condition()
    num_receptions = multiprocessing.Value('i', 0)

    # Start memory pool in another process
    manager = MemPoolManager()
    manager.start()
    mem_pool = manager.MemPool(capacity=args.pool_size)
    Process(target=recv_data,
            args=(args.data_port, mem_pool, receiving_condition, num_receptions,
                  args.keep_training)).start()

    # Print throughput statistics
    Process(target=MultiprocessingMemPool.record_throughput,
            args=(mem_pool, args.record_throughput_interval)).start()

    freq = 0
    learn_flag = 0
    while True:
        if learn_flag == 0:
            weights_socket.send(pickle.dumps(agent.get_weights()))
        if len(mem_pool) >= args.batch_size:
            # Sync weights to actor
            weights = agent.get_weights()
            if hvd.rank() == 0:
                weights_socket.send(pickle.dumps(weights))
                if freq % args.ckpt_save_freq == 0:
                    if args.ckpt_save_type == 'checkpoint':
                        agent.save(args.ckpt_path / 'ckpt')
                    elif args.ckpt_save_type == 'weight':
                        with open(args.ckpt_path / 'weight.ckpt', 'wb') as f:
                            pickle.dump(weights, f)

            if args.keep_training:
                agent.learn(mem_pool.sample(size=args.batch_size))
            else:
                with receiving_condition:
                    while num_receptions.value < args.training_freq:
                        receiving_condition.wait()
                    data = mem_pool.sample(size=args.batch_size)
                    num_receptions.value -= args.training_freq
                # Training
                stat = agent.learn(data)
                learn_flag = 1
                if stat is not None:
                    for k, v in stat.items():
                        logger.record_tabular(k, v)
                logger.dump_tabular()
            freq += 1
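# The learner above broadcasts pickled weights on a ZeroMQ PUB socket. A
# minimal sketch of a matching subscriber, assuming the same pickle framing
# (illustrative only; the actors in this codebase actually poll checkpoint
# files via find_new_weights):
import pickle
import zmq

def recv_weights(ip, param_port):
    sub = zmq.Context().socket(zmq.SUB)
    sub.connect(f'tcp://{ip}:{param_port}')
    sub.setsockopt(zmq.SUBSCRIBE, b'')  # subscribe to every topic
    return pickle.loads(sub.recv())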
    avg_ret.append(avg_reward)
    return avg_ret, avg_pg_loss, avg_vf_loss, avg_latencies


if __name__ == "__main__":
    from env.mec_offloaing_envs.offloading_env import Resources
    from env.mec_offloaing_envs.offloading_env import OffloadingEnvironment
    from policies.meta_seq2seq_policy import Seq2SeqPolicy
    from samplers.seq2seq_sampler import Seq2SeqSampler
    from samplers.seq2seq_sampler_process import Seq2SeSamplerProcessor
    from baselines.vf_baseline import ValueFunctionBaseline
    from meta_algos.ppo_offloading import PPO
    from utils import utils, logger

    logger.configure(dir="./meta_evaluate_ppo_log/task_offloading",
                     format_strs=['stdout', 'log', 'csv'])

    resource_cluster = Resources(mec_process_capable=(10.0 * 1024 * 1024),
                                 mobile_process_capable=(1.0 * 1024 * 1024),
                                 bandwidth_up=7.0,
                                 bandwidth_dl=7.0)
    env = OffloadingEnvironment(
        resource_cluster=resource_cluster,
        batch_size=100,
        graph_number=100,
        graph_file_paths=[
            "./env/mec_offloaing_envs/data/meta_offloading_20/offload_random20_12/random.20."
        ],
        time_major=False)
def main():
    parser = arg_parser()
    parser.add_argument('--platform', help='environment choice',
                        choices=['atari', 'mujoco', 'humanoid', 'robotics'],
                        default='atari')
    platform_args, environ_args = parser.parse_known_args()
    platform = platform_args.platform

    # atari
    if platform == 'atari':
        args = atari_arg_parser().parse_known_args()[0]
        pi = fit(platform, args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    # mujoco
    if platform == 'mujoco':
        args = mujoco_arg_parser().parse_known_args()[0]
        logger.configure()
        pi = fit(platform, args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    # robotics
    if platform == 'robotics':
        args = robotics_arg_parser().parse_known_args()[0]
        pi = fit(platform, args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    # humanoid
    if platform == 'humanoid':
        logger.configure()
        parser = mujoco_arg_parser()
        parser.add_argument('--model-path',
                            default=os.path.join(logger.get_dir(), 'humanoid_policy'))
        parser.set_defaults(num_timesteps=int(2e7))
        args = parser.parse_known_args()[0]

        if not args.play:
            # train the model
            pi = fit(platform, args.env, num_timesteps=args.num_timesteps,
                     seed=args.seed, model_path=args.model_path)
        else:
            # construct the model object, load the pre-trained model and render
            from utils.cmd import make_mujoco_env
            pi = fit(platform, args.env, num_timesteps=1, seed=args.seed)
            Model().load_state(args.model_path)
            env = make_mujoco_env('Humanoid-v2', seed=0)
            ob = env.reset()
            while True:
                action = pi.act(stochastic=False, ob=ob)[0]
                ob, _, done, _ = env.step(action)
                env.render()
                if done:
                    ob = env.reset()
def train():
    processes = []
    if os.path.isdir(args.log_dir):
        ans = input('{} exists\ncontinue and overwrite? y/n: '.format(args.log_dir))
        if ans == 'n':
            return

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args), open(os.path.join(args.log_dir, 'params.json'), 'w'))

    torch.set_num_threads(2)
    start = time.time()
    policy_update_time, policy_forward_time = 0, 0
    step_time_env, step_time_total, step_time_rewarder = 0, 0, 0
    visualize_time = 0
    rewarder_fit_time = 0

    envs = RL2EnvInterface(args)
    if args.look:
        looker = Looker(args.log_dir)

    actor_critic = Policy(envs.obs_shape, envs.action_space,
                          base=RL2Base,
                          base_kwargs={'recurrent': True,
                                       'num_act_dim': envs.action_space.shape[0]})
    actor_critic.to(args.device)
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                     args.num_mini_batch, args.value_loss_coef,
                     args.entropy_coef, lr=args.lr, eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.obs_shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    rollouts.to(args.device)

    def copy_obs_into_beginning_of_storage(obs):
        obs_raw, obs_act, obs_rew, obs_flag = obs
        rollouts.obs[0].copy_(obs_raw)
        rollouts.obs_act[0].copy_(obs_act)
        rollouts.obs_rew[0].copy_(obs_rew)
        rollouts.obs_flag[0].copy_(obs_flag)

    for j in range(args.num_updates):
        obs = envs.reset()
        copy_obs_into_beginning_of_storage(obs)

        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, args.num_updates, args.lr)
        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(args.num_updates))

        episode_returns = [0 for i in range(args.trial_length)]
        episode_final_reward = [0 for i in range(args.trial_length)]
        i_episode = 0
        log_marginal = 0
        lambda_log_s_given_z = 0

        for step in range(args.num_steps):
            # Sample actions
            policy_forward_start = time.time()
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.get_obs(step),
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            policy_forward_time += time.time() - policy_forward_start

            # Observe reward and next obs
            step_total_start = time.time()
            obs, reward, done, info = envs.step(action)
            step_time_total += time.time() - step_total_start
            step_time_env += info['step_time_env']
            step_time_rewarder += info['reward_time']
            log_marginal += info['log_marginal'].sum().item()
            lambda_log_s_given_z += info['lambda_log_s_given_z'].sum().item()

            episode_returns[i_episode] += reward.sum().item()
            if all(done['episode']):
                episode_final_reward[i_episode] += reward.sum().item()
                i_episode = (i_episode + 1) % args.trial_length

            # If done then clean the history of observations
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done['trial']])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        assert all(done['trial'])
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.get_obs(-1),
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        policy_update_start = time.time()
        if args.rewarder != 'supervised' and envs.rewarder.fit_counter == 0 and not args.vae_load:
            value_loss, action_loss, dist_entropy = 0, 0, 0
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
        policy_update_time += time.time() - policy_update_start
        rollouts.after_update()

        # metrics
        trajectories_pre = envs.trajectories_pre_current_update
        state_entropy_pre = calculate_state_entropy(args, trajectories_pre)
        trajectories_post = envs.trajectories_post_current_update
        state_entropy_post = calculate_state_entropy(args, trajectories_post)

        return_avg = rollouts.rewards.sum() / args.trials_per_update
        reward_avg = return_avg / (args.trial_length * args.episode_length)
        log_marginal_avg = log_marginal / args.trials_per_update / (args.trial_length * args.episode_length)
        lambda_log_s_given_z_avg = lambda_log_s_given_z / args.trials_per_update / (args.trial_length * args.episode_length)

        num_steps = (j + 1) * args.num_steps * args.num_processes
        num_episodes = num_steps // args.episode_length
        num_trials = num_episodes // args.trial_length

        logger.logkv('state_entropy_pre', state_entropy_pre)
        logger.logkv('state_entropy_post', state_entropy_post)
        logger.logkv('value_loss', value_loss)
        logger.logkv('action_loss', action_loss)
        logger.logkv('dist_entropy', dist_entropy)
        logger.logkv('return_avg', return_avg.item())
        logger.logkv('reward_avg', reward_avg.item())
        logger.logkv('steps', (j + 1) * args.num_steps * args.num_processes)
        logger.logkv('episodes', num_episodes)
        logger.logkv('trials', num_trials)
        logger.logkv('policy_updates', (j + 1))
        logger.logkv('time', time.time() - start)
        logger.logkv('policy_forward_time', policy_forward_time)
        logger.logkv('policy_update_time', policy_update_time)
        logger.logkv('step_time_rewarder', step_time_rewarder)
        logger.logkv('step_time_env', step_time_env)
        logger.logkv('step_time_total', step_time_total)
        logger.logkv('visualize_time', visualize_time)
        logger.logkv('rewarder_fit_time', rewarder_fit_time)
        logger.logkv('log_marginal_avg', log_marginal_avg)
        logger.logkv('lambda_log_s_given_z_avg', lambda_log_s_given_z_avg)
        for i_episode in range(args.trial_length):
            logger.logkv('episode_return_avg_{}'.format(i_episode),
                         episode_returns[i_episode] / args.trials_per_update)
            logger.logkv('episode_final_reward_{}'.format(i_episode),
                         episode_final_reward[i_episode] / args.trials_per_update)

        if (j % args.save_period == 0 or j == args.num_updates - 1) and args.log_dir != '':
            save_model(args, actor_critic, envs, iteration=j)

        if not args.vae_freeze and j % args.rewarder_fit_period == 0:
            rewarder_fit_start = time.time()
            envs.fit_rewarder()
            rewarder_fit_time += time.time() - rewarder_fit_start

        if (j % args.vis_period == 0 or j == args.num_updates - 1) and args.log_dir != '':
            visualize_start = time.time()
            if args.look:
                eval_return_avg, eval_episode_returns, eval_episode_final_reward = looker.look(iteration=j)
                logger.logkv('eval_return_avg', eval_return_avg)
                for i_episode in range(args.trial_length):
                    logger.logkv('eval_episode_return_avg_{}'.format(i_episode),
                                 eval_episode_returns[i_episode] / args.trials_per_update)
                    logger.logkv('eval_episode_final_reward_{}'.format(i_episode),
                                 eval_episode_final_reward[i_episode] / args.trials_per_update)
            if args.plot:
                p = Popen('python visualize.py --log-dir {}'.format(args.log_dir), shell=True)
                processes.append(p)
            visualize_time += time.time() - visualize_start

        logger.dumpkvs()
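# rollouts.compute_returns(next_value, use_gae, gamma, tau) above is the usual
# GAE(lambda) recursion, with tau playing the role of lambda and masks zeroing
# the bootstrap across trial boundaries. A minimal sketch (assumed shapes
# [T, num_envs, 1]; not the RolloutStorage implementation itself):
import torch

def gae_returns(rewards, values, next_value, masks, gamma, tau):
    returns = torch.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(rewards.size(0))):
        v_next = next_value if t == rewards.size(0) - 1 else values[t + 1]
        delta = rewards[t] + gamma * v_next * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]
    return returns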
def run_one_agent(index, args, unknown_args, actor_status):
    from tensorflow.keras.backend import set_session
    import tensorflow.compat.v1 as tf

    # Set 'allow_growth'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Connect to learner
    context = zmq.Context()
    context.linger = 0  # For removing linger behavior
    socket = context.socket(zmq.REQ)
    socket.connect(f'tcp://{args.ip}:{args.data_port}')

    # Initialize environment and agent instance
    env, agent = init_components(args, unknown_args)

    # Configure logging only in one process
    if index == 0:
        logger.configure(str(args.log_path))
        save_yaml_config(args.exp_path / 'config.yaml', args, 'actor', agent)
    else:
        logger.configure(str(args.log_path), format_strs=[])

    # Create local queues for collecting data
    transitions = []  # A list to store raw transitions within an episode
    mem_pool = MemPool()  # A pool to store prepared training data

    # Initialize values
    model_id = -1
    episode_rewards = [0.0]
    episode_lengths = [0]
    num_episodes = 0
    mean_10ep_reward = 0
    mean_10ep_length = 0
    send_time_start = time.time()

    state = env.reset()
    for step in range(args.num_steps):
        # Do some updates
        agent.update_sampling(step, args.num_steps)

        # Sample action
        action, extra_data = agent.sample(state)
        next_state, reward, done, info = env.step(action)

        # Record current transition
        transitions.append((state, action, reward, next_state, done, extra_data))
        episode_rewards[-1] += reward
        episode_lengths[-1] += 1

        state = next_state

        is_terminal = done or episode_lengths[-1] >= args.max_episode_length > 0
        if is_terminal or len(mem_pool) + len(transitions) >= args.max_steps_per_update:
            # Current episode is terminated or a trajectory of enough training data is collected
            data = agent.prepare_training_data(transitions)
            transitions.clear()
            mem_pool.push(data)

            if is_terminal:
                # Log information at the end of episode
                num_episodes = len(episode_rewards)
                mean_10ep_reward = round(np.mean(episode_rewards[-10:]), 2)
                mean_10ep_length = round(np.mean(episode_lengths[-10:]), 2)
                episode_rewards.append(0.0)
                episode_lengths.append(0)

                # Reset environment
                state = env.reset()

        if len(mem_pool) >= args.max_steps_per_update:
            # Send training data after enough training data (>= 'args.max_steps_per_update') is collected
            post_processed_data = agent.post_process_training_data(mem_pool.sample())
            socket.send(serialize(post_processed_data).to_buffer())
            socket.recv()
            mem_pool.clear()

            send_data_interval = time.time() - send_time_start
            send_time_start = time.time()

            if num_episodes > 0:
                # Log information
                logger.record_tabular("iteration", (step + 1) // args.max_steps_per_update)
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean 10 episode reward", mean_10ep_reward)
                logger.record_tabular("mean 10 episode length", mean_10ep_length)
                logger.record_tabular("send data fps",
                                      args.max_steps_per_update // send_data_interval)
                logger.record_tabular("send data interval", send_data_interval)
                logger.dump_tabular()

        # Update weights
        new_weights, model_id = find_new_weights(model_id, args.ckpt_path)
        if new_weights is not None:
            agent.set_weights(new_weights)

    actor_status[index] = 1
def fit(environ, env_id, num_timesteps, seed, model_path=None):
    # atari
    if environ == 'atari':
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
        set_global_seeds(workerseed)
        env = make_atari(env_id)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Cnn(name=name, ob_space=ob_space, ac_space=ac_space)

        env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)
        env = wrap_deepmind(env)
        env.seed(workerseed)
        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    timesteps_per_actorbatch=256,
                    clip_param=0.2, entcoeff=0.01,
                    optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                    gamma=0.99, lam=0.95,
                    max_timesteps=int(num_timesteps * 1.1),
                    schedule='linear')
        env.close()
        sess.close()
        return pi

    # mujoco
    if environ == 'mujoco':
        from utils.cmd import make_mujoco_env
        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=64, num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        pi = PPOSGD(
            env, policy_fn,
            env.observation_space, env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95,
            schedule='linear',
        )
        env.close()
        sess.close()
        return pi

    if environ == 'humanoid':
        import gym
        from utils.cmd import make_mujoco_env
        env_id = 'Humanoid-v2'

        class RewScale(gym.RewardWrapper):
            def __init__(self, env, scale):
                gym.RewardWrapper.__init__(self, env)
                self.scale = scale

            def reward(self, r):
                return r * self.scale

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=64, num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        # the reward scale below was the best found in a simple random search;
        # it is good enough to make the humanoid walk, but it may not be optimal
        env = RewScale(env, 0.1)
        pi = PPOSGD(
            env, policy_fn,
            env.observation_space, env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95,
            schedule='linear',
        )
        env.close()
        if model_path:
            Model().save_state(model_path)
        sess.close()
        return pi

    if environ == 'robotics':
        import mujoco_py
        from utils.cmd import make_robotics_env
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        mujoco_py.ignore_mujoco_warnings().__enter__()
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=256, num_hid_layers=3)

        pi = PPOSGD(
            env, policy_fn,
            env.observation_space, env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
            gamma=0.99, lam=0.95,
            schedule='linear',
        )
        env.close()
        sess.close()
        return pi
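# Illustrative dispatch through the fit() wrapper above (argument values
# hypothetical):
#
#     pi = fit('mujoco', 'Hopper-v2', num_timesteps=int(1e6), seed=0)
#     pi = fit('humanoid', 'Humanoid-v2', num_timesteps=int(2e7), seed=0,
#              model_path='humanoid_policy')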
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    writer = SummaryWriter(log_dir=args.log_dir)
    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args),
              open(os.path.join(args.log_dir, 'params.json'), 'w'),
              indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, policy, baseline,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]), batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]), batch)
        logger.logkv('return_avg_pre',
                     total_rewards([ep.rewards for ep, _ in episodes]))
        logger.logkv('return_avg_post',
                     total_rewards([ep.rewards for _, ep in episodes]))
        logger.dumpkvs()