import datetime
import os

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.misc.instrument import run_experiment_lite
from rllab.policies.categorical_gru_policy import CategoricalGRUPolicy
# Project-local helpers (PolicyLoader, RCCarSlideTurn, RLPyEnv, HRLEnv,
# NoisyObservationEnv) are assumed to be importable from this repository.


def slideturn_noisy_rec(val=0.1, directory="./Results/Car/NoisyObsRec500/",
                        exp_name="Cap_", save=False):
    policies = [PolicyLoader("models/slideturn_experiment/" + path)
                for path in ['agent0', 'agent1']]
    rccar = RCCarSlideTurn(noise=0.)  # remove process noise
    domain = RLPyEnv(rccar)
    original_env = HRLEnv(domain, policies)
    env = NoisyObservationEnv(original_env, obs_noise=val)
    policy = CategoricalGRUPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    dir_name = os.path.join(directory, exp_name)
    for i in range(1):
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=env.horizon,
            n_itr=500,
            discount=0.9,
            step_size=0.01,
            # plot=True,
        )
        # algo.train()
        # rollout(env, policy)
        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            script="scripts/run_experiment_lite_rl.py",
            exp_name=exp_name + timestamp,
            log_dir=os.path.join(dir_name, timestamp) if save else './Results/Tmp2',
            # Specifies the seed for the experiment. If this is not provided,
            # a random seed will be used.
            # plot=True,
        )
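
# Usage sketch (hypothetical): sweep a few observation-noise levels, saving
# each run under its own experiment name. The noise values and exp_name
# pattern are illustrative, not taken from the original experiments.
for noise in (0.05, 0.1, 0.2):
    slideturn_noisy_rec(val=noise, exp_name='Cap_%g_' % noise, save=True)
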
def train(self, gym_env, n_eps=10):
    env = MyGymEnv(gym_env)
    policy = CategoricalGRUPolicy(env_spec=env.spec, hidden_dim=32,
                                  state_include_action=False)
    self.raw_policy = LoggedTRPO(
        env=env,
        policy=policy,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        batch_size=4000,
        max_path_length=env.env.n_steps,
        n_itr=n_eps,
        discount=0.99,
        step_size=0.01,
        verbose=False)
    self.raw_policy.train()
    return self.raw_policy.rew_chkpts
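
# Usage sketch (hypothetical): `tutor` stands in for an instance of the
# RLTutor-style class that owns train() above, and `my_gym_env` is a gym
# environment constructed elsewhere; neither name is from the original code.
rew_chkpts = tutor.train(my_gym_env, n_eps=10)
for i, r in enumerate(rew_chkpts):
    print('iteration %d: reward checkpoint %s' % (i, r))
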
    'num_questions': NumQ,
    'num_concepts': Concepts,
    'candidate_exercises': candidate_exercises
}

logging.info("")
logging.info("Number of candidate exercises: " + str(len(candidate_exercises)))
logging.info("Number of epochs: " + str(n_eps))
logging.info("Number of steps: " + str(n_steps))

env = DKVEnv(**env_kwargs, reward_func='likelihood')
rl_env = MyGymEnv(make_rl_student_env(env))
policy = CategoricalGRUPolicy(env_spec=rl_env.spec, hidden_dim=32,
                              state_include_action=False)
raw_policy = LoggedTRPO(env=rl_env,
                        policy=policy,
                        baseline=LinearFeatureBaseline(env_spec=rl_env.spec),
                        batch_size=4000,
                        max_path_length=rl_env.env.n_steps,
                        n_itr=n_eps,
                        discount=0.99,
                        step_size=0.01,
                        verbose=False)
agent = RLTutor(rl_env=rl_env, raw_policy=raw_policy)
reward = agent.train()
print(evaluation(agent))
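
# Optional persistence sketch: the trained GRU policy can be pickled the same
# way the pixelworld experiment below does. The import and the output filename
# are assumptions, not part of the original script.
import joblib
joblib.dump(policy, 'rl_tutor_policy.pkl')
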
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--conv', action='store_true', default=False)
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information '
                             '(in a horizontal format)')
    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)
    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.sample_maps:
        map_pool = np.load(args.map_file)
    else:
        if args.map_type == 'rectangle':
            env_map = TwoDMaps.rectangle_map(*map(int, args.rectangle.split(',')))
        elif args.map_type == 'complex':
            env_map = TwoDMaps.complex_map(*map(int, args.rectangle.split(',')))
        else:
            raise NotImplementedError()
        map_pool = [env_map]

    env = PursuitEvade(map_pool,
                       n_evaders=args.n_evaders,
                       n_pursuers=args.n_pursuers,
                       obs_range=args.obs_range,
                       n_catch=args.n_catch,
                       train_pursuit=args.train_pursuit,
                       urgency_reward=args.urgency,
                       surround=args.surround,
                       sample_maps=args.sample_maps,
                       constraint_window=args.constraint_window,
                       flatten=args.flatten,
                       reward_mech=args.reward_mech,
                       catchr=args.catchr,
                       term_pursuit=args.term_pursuit)

    env = RLLabEnv(StandardizedEnv(env,
                                   scale_reward=args.reward_scale,
                                   enable_obsnorm=False),
                   mode=args.control)

    if args.recurrent:
        if args.conv:
            feature_network = ConvNetwork(
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16, 16),
                conv_filter_sizes=(3, 3, 3),
                conv_strides=(1, 1, 1),
                conv_pads=('VALID', 'VALID', 'VALID'),
                hidden_sizes=(64,),
                hidden_nonlinearity=NL.rectify,
                output_nonlinearity=NL.softmax)
        else:
            feature_network = MLP(
                input_shape=(env.spec.observation_space.flat_dim +
                             env.spec.action_space.flat_dim,),
                output_dim=5,
                hidden_sizes=(128, 128, 128),
                hidden_nonlinearity=NL.tanh,
                output_nonlinearity=None)
        if args.recurrent == 'gru':
            policy = CategoricalGRUPolicy(
                env_spec=env.spec,
                feature_network=feature_network,
                # Use the first entry of --policy_hidden_sizes as the GRU state
                # dimension; int() on the raw '128,128' string would raise.
                hidden_dim=args.hidden_sizes[0])
        else:
            raise NotImplementedError(args.recurrent)
    elif args.conv:
        feature_network = ConvNetwork(
            input_shape=env.spec.observation_space.shape,
            output_dim=5,
            conv_filters=(8, 16, 16),
            conv_filter_sizes=(3, 3, 3),
            conv_strides=(1, 1, 1),
            conv_pads=('valid', 'valid', 'valid'),
            hidden_sizes=(64,),
            hidden_nonlinearity=NL.rectify,
            output_nonlinearity=NL.softmax)
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      prob_network=feature_network)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        mode=args.control,
    )
    algo.train()
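
# Example invocation sketch; the script name and flag values are illustrative:
#   python pursuit_experiment.py --n_pursuers 4 --obs_range 5 \
#       --recurrent gru --policy_hidden_sizes 128 --snapshot_mode last --seed 42
if __name__ == '__main__':
    main()
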
def do_pixelworld_experiment(dataset=default_dataset,
                             visualize_policy=True,
                             load_policy=False,
                             policy_filename='policy100.pkl'):
    """Train by reinforcement learning in a pixelworld training environment,
    optionally visualizing the policy in test and training environments.

    Parameters
    ----------
    dataset : str, optional
        the name of the dataset from which to generate the environment,
        or an actual dataset
    visualize_policy : bool, optional
        whether to visualize the policy acting within test/train environments
    load_policy : bool, optional
        whether to load a saved policy from disk
    policy_filename : str, optional
        filename to store/load policy to/from
    """
    train_env, test_env = get_envs(dataset, rllab_env=True)

    if load_policy:
        policy = joblib.load(policy_filename)
    else:
        # Train policy in train_env
        policy = CategoricalGRUPolicy(hidden_sizes=[32], env_spec=train_env.spec)
        baseline = LinearFeatureBaseline(env_spec=train_env.spec)
        algo = NPO(
            env=train_env,
            policy=policy,
            baseline=baseline,
            max_path_length=10000,
            whole_paths=True,
            n_itr=100,
            batch_size=2000)
        np.random.seed(137)
        algo.train()
        joblib.dump(policy, policy_filename)

    # Visualize the trained policy by rollouts in test and training environments
    if visualize_policy:
        delay = 0.5  # alternatives: 0.1, 2
        num_envs = 10  # alternative: 100
        for env_name, env in [('train', train_env), ('test', test_env)]:
            print()
            num_positive = 0
            tot_tot_r = 0.0
            for seed in range(num_envs):
                print(env_name, 'rollout', seed)
                np.random.seed(seed)
                policy.reset()
                o = env.reset()
                d = False
                tot_r = 0.0
                env.render()
                time.sleep(delay)
                while not d:
                    a, info = policy.get_action(o)
                    o, r, d, env_info = env.step(a)
                    tot_r += r
                    print(' step(%s) -> reward %s' % (a, r))
                    env.render()
                    time.sleep(delay)
                if a == 1:  # SIG1
                    positive = r > 0
                else:
                    positive = r < 0
                num_positive += positive
                print(env_name, 'rollout done (%s, %s)' % (tot_r, positive))
                tot_tot_r += tot_r
            print(env_name, 'avg tot r', tot_tot_r / num_envs)
            print(env_name, 'avg positive', num_positive / float(num_envs))
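
# Usage sketch (hypothetical): reload a previously trained policy from disk
# and only run the visualization rollouts; `default_dataset` is the
# module-level default used by the function above.
do_pixelworld_experiment(load_policy=True,
                         visualize_policy=True,
                         policy_filename='policy100.pkl')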