예제 #1
0
def find_deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories(expert_trajectories, learner_policy, limit_trajs, data_subsamp_freq, ipython_after_eval):
	# Load the learner's policy
	policy_file, policy_key = util.split_h5_name(learner_policy)
	print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
	with h5py.File(policy_file, 'r') as f:
	    train_args = json.loads(f.attrs['args'])
	    dset = f[policy_key]
	    import pprint
	    pprint.pprint(dict(dset.attrs))

	# Initialize the MDP
	env_name = train_args['env_name']
	print 'Loading environment', env_name
	mdp = rlgymenv.RLGymMDP(env_name)
	util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

	# Initialize the policy and load its parameters
	enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
	if isinstance(mdp.action_space, policyopt.ContinuousSpace):
	    policy_cfg = rl.GaussianPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        min_stdev=0.,
	        init_logstdev=0.,
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
	else:
	    policy_cfg = rl.GibbsPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

	policy.load_h5(policy_file, policy_key)

	# Load the expert trajectories
	exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = imitate_mj.load_dataset(
	    expert_trajectories, limit_trajs, data_subsamp_freq)
	assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
	assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
	assert ext_Bstacked.ndim == 1



	# Generate the actions according to the learner's policy for the expert's observations
	learner_actions_Bstacked_Da = policy.sample_actions(exobs_Bstacked_Do)[0]

	# Calcualating the deviation histogram:
	action_deviations = np.linalg.norm(exa_Bstacked_Da - learner_actions_Bstacked_Da, axis=1)

	# Plot the histogram
	# sns.kdeplot(action_deviations,shade=True)

	# FIXME: Uncomment the following
	plt.figure()
	plt.hist(action_deviations, bins=100)
	plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories.png')
	plt.show()	

	if ipython_after_eval:
		import IPython; IPython.embed()    
예제 #2
0
def find_deviation_of_agent_actions_from_expert_actions_for_underperforming_trajectories(learner_trajectories, expert_policy, lower_bound_reward, ipython_after_eval, generate_plot):
	obs,a,r,l = find_underperforming_trajectories(learner_trajectories, lower_bound_reward)
	print(type(obs))
	# Load the expert's policy
	policy_file, policy_key = util.split_h5_name(expert_policy)
	print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
	with h5py.File(policy_file, 'r') as f:
	    train_args = json.loads(f.attrs['args'])
	    dset = f[policy_key]
	    import pprint
	    pprint.pprint(dict(dset.attrs))

	# Initialize the MDP
	env_name = train_args['env_name']
	print 'Loading environment', env_name
	mdp = rlgymenv.RLGymMDP(env_name)
	util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

	# Initialize the policy and load its parameters
	enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
	if isinstance(mdp.action_space, policyopt.ContinuousSpace):
	    policy_cfg = rl.GaussianPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        min_stdev=0.,
	        init_logstdev=0.,
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
	else:
	    policy_cfg = rl.GibbsPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

	policy.load_h5(policy_file, policy_key)
	
	# Generate the actions according to the expert's policy for the observations in the underperforming trajs

	expert_actions = policy.sample_actions(obs.reshape((-1,obs.shape[-1])))[0].reshape((-1,a.shape[1],a.shape[2]))
	

	# Calcualating the deviation histogram:
	action_deviations = np.linalg.norm(expert_actions.reshape((-1,a.shape[-1])) - a.reshape((-1,a.shape[-1])), axis=1)
	if generate_plot:
		plt.figure()
		plt.hist(action_deviations, bins=100)
		plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_underperforming_learner_trajectories.png')
		plt.show()	
	if ipython_after_eval:
		import IPython; IPython.embed() 
예제 #3
0
def load_trained_policy_and_mdp(env_name, policy_state_str):
    """ Creates the specialized MDP and policy objects needed to sample expert
    trajectories for a given environment.

    Returns:
        mdp: An instance of `RLGymMDP`, similar to a real gym env except with
            customized obs/action spaces and an internal `RLGyMSim` object.
        policy: The agent's policy, encoded as either rl.GaussianPolicy for
            continuous actions, or rl.GibbsPolicy for discrete actions.
        train_args: A dictionary of arguments (like argparse dicts) based on the
            trained policy's TRPO run.
    """
    import gym
    import policyopt
    from policyopt import nn, rl
    from environments import rlgymenv

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(policy_state_str)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])

    # Initialize the MDP
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    print 'MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)

    # Initialize the policy
    nn.reset_global_scope()
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    # Load the policy parameters
    policy.load_h5(policy_file, policy_key)

    return mdp, policy, train_args
예제 #4
0
def load_trained_policy_and_mdp(env_name, policy_state_str):
    """Rebuild a saved policy together with the MDP for `env_name`.

    Args:
        env_name: environment name handed to `rlgymenv.RLGymMDP`.
        policy_state_str: policy spec string; `util.split_h5_name` splits it
            into the h5 file name and the key inside that file.

    Returns:
        A `(mdp, policy, train_args)` tuple: the `RLGymMDP` instance, the
        policy (`rl.GaussianPolicy` for a `policyopt.ContinuousSpace` action
        space, `rl.GibbsPolicy` otherwise) with its parameters loaded, and
        the dict decoded from the h5 file's 'args' attribute.
    """
    import gym
    import policyopt
    from policyopt import nn, rl
    from environments import rlgymenv

    # Training-time arguments live as a JSON string on the snapshot file.
    policy_file, policy_key = util.split_h5_name(policy_state_str)
    print('Loading policy parameters from %s in %s' % (policy_key, policy_file))
    with h5py.File(policy_file, 'r') as snapshot:
        train_args = json.loads(snapshot.attrs['args'])

    # Environment wrapper.
    print('Loading environment', env_name)
    mdp = rlgymenv.RLGymMDP(env_name)
    space_sizes = (mdp.obs_space.dim, mdp.action_space.storage_size)
    print('MDP observation space, action space sizes: %d, %d\n' % space_sizes)

    # Start from a clean variable scope before creating the network.
    nn.reset_global_scope()

    # Either an explicit flag or an 'obsnorm_mode' string may be present.
    if 'enable_obsnorm' in train_args:
        use_obsnorm = bool(train_args['enable_obsnorm'])
    else:
        use_obsnorm = train_args['obsnorm_mode'] != 'none'

    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=use_obsnorm)
        policy = rl.GaussianPolicy(
            cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=use_obsnorm)
        policy = rl.GibbsPolicy(
            cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    # Pull the trained parameters into the freshly built network.
    policy.load_h5(policy_file, policy_key)

    return mdp, policy, train_args
예제 #5
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('output_dir', type=str)
    parser.add_argument('--deterministic', default=1, type=int)
    parser.add_argument('--max_steps', type=int, required=True)
    parser.add_argument('--env_name', type=str, default=None)
    args = parser.parse_args()

    util.mkdir_p(args.output_dir)
    assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir
    print 'Writing to', args.output_dir

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name'] if args.env_name is None else args.env_name
    print 'Loading environment', env_name
    mdp = rllabenv.RLLabMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    util.header('Max steps is {}'.format(args.max_steps))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Animate
    sim = mdp.new_sim()
    steps = 0
    exit = False
    while not exit:
        sim.reset()
        while not sim.done:
            a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
            sim.step(a)
            sim.draw()
            viewer = sim.env.viewer
            data, w, h = viewer.get_image()
            image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:]
            cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1])

            print steps
            steps += 1

            if steps >= args.max_steps:
                exit = True
                break
예제 #6
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int,
                        default=None)  # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']
                          ) if 'enable_obsnorm' in train_args else train_args[
                              'obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    if args.eval_only:
        n = 50
        print 'Evaluating based on {} trajs'.format(n)

        if False:
            eval_trajbatch = mdp.sim_mp(
                policy_fn=lambda obs_B_Do: policy.sample_actions(
                    obs_B_Do, args.deterministic),
                obsfeat_fn=lambda obs: obs,
                cfg=policyopt.SimConfig(min_num_trajs=n,
                                        min_total_sa=-1,
                                        batch_size=None,
                                        max_traj_len=args.max_traj_len))
            returns = eval_trajbatch.r.padded(fill=0.).sum(axis=1)
            avgr = eval_trajbatch.r.stacked.mean()
            lengths = np.array([len(traj) for traj in eval_trajbatch])
            ent = policy._compute_actiondist_entropy(
                eval_trajbatch.adist.stacked).mean()
            print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
            print 'avgr: {}'.format(avgr)
            print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
            print 'ent: {}'.format(ent)
            print returns
        else:
            returns = []
            lengths = []
            sim = mdp.new_sim()

            for i_traj in xrange(n):
                iteration = 0
                sim.reset()
                totalr = 0.
                l = 0
                while not sim.done and iteration < args.max_traj_len:
                    a = policy.sample_actions(sim.obs[None, :],
                                              bool(
                                                  args.deterministic))[0][0, :]
                    r = sim.step(a)
                    totalr += r
                    l += 1
                    iteration += 1

                print i_traj, n, totalr, iteration
                returns.append(totalr)
                lengths.append(l)

            print 'Avg Return: ', np.array(returns).mean()
            print 'Std Return: ', np.array(returns).std()
        #import IPython; IPython.embed()

    elif args.out is not None:
        # Sample trajs and write to file
        print 'Saving traj samples to file: {}'.format(args.out)

        assert not os.path.exists(args.out)
        assert args.count > 0
        # Simulate to create a trajectory batch
        util.header('Sampling {} trajectories of maximum length {}'.format(
            args.count, args.max_traj_len))
        trajs = []
        for i in tqdm.trange(args.count):
            trajs.append(
                mdp.sim_single(
                    lambda obs: policy.sample_actions(obs, args.deterministic),
                    lambda obs: obs, args.max_traj_len))
        trajbatch = policyopt.TrajBatch.FromTrajs(trajs)

        print
        print 'Average return:', trajbatch.r.padded(fill=0.).sum(axis=1).mean()

        # Save the trajs to a file
        with h5py.File(args.out, 'w') as f:

            def write(name, a):
                # chunks of 128 trajs each
                f.create_dataset(name,
                                 data=a,
                                 chunks=(min(128, a.shape[0]), ) + a.shape[1:],
                                 compression='gzip',
                                 compression_opts=9)

            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B',
                  np.array([len(traj) for traj in trajbatch], dtype=np.int32))

            # Also save args to this script
            argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            f.attrs['args'] = argstr

    else:
        # Animate
        sim = mdp.new_sim()
        raw_obs, normalized_obs = [], []

        tret_list = []
        iteration = 0
        while iteration < 50:
            sim.reset()
            totalr = 0.
            steps = 0
            while not sim.done:
                raw_obs.append(sim.obs[None, :])
                normalized_obs.append(
                    policy.compute_internal_normalized_obsfeat(
                        sim.obs[None, :]))

                a = policy.sample_actions(sim.obs[None, :],
                                          args.deterministic)[0][0, :]
                r = sim.step(a)
                totalr += r
                steps += 1
                sim.draw()

                if steps % args.max_traj_len == 0:
                    tmpraw = np.concatenate(raw_obs, axis=0)
                    tmpnormed = np.concatenate(normalized_obs, axis=0)
                    print 'raw mean, raw std, normed mean, normed std'
                    print np.stack([
                        tmpraw.mean(0),
                        tmpraw.std(0),
                        tmpnormed.mean(0),
                        tmpnormed.std(0)
                    ])
                    break
            print 'Steps: %d, return: %.5f' % (steps, totalr)
            tret_list.append(totalr)
            iteration += 1

        print 'Avg Return: ', np.array(tret_list).mean()
        print 'Std Return: ', np.array(tret_list).std()
예제 #7
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--resume_training', action='store_true', help="Resume training from a checkpoint: --policy_checkpoint. Currently only supports GAIL with nn policy, reward and vf") 
    parser.add_argument('--checkpoint', type=str, help="Load from checkpoint if provided and if --resume_training") 
    parser.add_argument('--limit_trajs', type=int, required=True, help="How many expert trajectories to be used for training. If None : full dataset is used.") 
    parser.add_argument('--data_subsamp_freq', type=int, required=True, help="A number between 0 and max_traj_len. Rate of subsampling of expert trajectories while creating the dataset of expert transitions (state-action)")
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)
    print "\n\n========== Policy network specifications loaded ===========\n\n"

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    print "\n\n========== MDP initialized ===========\n\n"

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>.
    if args.resume_training:
        if args.checkpoint is not None:
            file, policy_key = util.split_h5_name(args.checkpoint)
            policy_file = file[:-3]+'_policy.h5'
            policy.load_h5(policy_file, policy_key)

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(),))

    print "\n\n========== Policy initialized ===========\n\n"

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    print "\n\n========== Expert data loaded ===========\n\n"

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None #There is no role of the reward function or value function in behavioral cloning
        opt = imitation.BehavioralCloningOptimizer(
            mdp, policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o:o,
            ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

        print "======= Behavioral Cloning optimizer initialized ======="

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier( #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
            #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>.
            if args.resume_training:
                if args.checkpoint is not None:
                    file, reward_key = util.split_h5_name(args.checkpoint)
                    reward_file = file[:-3]+'_reward.h5'
                    print reward_file
                    reward.load_h5(reward_file, reward_key)

        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.ValueFunc( #Add resume training functionality
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1./mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')
        if args.resume_training:
            if args.checkpoint is not None:
                file, vf_key = util.split_h5_name(args.checkpoint)
                vf_file = file[:-3]+'_vf.h5'
                vf.load_h5(vf_file, vf_key)

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(
                min_num_trajs=-1, min_total_sa=args.min_total_sa,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None: reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
        if vf is not None: vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

        print "======== Observation normalization done ========"

    # Run optimizer
    print "======== Optimization begins ========"

    # Trial: make checkpoints for policy, reward and vf
    policy_log = nn.TrainingLog(args.log[:-3]+'_policy.h5', [('args', argstr)])
    reward_log = nn.TrainingLog(args.log[:-3]+'_reward.h5', [('args', argstr)])
    vf_log = nn.TrainingLog(args.log[:-3]+'_vf.h5', [('args', argstr)])
    

    for i in xrange(args.max_iter):
        
        #Optimization step
        iter_info = opt.step() 

        #Log and plot
        #pdb.set_trace()
    	policy_log.write(iter_info, 
                print_header=i % (20*args.print_freq) == 0, 
                display=i % args.print_freq == 0 ## FIXME: AS remove comment
                )
        reward_log.write(iter_info, 
                print_header=i % (20*args.print_freq) == 0, 
                display=i % args.print_freq == 0 ## FIXME: AS remove comment
                )
        vf_log.write(iter_info, 
                print_header=i % (20*args.print_freq) == 0, 
                display=i % args.print_freq == 0 ## FIXME: AS remove comment
                )
        

        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            policy_log.write_snapshot(policy, i)
            reward_log.write_snapshot(reward, i)
            vf_log.write_snapshot(vf, i)

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
            pdata_M_Doa = np.concatenate([opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0,1
            range1 = (min(exdata_N_Doa[:,idx1].min(), pdata_M_Doa[:,idx1].min()), max(exdata_N_Doa[:,idx1].max(), pdata_M_Doa[:,idx1].max()))
            range2 = (min(exdata_N_Doa[:,idx2].min(), pdata_M_Doa[:,idx2].min()), max(exdata_N_Doa[:,idx2].max(), pdata_M_Doa[:,idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:,idx1], exdata_N_Doa[:,idx2], color='blue', s=1, label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:,idx1], pdata_M_Doa[:,idx2], color='red', s=1, label='apprentice')

            ax.legend()
            plt.show()
예제 #8
0
def main():
    """Train a policy with TRPO on a Gym environment, driven by CLI options.

    Steps, in order:
      1. Parse the command line (MDP, architecture, optimizer, sampling and
         logging options).
      2. Optionally switch to ``TINY_ARCHITECTURE`` (``--tiny_policy`` or
         ``--use_tanh``) and, for ``--use_tanh``, rewrite every ``nonlin``
         layer of the spec to use ``tanh``.
      3. Build the MDP, a ``rl.GaussianPolicy`` when the action space is a
         ``policyopt.ContinuousSpace`` (``rl.GibbsPolicy`` otherwise) and a
         ``rl.ValueFunc``.
      4. Run ``rl.SamplingPolicyOptimizer`` for ``--max_iter`` iterations,
         writing per-iteration statistics to an ``nn.TrainingLog`` and a
         policy snapshot every ``--save_freq`` iterations when ``--log`` is
         given.

    Exits through ``parser.error`` (status 2) if ``--policy_hidden_spec`` is
    combined with ``--tiny_policy`` or ``--use_tanh``.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_traj_len', type=int, default=None)
    parser.add_argument('--env_name', type=str, required=True)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--enable_obsnorm', type=int, default=1)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--use_tanh', type=int, default=0)
    # Optimizer
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    # Sampling
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=100000)
    # Saving stuff
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    if args.tiny_policy or args.use_tanh:
        # Both flags replace the architecture, so an explicit spec would be
        # silently discarded. Reject it with a real error rather than an
        # assert, which disappears under `python -O`.
        if args.policy_hidden_spec != SIMPLE_ARCHITECTURE:
            parser.error(
                'policy_hidden_spec must remain unspecified if --tiny_policy '
                'or --use_tanh is set')
        args.policy_hidden_spec = TINY_ARCHITECTURE

        if args.use_tanh:
            # The spec is a JSON list of layer dicts; swap the activation of
            # every nonlinearity layer and serialize it back.
            arch = json.loads(args.policy_hidden_spec)
            for layer in arch:
                if layer['type'] == 'nonlin':
                    layer['func'] = 'tanh'
            args.policy_hidden_spec = json.dumps(arch)
        print('Modified architecture:', args.policy_hidden_spec)

    # The full argument set is stored with the training log.
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    enable_obsnorm = bool(args.enable_obsnorm)
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy')

    util.header('Policy architecture')
    policy.print_trainable_variables()

    # The value function reuses the policy's hidden-layer spec.
    vf = rl.ValueFunc(hidden_spec=args.policy_hidden_spec,
                      obsfeat_space=mdp.obs_space,
                      enable_obsnorm=enable_obsnorm,
                      enable_vnorm=True,
                      max_kl=args.vf_max_kl,
                      damping=args.vf_cg_damping,
                      time_scale=1. / mdp.env_spec.timestep_limit,
                      varscope_name='ValueFunc')

    # Fall back to the environment's own step limit when none was requested.
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print('Max traj len:', max_traj_len)
    opt = rl.SamplingPolicyOptimizer(
        mdp=mdp,
        discount=args.discount,
        lam=args.lam,
        policy=policy,
        sim_cfg=SimConfig(min_num_trajs=-1,
                          min_total_sa=args.min_total_sa,
                          batch_size=args.sim_batch_size,
                          max_traj_len=max_traj_len),
        step_func=rl.TRPO(max_kl=args.policy_max_kl,
                          damping=args.policy_cg_damping),
        value_func=vf,
        obsfeat_fn=lambda obs: obs,
    )

    log = nn.TrainingLog(args.log, [('args', argstr)])

    for i in range(args.max_iter):
        iter_info = opt.step()
        # Re-print the column header every 20 iterations.
        log.write(iter_info, print_header=i % 20 == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)
예제 #9
0
def main():
    """
    If we have trained policies and snapshots, I think we can use this to watch
    videos of our agent in action. I don't think I can use this without doing
    some training first. This doesn't do training itself; we need to provide a
    policy, but the h5 file has to also be a directory which contains other
    information (see the yaml files for what I believe are similar examples).

    I'm not sure why we have rl giving us Gaussian policies vs Gibbs policies.
    What's the difference? They should just be functions mapping from states to
    actions?

    After that, it seems like we're just simulating stuff and hopefully a video
    would appear if I can get this to run.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('output_dir', type=str)
    parser.add_argument('--deterministic', default=1, type=int)
    parser.add_argument('--max_steps', type=int, required=True)
    parser.add_argument('--env_name', type=str, default=None)
    args = parser.parse_args()

    util.mkdir_p(args.output_dir)
    assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir
    print 'Writing to', args.output_dir

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)

    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name'] if args.env_name is None else args.env_name
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    util.header('Max steps is {}'.format(args.max_steps))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Animate
    sim = mdp.new_sim()
    steps = 0
    exit = False
    while not exit:
        sim.reset()
        while not sim.done:
            a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
            sim.step(a)
            sim.draw()
            viewer = sim.env.viewer
            data, w, h = viewer.get_image()
            image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:]
            cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1])

            print steps
            steps += 1

            if steps >= args.max_steps:
                exit = True
                break
예제 #10
0
파일: imitate_mj.py 프로젝트: Santara/RAIL
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument(
        '--resume_training',
        action='store_true',
        help=
        "Resume training from a checkpoint: --policy_checkpoint. Currently only supports GAIL with nn policy, reward and vf"
    )
    parser.add_argument(
        '--checkpoint',
        type=str,
        help="Load from checkpoint if provided and if --resume_training")
    parser.add_argument(
        '--limit_trajs',
        type=int,
        required=True,
        help=
        "How many expert trajectories to be used for training. If None : full dataset is used."
    )
    parser.add_argument(
        '--data_subsamp_freq',
        type=int,
        required=True,
        help=
        "A number between 0 and max_traj_len. Rate of subsampling of expert trajectories while creating the dataset of expert transitions (state-action)"
    )
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode',
                        choices=OBSNORM_MODES,
                        default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)

    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping',
                        type=float,
                        default=.1,
                        help="TRPO parameter")
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl',
                        type=float,
                        default=.01,
                        help="TRPO parameter")
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)
    # CVaR parameters
    parser.add_argument('--useCVaR', action='store_true')
    parser.add_argument('--CVaR_alpha', type=float, default=0.9)
    parser.add_argument('--CVaR_beta', type=float, default=0.)
    parser.add_argument('--CVaR_lr', type=float, default=0.01)
    # !!! The following argument --disc_CVaR_weight is not of use and should be removed
    parser.add_argument(
        '--disc_CVaR_weight',
        type=float,
        default=1.,
        help=
        "Weight given to CVaR loss for the discriminator. Added by Anirban for smooth convergence."
    )
    parser.add_argument('--CVaR_Lambda_not_trainable', action='store_false')
    parser.add_argument('--CVaR_Lambda_val_if_not_trainable',
                        type=float,
                        default=0.5)
    #Filtering expert trajectories
    parser.add_argument('--use_expert_traj_filtering', action='store_true')
    parser.add_argument('--expert_traj_filt_percentile_threshold',
                        type=float,
                        default=20)
    # Additive state prior formulation
    parser.add_argument('--use_additiveStatePrior', action='store_true')
    parser.add_argument('--additiveStatePrior_weight', type=float, default=1.)
    parser.add_argument('--n_gmm_components', type=int, default=5)
    parser.add_argument('--cov_type_gmm', type=str, default='diag')
    parser.add_argument('--familiarity_alpha', type=float, default=10000000)
    parser.add_argument('--familiarity_beta', type=float, default=100)

    parser.add_argument('--kickThreshold_percentile',
                        type=float,
                        default=100.0)
    parser.add_argument('--appendFlag', action='store_true')

    args = parser.parse_args()

    if args.useCVaR:
        print ">>>>>>>>>>>>>>>>>>> TRAINING RAIL <<<<<<<<<<<<<<<<<<<"
    elif args.use_additiveStatePrior:
        print ">>>>>>>>>>>>>>>>>>> USING ADDITIVE STATE PRIOR <<<<<<<<<<<<<<<<<<<"
    else:
        print ">>>>>>>>> TRAINING GAIL <<<<<<<<<<"

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)
    print "\n\n========== Policy network specifications loaded ===========\n\n"

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    print "\n\n========== MDP initialized ===========\n\n"

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy', args.useCVaR)
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy', args.useCVaR)

    offset = 0
    #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>.
    if args.resume_training:
        if args.checkpoint is not None:
            file, policy_key = util.split_h5_name(args.checkpoint)
            offset = int(policy_key.split('/')[-1][4:])
            print '\n**************************************************'
            print 'Resuming from checkpoint : %d of %s' % (offset, file)
            print '**************************************************\n'

            if args.appendFlag and file != args.log:
                raise RuntimeError(
                    'Log file and checkpoint should have the same name if appendFlag is on. %s vs %s'
                    % file, args.log)

            policy_file = file[:-3] + '_policy.h5'  # Because we're naming the file as *_policy.h5 itself
            policy.load_h5(policy_file, policy_key)

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))

    print "\n\n========== Policy initialized ===========\n\n"

    # Load expert data

    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data,
        args.limit_trajs,
        args.data_subsamp_freq,
        len_filtering=args.use_expert_traj_filtering,
        len_filter_threshold=args.expert_traj_filt_percentile_threshold)

    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    print "\n\n========== Expert data loaded ===========\n\n"

    print '\n==================== Hyperparams ===================='
    print '\texpert_traj_filt_percentile_threshold = %f' % args.expert_traj_filt_percentile_threshold
    print '\tfamiliarity_alpha = %f' % args.familiarity_alpha
    print '\tfamiliarity_beta = %f' % args.familiarity_beta
    print '\tkickThreshold_percentile = %f' % args.kickThreshold_percentile
    print '==============================================\n'

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None  #There is no role of the reward function or value function in behavioral cloning
        opt = imitation.BehavioralCloningOptimizer(
            mdp,
            policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

        print "======= Behavioral Cloning optimizer initialized ======="

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(  #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier',
                useCVaR=args.useCVaR,
                CVaR_loss_weightage=args.disc_CVaR_weight)
            #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>.
            if args.resume_training:
                if args.checkpoint is not None:
                    file, reward_key = util.split_h5_name(args.checkpoint)
                    reward_file = file[:-3] + '_reward.h5'
                    print reward_file
                    reward.load_h5(reward_file, reward_key)

        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(
            args.no_vf) else rl.ValueFunc(  #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                enable_obsnorm=args.obsnorm_mode != 'none',
                enable_vnorm=True,
                max_kl=args.vf_max_kl,
                damping=args.vf_cg_damping,
                time_scale=1. / mdp.env_spec.timestep_limit,
                varscope_name='ValueFunc')
        if args.resume_training:
            if args.checkpoint is not None:
                file, vf_key = util.split_h5_name(args.checkpoint)
                vf_file = file[:-3] + '_vf.h5'
                vf.load_h5(vf_file, vf_key)
        if args.useCVaR:
            opt = imitation.ImitationOptimizer_CVaR(
                mdp=mdp,
                discount=args.discount,
                lam=args.lam,
                policy=policy,
                sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                            min_total_sa=args.min_total_sa,
                                            batch_size=args.sim_batch_size,
                                            max_traj_len=max_traj_len),
                step_func=rl.TRPO(max_kl=args.policy_max_kl,
                                  damping=args.policy_cg_damping,
                                  useCVaR=True),
                reward_func=reward,
                value_func=vf,
                policy_obsfeat_fn=lambda obs: obs,
                reward_obsfeat_fn=lambda obs: obs,
                policy_ent_reg=args.policy_ent_reg,
                ex_obs=exobs_Bstacked_Do,
                ex_a=exa_Bstacked_Da,
                ex_t=ext_Bstacked,
                #For CVaR
                CVaR_alpha=args.CVaR_alpha,
                CVaR_beta=args.CVaR_beta,
                CVaR_lr=args.CVaR_lr,
                CVaR_Lambda_trainable=args.CVaR_Lambda_not_trainable,
                CVaR_Lambda_val_if_not_trainable=args.
                CVaR_Lambda_val_if_not_trainable,
                offset=offset + 1)
        elif args.use_additiveStatePrior:
            opt = imitation.ImitationOptimizer_additiveStatePrior(
                mdp=mdp,
                discount=args.discount,
                lam=args.lam,
                policy=policy,
                sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                            min_total_sa=args.min_total_sa,
                                            batch_size=args.sim_batch_size,
                                            max_traj_len=max_traj_len),
                step_func=rl.TRPO(max_kl=args.policy_max_kl,
                                  damping=args.policy_cg_damping,
                                  useCVaR=False),
                reward_func=reward,
                value_func=vf,
                policy_obsfeat_fn=lambda obs: obs,
                reward_obsfeat_fn=lambda obs: obs,
                policy_ent_reg=args.policy_ent_reg,
                ex_obs=exobs_Bstacked_Do,
                ex_a=exa_Bstacked_Da,
                ex_t=ext_Bstacked,
                n_gmm_components=args.n_gmm_components,
                cov_type_gmm=args.cov_type_gmm,
                additiveStatePrior_weight=args.additiveStatePrior_weight,
                alpha=args.familiarity_alpha,
                beta=args.familiarity_beta,
                kickThreshold_percentile=args.kickThreshold_percentile,
                offset=offset + 1)
        else:
            opt = imitation.ImitationOptimizer(
                mdp=mdp,
                discount=args.discount,
                lam=args.lam,
                policy=policy,
                sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                            min_total_sa=args.min_total_sa,
                                            batch_size=args.sim_batch_size,
                                            max_traj_len=max_traj_len),
                step_func=rl.TRPO(max_kl=args.policy_max_kl,
                                  damping=args.policy_cg_damping,
                                  useCVaR=False),
                reward_func=reward,
                value_func=vf,
                policy_obsfeat_fn=lambda obs: obs,
                reward_obsfeat_fn=lambda obs: obs,
                policy_ent_reg=args.policy_ent_reg,
                ex_obs=exobs_Bstacked_Do,
                ex_a=exa_Bstacked_Da,
                ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do),
                                    exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

        print "======== Observation normalization done ========"

    # Run optimizer
    print "======== Optimization begins ========"

    # Trial: make checkpoints for policy, reward and vf
    policy_log = nn.TrainingLog(args.log[:-3] + '_policy.h5',
                                [('args', argstr)], args.appendFlag)
    reward_log = nn.TrainingLog(args.log[:-3] + '_reward.h5',
                                [('args', argstr)], args.appendFlag)
    vf_log = nn.TrainingLog(args.log[:-3] + '_vf.h5', [('args', argstr)],
                            args.appendFlag)

    kickStatesData = []

    print '\n**************************************'
    print 'Running iterations from %d to %d' % (offset + 1, args.max_iter)

    for i in xrange(offset + 1, args.max_iter):
        # for i in range(1): #FIXME: this is just for studying the insides of the training algo

        # All training a.k.a. optimization happens in the next line!!! -_-
        # pdb.set_trace()
        iter_info = opt.step(
            i, kickStatesData) if args.use_additiveStatePrior else opt.step(i)

        #========= The rest is fluff =============

        #Log and plot
        #pdb.set_trace()
        policy_log.write(
            iter_info,
            print_header=i % (20 * args.print_freq) == 0,
            # display=False
            display=i % args.print_freq == 0  ## FIXME: AS remove comment
        )
        # reward_log.write(iter_info,
        #         print_header=i % (20*args.print_freq) == 0,
        #         display=False
        #         # display=i % args.print_freq == 0 ## FIXME: AS remove comment
        #         )
        # vf_log.write(iter_info,
        #         print_header=i % (20*args.print_freq) == 0,
        #         display=False
        #         # display=i % args.print_freq == 0 ## FIXME: AS remove comment
        #         )

        #FIXME: problem running this on 211 and 138. No problem on 151
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            policy_log.write_snapshot(policy, i)
            reward_log.write_snapshot(reward, i)
            vf_log.write_snapshot(vf, i)

            # analysisFile=open(args.log[:-3]+'_kickedStates' + str(i) + '.pkl', 'wb')
            analysisFile = open(args.log[:-3] + '_kickedStates.pkl', 'wb')
            pkl.dump({'kickStatesData': kickStatesData},
                     analysisFile,
                     protocol=2)
            analysisFile.close()

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da],
                                          axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked],
                axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1
            range1 = (min(exdata_N_Doa[:, idx1].min(),
                          pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(),
                          pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(),
                          pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(),
                          pdata_M_Doa[:, idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:, idx1],
                       exdata_N_Doa[:, idx2],
                       color='blue',
                       s=1,
                       label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:, idx1],
                       pdata_M_Doa[:, idx2],
                       color='red',
                       s=1,
                       label='apprentice')

            ax.legend()
            plt.show()
예제 #11
0
def main():
    """Command-line entry point: train a policy by imitating expert data.

    Parses the command line, builds the MDP named by ``--env_name`` and a
    policy for it (Gaussian when the action space is a
    ``policyopt.ContinuousSpace``, Gibbs otherwise; the ``Seq*`` variants
    when ``--seq_model`` is non-zero), loads the expert dataset and then
    runs the optimizer selected by ``--mode`` for ``--max_iter`` iterations:

    * ``bclone`` -- ``imitation.BehavioralCloningOptimizer``
    * ``ga``     -- ``imitation.ImitationOptimizer``
    * ``sga``    -- ``imitation.SequentialImitationOptimizer``

    Snapshots are written every ``--save_freq`` iterations (only when
    ``--log`` is given) and, in ``ga`` mode, reward-plot data is written
    every ``--plot_freq`` iterations.

    Raises:
        NotImplementedError: if ``--reward_type`` is not supported by the
            selected mode, or if ``--mode`` is not handled here.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode',
                        choices=OBSNORM_MODES,
                        default='expertdata')
    # Architecture of the transition classifier (used by the 'sga' mode)
    parser.add_argument('--clf_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=100)
    parser.add_argument('--log', type=str, required=False)
    # Sequential model
    parser.add_argument('--seq_model', type=int, default=0)
    parser.add_argument('--time_step', type=int, default=10)

    args = parser.parse_args()

    # Resolve the architecture specs, then serialize the final arguments so
    # they can be stored alongside the training log.
    if not args.seq_model:
        if args.tiny_policy:
            assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
            args.policy_hidden_spec = TINY_ARCHITECTURE
        argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
        print(argstr)
    else:
        if args.tiny_policy:
            # NOTE(review): the argparse default for --policy_hidden_spec is
            # SIMPLE_ARCHITECTURE, so with an unspecified spec this assertion
            # only passes if SEQ_SIMPLE_ARCHITECTURE equals it -- confirm.
            assert args.policy_hidden_spec == SEQ_SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
            args.policy_hidden_spec = SEQ_TINY_ARCHITECTURE
        # An unspecified classifier spec falls back to the sequential default.
        if args.clf_hidden_spec == SIMPLE_ARCHITECTURE:
            args.clf_hidden_spec = SEQ_SIMPLE_ARCHITECTURE
        argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)

    # Initialize the MDP
    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'
    continuous_actions = isinstance(mdp.action_space,
                                    policyopt.ContinuousSpace)

    if not args.seq_model:
        if continuous_actions:
            policy_cfg = rl.GaussianPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                min_stdev=0.,
                init_logstdev=0.,
                enable_obsnorm=enable_obsnorm)
            policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space,
                                       mdp.action_space, 'GaussianPolicy')
        else:
            policy_cfg = rl.GibbsPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                enable_obsnorm=enable_obsnorm)
            policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space,
                                    mdp.action_space, 'GibbsPolicy')
    else:
        # Sequential model: same split, but the policies also receive the
        # sequence length. Action normalization is not implemented yet.
        if continuous_actions:
            policy_cfg = rl.SeqGaussianPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                time_step=args.time_step,
                min_stdev=0.,
                init_logstdev=0.,
                enable_obsnorm=enable_obsnorm,
                enable_actnorm=False)
            policy = rl.SeqGaussianPolicy(policy_cfg, mdp.obs_space,
                                          mdp.action_space,
                                          'SeqGaussianPolicy')
        else:
            policy_cfg = rl.SeqGibbsPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                time_step=args.time_step,
                enable_obsnorm=enable_obsnorm,
                enable_actnorm=False)
            policy = rl.SeqGibbsPolicy(policy_cfg, mdp.obs_space,
                                       mdp.action_space, 'SeqGibbsPolicy')

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print('Max traj len:', max_traj_len)

    if args.mode == 'bclone':
        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp,
            policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len,
                smp_traj_len=-1),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=enable_obsnorm,
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len,
                                        smp_traj_len=-1),
            step_func=rl.TRPO(max_kl=args.policy_max_kl,
                              damping=args.policy_cg_damping,
                              sequential_model=False),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    elif args.mode == 'sga':
        # Sequential variant; only the neural-network reward is available.
        if args.reward_type == 'nn':
            reward = imitation.SequentialTransitionClassifier(
                hidden_spec=args.clf_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                time_step=args.time_step,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='SequentialTransitionClassifier')
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.SequentialValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            time_step=args.time_step,
            enable_obsnorm=enable_obsnorm,
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='SequentialValueFunc')

        opt = imitation.SequentialImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SeqSimConfig(
                min_num_trajs=-1,
                min_total_sa=args.min_total_sa,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len,
                time_step=args.time_step),
            # The sequential TRPO step is deliberately not used here.
            step_func=rl.TRPO(
                max_kl=args.policy_max_kl,
                damping=args.policy_cg_damping,
                sequential_model=False),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    else:
        # Without this, an unhandled mode would leave opt/reward/vf unbound
        # and fail further down with an unrelated NameError.
        raise NotImplementedError(args.mode)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        if not args.seq_model:
            policy.update_obsnorm(exobs_Bstacked_Do)
            if reward is not None:
                reward.update_inputnorm(
                    opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
            if vf is not None:
                vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))
        else:
            # Drop the trailing rows that do not fill a whole sequence, then
            # reshape (B*T, ...) => (B, T, ...).
            T = args.time_step
            B = exobs_Bstacked_Do.shape[0] // T
            exobs_B_T_Do = np.reshape(
                exobs_Bstacked_Do[:T * B, :],
                (B, T, exobs_Bstacked_Do.shape[1]))
            exa_B_T_Da = np.reshape(
                exa_Bstacked_Da[:T * B, :],
                (B, T, exa_Bstacked_Da.shape[1]))
            print("Debug: exobs_Bstacked_Do:", exobs_Bstacked_Do.shape[0],
                  exobs_Bstacked_Do.shape[1])
            print("Debug: exobs_B_T_Do:", exobs_B_T_Do.shape[0],
                  exobs_B_T_Do.shape[1], exobs_B_T_Do.shape[2])
            # The policy and value function are normalized with the flat
            # (non-sequential) data; only the reward gets the reshaped data.
            policy.update_obsnorm(exobs_Bstacked_Do)
            if reward is not None:
                reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_B_T_Do),
                                        exa_B_T_Da)
            if vf is not None:
                vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer
    log = nn.BasicTrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.add_log(iter_info,
                    print_header=i % (20 * args.print_freq) == 0,
                    display=i % args.print_freq == 0)

        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            print('%i/%i iters is done. Save snapshot.' % (i, args.max_iter))
            log.write_snapshot(policy, i)

        if args.mode == 'ga' and args.plot_freq != 0 and i % args.plot_freq == 0:
            print('%i/%i iters is done. Save plot.' % (i, args.max_iter))
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da],
                                          axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked],
                axis=1)
            # Convert dtype to follow the theano config
            exdata_N_Doa = exdata_N_Doa.astype(theano.config.floatX)
            pdata_M_Doa = pdata_M_Doa.astype(theano.config.floatX)

            # The figure only provides the axes handed to reward.plot; it is
            # never shown or saved here, so close it once the plot data has
            # been logged. Otherwise one figure per plotting iteration stays
            # registered with pyplot for the rest of the run.
            fig, ax = plt.subplots()
            try:
                idx1, idx2 = 0, 1
                # Plot ranges cover both the expert and the policy samples.
                range1 = (min(exdata_N_Doa[:, idx1].min(),
                              pdata_M_Doa[:, idx1].min()),
                          max(exdata_N_Doa[:, idx1].max(),
                              pdata_M_Doa[:, idx1].max()))
                range2 = (min(exdata_N_Doa[:, idx2].min(),
                              pdata_M_Doa[:, idx2].min()),
                          max(exdata_N_Doa[:, idx2].max(),
                              pdata_M_Doa[:, idx2].max()))

                x, y, z = reward.plot(ax, idx1, idx2, range1, range2, n=100)
                plot = [
                    x, y, z, exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2],
                    pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2]
                ]
                log.write_plot(plot, i)
            finally:
                plt.close(fig)

    # Write log
    print('Training is done. Save log.')
    log.write_log()
    log.close()
예제 #12
0
def main():
    """Command-line entry point: train a policy by imitating expert data.

    NOTE! Don't forget that these are effectively called directly from the
    yaml files. They call imitate_mj.py with their own arguments, so check
    there if some of the values differ from the default ones.

    Parses the command line, builds the MDP named by ``--env_name`` and a
    policy for it (Gaussian when the action space is a
    ``policyopt.ContinuousSpace``, Gibbs otherwise), loads the expert dataset
    and runs the optimizer selected by ``--mode`` (``bclone`` or ``ga``) for
    ``--max_iter`` iterations, writing snapshots every ``--save_freq``
    iterations when ``--log`` is given and showing a reward plot every
    ``--plot_freq`` iterations when a reward function exists.

    Raises:
        NotImplementedError: if ``--reward_type`` is not supported, or if
            ``--mode`` is not handled here.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode',
                        choices=OBSNORM_MODES,
                        default='expertdata')
    # Behavioral cloning optimizer (ok ... 128 and 0.7 settings are in the paper).
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    # Resolve the architecture spec, then serialize the final arguments so
    # they can be stored alongside the training log.
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    # Initialize the MDP
    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy
    print("\n\tNow initializing the policy:")
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy')

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))
    print("\tFinished initializing the policy.\n")

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    # %-formatting prints the same text whether `print` is the Python 2
    # statement or the print function.
    print('Max traj len: %s' % (max_traj_len, ))

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp,
            policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            # FYI: this is the GAIL case. Note that it doesn't take in any of
            # the raw expert data, unlike the other reward types. And we call
            # them `reward types` since the optimizer can use their output in
            # some way to improve itself.
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
        elif args.reward_type in ['l2ball', 'simplex']:
            # FEM or game-theoretic apprenticeship learning, respectively.
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        # All three of these 'advanced' IL algorithms use neural network value
        # functions to reduce variance for policy gradient estimates.
        print("\n\tThe **VALUE** function (may have action concatenated):")
        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl,
                              damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    else:
        # Without this, an unhandled mode would leave opt/reward/vf unbound
        # and fail further down with an unrelated NameError.
        raise NotImplementedError(args.mode)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do),
                                    exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer, i.e. {BehavioralCloning,Imitation}Optimizer.
    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info,
                  print_header=i % (20 * args.print_freq) == 0,
                  display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)

        # Plotting needs a reward function. In 'bclone' mode `reward` is
        # None, so a non-zero --plot_freq would otherwise crash on
        # `reward.plot`.
        if reward is not None and args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da],
                                          axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked],
                axis=1)

            # Plot reward. matplotlib is imported lazily so that runs which
            # never plot do not need it.
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1
            # Plot ranges cover both the expert and the policy samples.
            range1 = (min(exdata_N_Doa[:, idx1].min(),
                          pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(),
                          pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(),
                          pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(),
                          pdata_M_Doa[:, idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:, idx1],
                       exdata_N_Doa[:, idx2],
                       color='blue',
                       s=1,
                       label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:, idx1],
                       pdata_M_Doa[:, idx2],
                       color='red',
                       s=1,
                       label='apprentice')

            ax.legend()
            plt.show()
예제 #13
0
파일: evaluation.py 프로젝트: xairc/gmmil
def main():
    """Evaluate a saved policy snapshot over 50 rollouts and record the returns.

    Command line: a positional ``policy`` path plus ``--max_traj_len`` and
    ``--deterministic``. ``--eval_only``, ``--out`` and ``--count`` are
    accepted by the parser but are not read anywhere in this function.

    The snapshot that gets loaded is ``<policy>/snapshots/iter<key_iter>``,
    where ``key_iter`` is chosen from the policy path: 200 if it contains
    'reacher', 1500 if it contains 'humanoid', 500 otherwise.

    Two CSV files are written next to the policy file; their names are the
    policy path with its last three characters removed (presumably a '.h5'
    extension -- confirm against callers):

    * ``<stem>full.csv`` -- one row per rollout:
      ``[traj index, n, total reward, steps taken]``.
    * ``<stem>.csv`` -- a single row: ``[policy path, mean return, std]``.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    # Upper bound on rollout length; falls back to the env's timestep limit.
    parser.add_argument('--max_traj_len', type=int, default=None)
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    # Output paths are derived by dropping the last three characters of the
    # policy path.
    summary_path = args.policy[:-3] + '.csv'
    detail_path = args.policy[:-3] + 'full.csv'

    # Pick which snapshot iteration to load. Use `in` rather than
    # `find(...) > 0`: find() returns 0 for a match at the very start of the
    # string, which the old comparison silently treated as "not found".
    if 'reacher' in args.policy:
        key_iter = 200
    elif 'humanoid' in args.policy:
        key_iter = 1500
    else:
        key_iter = 500

    policy_file, policy_key = util.split_h5_name(
        args.policy + '/snapshots/iter%07d' % key_iter)
    print('Loading policy parameters from %s in %s' % (policy_key, policy_file))
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # The shared-std-network flag is inferred from the policy path.
    sharednet = 'shared1' in args.policy

    # Initialize the MDP
    env_name = train_args['env_name']
    print('Loading environment %s' % (env_name,))
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters. Training args may carry
    # either an explicit 'enable_obsnorm' flag or an 'obsnorm_mode' string.
    if 'enable_obsnorm' in train_args:
        enable_obsnorm = bool(train_args['enable_obsnorm'])
    else:
        enable_obsnorm = train_args['obsnorm_mode'] != 'none'

    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg,
                                   mdp.obs_space,
                                   mdp.action_space,
                                   'GaussianPolicy',
                                   use_shared_std_network=sharednet)
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg,
                                mdp.obs_space,
                                mdp.action_space,
                                'GibbsPolicy',
                                use_shared_std_network=sharednet)
    policy.load_h5(policy_file, policy_key)

    n = 50
    print('Evaluating based on {} trajs'.format(n))

    returns = []
    sim = mdp.new_sim()
    deterministic = bool(args.deterministic)

    # The result files are opened only once the policy has loaded, so a failed
    # load does not truncate results from an earlier run, and the `with`
    # guarantees both handles are closed even if a rollout raises.
    with open(summary_path, 'w') as csvf, \
            open(detail_path, 'w') as dataf:
        csvwriter = csv.writer(csvf)
        datawriter = csv.writer(dataf)

        for i_traj in xrange(n):
            sim.reset()
            totalr = 0.
            iteration = 0
            while not sim.done and iteration < args.max_traj_len:
                # sample_actions takes a batch; feed a batch of one
                # observation and pull the single action row back out.
                a = policy.sample_actions(sim.obs[None, :],
                                          deterministic)[0][0, :]
                totalr += sim.step(a)
                iteration += 1

            print('%s %s %s %s' % (i_traj, n, totalr, iteration))
            datawriter.writerow([i_traj, n, totalr, iteration])
            returns.append(totalr)

        returns_arr = np.array(returns)
        avg, std = returns_arr.mean(), returns_arr.std()
        print('Avg Return:  %s Std:  %s' % (avg, std))
        csvwriter.writerow([args.policy, avg, std])
예제 #14
0
파일: imitate_mj.py 프로젝트: xairc/gmmil
def _plot_reward_and_samples(reward, exdata_N_Doa, pdata_M_Doa, idx1=0, idx2=1):
    """Plot the reward over two input dimensions with expert/policy samples on top.

    `exdata_N_Doa` and `pdata_M_Doa` are 2-D arrays of stacked
    observation-action rows for the expert and the current policy; `idx1` and
    `idx2` select the two columns used as plot axes. The plotted range on each
    axis spans both data sets. Blocks on `plt.show()`.
    """
    # Imported lazily so matplotlib is only required when plotting is enabled.
    import matplotlib.pyplot as plt

    _, ax = plt.subplots()
    ex1, ex2 = exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2]
    p1, p2 = pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2]
    range1 = (min(ex1.min(), p1.min()), max(ex1.max(), p1.max()))
    range2 = (min(ex2.min(), p2.min()), max(ex2.max(), p2.max()))

    # Plot reward
    reward.plot(ax, idx1, idx2, range1, range2, n=100)
    # Plot expert data
    ax.scatter(ex1, ex2, color='blue', s=1, label='expert')
    # Plot policy samples
    ax.scatter(p1, p2, color='red', s=1, label='apprentice')

    ax.legend()
    plt.show()


def main():
    """Train a policy from expert trajectories.

    ``--mode`` selects the optimizer:

    * ``bclone`` -- ``imitation.BehavioralCloningOptimizer``; no reward or
      value function is built, and print/save frequency are both overridden
      by ``--bclone_eval_freq``.
    * ``ga``     -- ``imitation.ImitationOptimizer`` with a
      ``TransitionClassifier`` (``--reward_type nn``) or ``LinearReward``
      (``l2ball`` / ``simplex``) reward.
    * ``gmmil``  -- ``imitation.ImitationOptimizer`` with a
      ``gmmil.MMDReward`` (``--reward_type mmd``).

    Any other mode, or a reward type not listed for the chosen mode, raises
    ``NotImplementedError``. Training runs for ``--max_iter`` optimizer steps,
    writing log rows each step, policy snapshots every ``--save_freq`` steps
    (only when ``--log`` is given) and, if ``--plot_freq`` is non-zero and a
    reward function exists, a reward/sample plot every ``--plot_freq`` steps.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    parser.add_argument('--seed', type=int, default=0)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode',
                        choices=OBSNORM_MODES,
                        default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    parser.add_argument('--use_shared_std_network', type=int, default=0)
    # Generative Moment matching
    parser.add_argument('--kernel_batchsize', type=int, default=1000)
    parser.add_argument('--kernel_reg_weight', type=float, default=0.)
    parser.add_argument('--use_median_heuristic', type=int, default=1)
    parser.add_argument('--use_logscale_reward', type=int)
    parser.add_argument('--reward_epsilon', type=float, default=0.0001)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)
    parser.add_argument('--save_reward', type=int, default=0)

    args = parser.parse_args()

    # --tiny_policy is a shortcut for the tiny architecture and may not be
    # combined with an explicit --policy_hidden_spec.
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    # The serialized arguments are echoed and later stored in the training log.
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    # Initialize the MDP
    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy: Gaussian for continuous actions, Gibbs otherwise.
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy',
                                   bool(args.use_shared_std_network))
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space,
                                mdp.action_space, 'GibbsPolicy',
                                bool(args.use_shared_std_network))

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq, args.seed)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print('Max traj len: %s' % (max_traj_len,))

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp,
            policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

    elif args.mode in ('ga', 'gmmil'):
        # Both modes differ only in the reward function; the value function
        # and the optimizer are built identically afterwards.
        if args.mode == 'ga':
            if args.reward_type == 'nn':
                reward = imitation.TransitionClassifier(
                    hidden_spec=args.policy_hidden_spec,
                    obsfeat_space=mdp.obs_space,
                    action_space=mdp.action_space,
                    max_kl=args.reward_max_kl,
                    adam_lr=args.reward_lr,
                    adam_steps=args.reward_steps,
                    ent_reg_weight=args.reward_ent_reg_weight,
                    enable_inputnorm=True,
                    include_time=bool(args.reward_include_time),
                    time_scale=1. / mdp.env_spec.timestep_limit,
                    favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                    varscope_name='TransitionClassifier')
            elif args.reward_type in ['l2ball', 'simplex']:
                reward = imitation.LinearReward(
                    obsfeat_space=mdp.obs_space,
                    action_space=mdp.action_space,
                    mode=args.reward_type,
                    enable_inputnorm=True,
                    favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                    include_time=bool(args.reward_include_time),
                    time_scale=1. / mdp.env_spec.timestep_limit,
                    exobs_Bex_Do=exobs_Bstacked_Do,
                    exa_Bex_Da=exa_Bstacked_Da,
                    ext_Bex=ext_Bstacked)
            else:
                raise NotImplementedError(args.reward_type)
        else:
            if args.reward_type != 'mmd':
                raise NotImplementedError(args.reward_type)

            # Explicit kernel bandwidths are supplied only when the median
            # heuristic is switched off; otherwise an empty list is passed.
            if args.use_median_heuristic == 0:
                bandwidth_params = [
                    1.0, 1.0 / 2.0, 1.0 / 5.0, 1.0 / 10.0, 1.0 / 40.0,
                    1.0 / 80.0
                ]
            else:
                bandwidth_params = []

            reward = gmmil.MMDReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked,
                kernel_bandwidth_params=bandwidth_params,
                kernel_reg_weight=args.kernel_reg_weight,
                kernel_batchsize=args.kernel_batchsize,
                use_median_heuristic=args.use_median_heuristic,
                use_logscale_reward=bool(args.use_logscale_reward),
                save_reward=bool(args.save_reward),
                epsilon=args.reward_epsilon)

        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl,
                              damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    else:
        # Without this branch an unhandled mode would surface much later as a
        # NameError on `reward` / `opt`.
        raise NotImplementedError(args.mode)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do),
                                    exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Plotting draws the reward function, so it is only possible when one
    # exists (behavioral cloning sets reward to None).
    plot_enabled = args.plot_freq != 0 and reward is not None
    if args.plot_freq != 0 and not plot_enabled:
        print('Plotting disabled: mode %s has no reward function to plot' %
              (args.mode,))

    # Run optimizer
    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info,
                  print_header=i % (20 * args.print_freq) == 0,
                  display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)

        if plot_enabled and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da],
                                          axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked],
                axis=1)
            _plot_reward_and_samples(reward, exdata_N_Doa, pdata_M_Doa)