def __init__(self, input_B_Di, input_shape, output_shape, initializer):
    """Affine (fully-connected) layer: output = input.dot(W) + b.

    Args:
        input_B_Di: symbolic input batch (B = batch, Di = input dim).
        input_shape: 1-tuple (Di,) giving the input dimension.
        output_shape: 1-tuple (Do,) giving the output dimension.
        initializer: optional ndarray of shape (Di, Do) used to initialize W.
            If None, W is drawn uniform in [-s, s] with
            s = sqrt(6 / (Di + Do)) (Glorot/Bengio 2010).
    """
    assert len(input_shape) == len(output_shape) == 1
    util.header('Affine(in=%d, out=%d)' % (input_shape[0], output_shape[0]))
    self._output_shape = (output_shape[0], )
    with variable_scope(type(self).__name__) as self.__varscope:
        if initializer is None:
            # Alternative He-style init, kept for reference:
            # initializer = np.random.randn(input_shape[0], output_shape[0]) * np.sqrt(2./input_shape[0])
            # Glorot/Bengio 2010 uniform initialization
            s = np.sqrt(6. / (input_shape[0] + output_shape[0]))
            initializer = np.random.uniform(low=-s, high=s, size=(input_shape[0], output_shape[0]))
        else:
            assert initializer.shape == (input_shape[0], output_shape[0])
        self.W_Di_Do = get_variable('W', initializer.astype(theano.config.floatX))
        # Bias is stored (1, Do) and marked broadcastable over the batch axis
        # so it adds elementwise to every row of the (B, Do) activation.
        self.b_1_Do = get_variable('b', np.zeros((1, output_shape[0]), dtype=theano.config.floatX), broadcastable=(True, False))
        self._output_B_Do = input_B_Di.dot(self.W_Di_Do) + self.b_1_Do
def find_deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories(expert_trajectories, learner_policy, limit_trajs, data_subsamp_freq, ipython_after_eval):
    """Measure how far the learner's actions deviate from the expert's actions
    on the expert's own observations, and save a histogram of the deviations.

    Loads the learner policy from an h5 snapshot, rebuilds the MDP/policy from
    the training args stored in the file, samples learner actions for every
    expert observation, and plots the per-observation L2 action deviation.

    Args:
        expert_trajectories: path to the expert trajectory dataset.
        learner_policy: 'file.h5/key' string locating the learner snapshot.
        limit_trajs: max number of expert trajectories to load.
        data_subsamp_freq: subsampling frequency for the expert data.
        ipython_after_eval: if truthy, drop into an IPython shell at the end.
    """
    # Load the learner's policy
    policy_file, policy_key = util.split_h5_name(learner_policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        # The training script stored its argparse args as a JSON attribute.
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))
    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))
    # Initialize the policy and load its parameters.
    # Older snapshots store a boolean 'enable_obsnorm'; newer ones store a
    # string 'obsnorm_mode' — support both.
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)
    # Load the expert trajectories (stacked over trajectories: B rows)
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = imitate_mj.load_dataset(
        expert_trajectories, limit_trajs, data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1
    # Generate the actions according to the learner's policy for the expert's observations
    learner_actions_Bstacked_Da = policy.sample_actions(exobs_Bstacked_Do)[0]
    # Per-observation L2 deviation between expert and learner actions
    action_deviations = np.linalg.norm(exa_Bstacked_Da - learner_actions_Bstacked_Da, axis=1)
    # Plot the histogram
    # sns.kdeplot(action_deviations,shade=True)  # FIXME: alternative KDE plot, currently disabled
    plt.figure()
    plt.hist(action_deviations, bins=100)
    plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories.png')
    plt.show()
    if ipython_after_eval:
        import IPython; IPython.embed()
def main(): np.set_printoptions(suppress=True, precision=5, linewidth=1000) parser = argparse.ArgumentParser() parser.add_argument('env', type=str) parser.add_argument('--num_eval_trajs', type=int, default=50) parser.add_argument('--max_traj_len', type=int, default=None) parser.add_argument('--out', type=str, default=None) args = parser.parse_args() # Initialize the mdp mdp = rlgymenv.RLGymMDP(args.env) env = gym.make(args.env) print "Initialized environment %s" % args.env util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)) if args.max_traj_len is None: args.max_traj_len = mdp.env_spec.timestep_limit util.header('Max traj len is {}'.format(args.max_traj_len)) # Run the simulation returns = [] lengths = [] sim = mdp.new_sim() for i_traj in range(args.num_eval_trajs): print i_traj, args.num_eval_trajs sim.reset() totalr = 0. l = 0 while not sim.done and l < args.max_traj_len: #a = [np.random.uniform(mdp.action_space.low[i], mdp.action_space.high[i]) for i in range(len(mdp.action_space.shape[0]))] a = env.action_space.sample() if isinstance(mdp.action_space, policyopt.FiniteSpace): a = np.asarray([a]) r = sim.step(a) totalr += r l += 1 returns.append(totalr) lengths.append(l) print "Mean reward: {}, Std reward: {}, Mean length: {}, Std length: {}\n".format( np.asarray(returns).mean(), np.asarray(returns).std(), np.asarray(lengths).mean(), np.asarray(lengths).std()) if args.out is not None: with open(args.out, 'w') as f: f.write( "Mean reward: {}, Std reward: {}, Mean length: {}, Std length: {}\n" .format( np.asarray(returns).mean(), np.asarray(returns).std(), np.asarray(lengths).mean(), np.asarray(lengths).std())) f.close()
def __init__(self, input_B_Di, output_shape, func):
    """Elementwise nonlinearity layer.

    Args:
        input_B_Di: symbolic input batch.
        output_shape: shape of the output (unchanged by the nonlinearity).
        func: one of 'relu', 'lrelu', 'elu', 'tanh'; an unknown name
            raises KeyError.
    """
    util.header('Nonlinearity(func=%s)' % func)
    self._output_shape = output_shape
    # Dispatch table mapping the config string to a Theano activation.
    activations = {
        'relu': tensor.nnet.relu,
        'lrelu': lambda x: tensor.nnet.relu(x, .01),
        'elu': tensor.nnet.elu,
        'tanh': tensor.tanh,
    }
    with variable_scope(type(self).__name__) as self.__varscope:
        apply_fn = activations[func]
        self._output_B_Do = apply_fn(input_B_Di)
def eval_snapshot(env_name, checkptfile, snapshot_idx, num_trajs, deterministic):
    """Execute one saved policy snapshot and report per-trajectory stats.

    Returns a pair (returns, lengths): one entry per sampled trajectory.
    """
    snapshot_path = '{}/snapshots/iter{:07d}'.format(checkptfile, snapshot_idx)
    rollouts, _, _ = exec_saved_policy(
        env_name, snapshot_path, num_trajs,
        deterministic=deterministic, max_traj_len=None)
    # Zero-padding the ragged reward array makes each row sum to that
    # trajectory's total return.
    ret_per_traj = rollouts.r.padded(fill=0.).sum(axis=1)
    len_per_traj = np.array([len(t) for t in rollouts])
    util.header('{} gets return {} +/- {}'.format(snapshot_path, ret_per_traj.mean(), ret_per_traj.std()))
    return ret_per_traj, len_per_traj
def eval_snapshot(env_name, checkptfile, snapshot_idx, num_trajs, deterministic):
    """Roll out the snapshot stored at iteration `snapshot_idx` of
    `checkptfile` and return (returns, lengths) arrays, one entry per
    trajectory."""
    policystr = '{}/snapshots/iter{:07d}'.format(checkptfile, snapshot_idx)
    result = exec_saved_policy(
        env_name, policystr, num_trajs, deterministic=deterministic, max_traj_len=None)
    trajbatch = result[0]
    # Sum zero-padded rewards row-wise to get each trajectory's return.
    padded_r = trajbatch.r.padded(fill=0.)
    returns = padded_r.sum(axis=1)
    lengths = np.array([len(traj) for traj in trajbatch])
    util.header('{} gets return {} +/- {}'.format(policystr, returns.mean(), returns.std()))
    return returns, lengths
def phase1_train(spec, specfilename): util.header('=== Phase 1: training ===') # Generate array job that trains all algorithms # over all tasks, for all dataset sizes (3 loops) taskname2dset = gen_taskname2outfile(spec) # Make checkpoint dir. All outputs go here checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir']) util.mkdir_p(checkptdir) # Make sure checkpoint dir is empty assert not os.listdir(checkptdir), 'Checkpoint directory {} is not empty!'.format(checkptdir) # Assemble the commands to run on the cluster cmd_templates, outputfilenames, argdicts = [], [], [] for alg in spec['training']['algorithms']: for task in spec['tasks']: for num_trajs in spec['training']['dataset_num_trajs']: assert num_trajs <= spec['training']['full_dataset_num_trajs'] for run in range(spec['training']['runs']): # A string identifier. Used in filenames for this run strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run) cmd_templates.append(alg['cmd'].replace('\n', ' ').strip()) outputfilenames.append(strid + '.txt') argdicts.append({ 'env': task['env'], 'dataset': taskname2dset[task['name']], 'num_trajs': num_trajs, 'cuts_off_on_success': int(task['cuts_off_on_success']), 'data_subsamp_freq': task['data_subsamp_freq'], 'out': os.path.join(checkptdir, strid + '.h5'), }) pbsopts = spec['options']['pbs'] runpbs( cmd_templates, outputfilenames, argdicts, jobname=pbsopts['jobname'], queue=pbsopts['queue'], nodes=1, ppn=pbsopts['ppn'], job_range=pbsopts['range'] if 'range' in pbsopts else None, qsub_script_copy=os.path.join(checkptdir, 'qsub_script.sh') ) # Copy the pipeline yaml file to the output dir too shutil.copyfile(specfilename, os.path.join(checkptdir, 'pipeline.yaml')) # Keep git commit import subprocess git_hash = subprocess.check_output('git rev-parse HEAD', shell=True).strip() with open(os.path.join(checkptdir, 'git_hash.txt'), 'w') as f: f.write(git_hash + '\n')
def eval_snapshot(env_name, checkptfile, snapshot_idx, num_trajs, deterministic):
    """
    Called during evaluation stage, prints results on screen and returns data
    which we save in a results `.h5` file.

    Returns:
        (returns, lengths): arrays with one entry per sampled trajectory.
    """
    # Snapshots live inside the checkpoint h5 file, keyed by zero-padded iteration.
    policystr = '{}/snapshots/iter{:07d}'.format(checkptfile, snapshot_idx)
    trajbatch, _, _ = exec_saved_policy(
        env_name, policystr, num_trajs, deterministic=deterministic, max_traj_len=None)
    # Zero-fill padding so each row sums to that trajectory's total return.
    returns = trajbatch.r.padded(fill=0.).sum(axis=1)
    lengths = np.array([len(traj) for traj in trajbatch])
    util.header('{} gets return {} +/- {}'.format(policystr, returns.mean(), returns.std()))
    return returns, lengths
def find_deviation_of_agent_actions_from_expert_actions_for_underperforming_trajectories(learner_trajectories, expert_policy, lower_bound_reward, ipython_after_eval, generate_plot):
    """For learner trajectories whose reward falls below `lower_bound_reward`,
    measure how far the learner's actions deviate from what the expert policy
    would have done on the same observations.

    Args:
        learner_trajectories: path/handle to the learner trajectory data.
        expert_policy: 'file.h5/key' string locating the expert snapshot.
        lower_bound_reward: threshold defining "underperforming".
        ipython_after_eval: if truthy, drop into an IPython shell at the end.
        generate_plot: if truthy, save and show a histogram of deviations.
    """
    obs,a,r,l = find_underperforming_trajectories(learner_trajectories, lower_bound_reward)
    print(type(obs))
    # Load the expert's policy
    policy_file, policy_key = util.split_h5_name(expert_policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        # The training script stored its argparse args as a JSON attribute.
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))
    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))
    # Initialize the policy and load its parameters.
    # Support both the old boolean 'enable_obsnorm' and the newer 'obsnorm_mode'.
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)
    # Generate the actions according to the expert's policy for the observations
    # in the underperforming trajs. Observations are flattened to (B*T, Do) for
    # sampling, then reshaped back to (B, T, Da).
    # NOTE(review): assumes obs is (num_trajs, T, Do) and a is (num_trajs, T, Da) — confirm
    # against find_underperforming_trajectories.
    expert_actions = policy.sample_actions(obs.reshape((-1,obs.shape[-1])))[0].reshape((-1,a.shape[1],a.shape[2]))
    # Per-timestep L2 deviation between expert and learner actions
    action_deviations = np.linalg.norm(expert_actions.reshape((-1,a.shape[-1])) - a.reshape((-1,a.shape[-1])), axis=1)
    if generate_plot:
        plt.figure()
        plt.hist(action_deviations, bins=100)
        plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_underperforming_learner_trajectories.png')
        plt.show()
    if ipython_after_eval:
        import IPython; IPython.embed()
def phase1_train(spec, specfilename):
    """
    In the normal code, this rounds up a long list of commands of the form
    `python (script name) (arguments)` which can be run on a cluster. It's
    really cool how this works. The `cmd_templates` list turns into a bunch
    of python script calls, except it has string formatting to allow the
    arguments to fill them in. A much better way than writing a long bash
    script! (Actually, to *get* a bash script, just write these one by one
    to a file and then I think running the file is OK.)

    I modified this to run sequentially.
    """
    util.header('=== Phase 1: training ===')
    # Generate array job that trains (1) all algorithms over (2) all tasks, for
    # (3) all dataset sizes, so yes it's three loops.
    taskname2dset = gen_taskname2outfile(spec)
    # Make checkpoint dir. All outputs go here
    checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
    util.mkdir_p(checkptdir)
    # Make sure checkpoint dir is empty so runs never clobber earlier output
    assert not os.listdir(checkptdir), 'Checkpoint directory {} is not empty!'.format(checkptdir)
    # Assemble the commands to run on the cluster
    cmd_templates, outputfilenames, argdicts = [], [], []
    for alg in spec['training']['algorithms']:
        for task in spec['tasks']:
            for num_trajs in spec['training']['dataset_num_trajs']:
                assert num_trajs <= spec['training']['full_dataset_num_trajs']
                for run in range(spec['training']['runs']):
                    # A string identifier. Used in filenames for this run
                    strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                    cmd_templates.append(alg['cmd'].replace('\n', ' ').strip())
                    outputfilenames.append(strid + '.txt')
                    argdicts.append({
                        'env': task['env'],
                        'dataset': taskname2dset[task['name']],
                        'num_trajs': num_trajs,
                        'cuts_off_on_success': int(task['cuts_off_on_success']),
                        'data_subsamp_freq': task['data_subsamp_freq'],
                        'out': os.path.join(checkptdir, strid + '.h5'),
                    })
    # (New code from Daniel) Put commands in a list and run them sequentially.
    all_commands = [x.format(**y) for (x,y) in zip(cmd_templates,argdicts)]
    print("Total number of commands to run: {}.".format(len(all_commands)))
    for command in all_commands:
        # NOTE(review): split(" ") breaks if any argument contains spaces or
        # shell quoting — fine for these generated commands, but verify.
        subprocess.call(command.split(" "))
def __init__(self, input_B_Di, input_shape, layerspec_json):
    '''Build a feedforward net by chaining layers described in a JSON spec.

    Args:
        input_B_Di: symbolic input batch fed to the first layer.
        input_shape: shape of a single input (at least rank 1).
        layerspec_json (string): JSON string describing layers; each entry is
            a dict with a 'type' of 'reshape', 'fc', or 'nonlin' plus
            type-specific keys.
    '''
    assert len(input_shape) >= 1
    self.input_B_Di = input_B_Di
    layerspec = json.loads(layerspec_json)
    util.header('Loading feedforward net specification')
    print(json.dumps(layerspec, indent=2, separators=(',', ': ')))
    self.layers = []
    with variable_scope(type(self).__name__) as self.__varscope:
        # Thread each layer's output into the next layer's input.
        prev_output, prev_output_shape = input_B_Di, input_shape
        for i_layer, ls in enumerate(layerspec):
            # Each layer gets its own sub-scope so variable names don't collide.
            with variable_scope('layer_%d' % i_layer):
                if ls['type'] == 'reshape':
                    _check_keys(ls, ['type', 'new_shape'], [])
                    self.layers.append(
                        ReshapeLayer(prev_output, ls['new_shape']))
                elif ls['type'] == 'fc':
                    # 'initializer' is the only optional key for fc layers.
                    _check_keys(ls, ['type', 'n'], ['initializer'])
                    self.layers.append(
                        AffineLayer(prev_output, prev_output_shape,
                                    output_shape=(ls['n'], ),
                                    initializer=_parse_initializer(ls)))
                elif ls['type'] == 'nonlin':
                    _check_keys(ls, ['type', 'func'], [])
                    self.layers.append(
                        NonlinearityLayer(prev_output, prev_output_shape, ls['func']))
                else:
                    raise NotImplementedError('Unknown layer type %s' % ls['type'])
                prev_output, prev_output_shape = self.layers[-1].output, self.layers[-1].output_shape
        # The net's output is the last layer's output.
        self._output, self._output_shape = prev_output, prev_output_shape
def phase1_train(spec, specfilename): util.header('=== Phase 1: training ===') # Generate array job that trains all algorithms # over all tasks, for all dataset sizes (3 loops) taskname2dset = gen_taskname2outfile(spec) # Make checkpoint dir. All outputs go here checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir']) util.mkdir_p(checkptdir) # Make sure checkpoint dir is empty assert not os.listdir(checkptdir), 'Checkpoint directory {} is not empty!'.format(checkptdir) # Assemble the commands to run on the cluster cmd_templates, outputfilenames, argdicts = [], [], [] for alg in spec['training']['algorithms']: for task in spec['tasks']: for num_trajs in spec['training']['dataset_num_trajs']: assert num_trajs <= spec['training']['full_dataset_num_trajs'] for run in range(spec['training']['runs']): # A string identifier. Used in filenames for this run strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run) cmd_templates.append(alg['cmd'].replace('\n', ' ').strip()) outputfilenames.append(strid + '.txt') argdicts.append({ 'env': task['env'], 'dataset': taskname2dset[task['name']], 'num_trajs': num_trajs, 'cuts_off_on_success': int(task['cuts_off_on_success']), 'data_subsamp_freq': task['data_subsamp_freq'], 'out': os.path.join(checkptdir, strid + '.h5'), }) for x, y in zip(cmd_templates, argdicts): subprocess.call (x.format(**y).split(" ")) # Copy the pipeline yaml file to the output dir too shutil.copyfile(specfilename, os.path.join(checkptdir, 'pipeline.yaml')) # Keep git commit git_hash = subprocess.check_output('git rev-parse HEAD', shell=True).strip() with open(os.path.join(checkptdir, 'git_hash.txt'), 'w') as f: f.write(git_hash + '\n')
def phase0_sampletrajs(spec, specfilename):
    """
    The first phase, sampling expert trajectories from TRPO. This *can* be
    done sequentially on one computer, no need to worry. This *will* save the
    .h5 files according to `storagedir` in the specs, so manually remove if
    needed.

    This will sample `full_dataset_num_trajs` expert trajectories. I think it
    might be better to have that value be perhaps 50, since then I can use
    those values directly when plotting the expert performance alongside the
    algorithms, to be consistent in getting 50 samples. Just note that
    sampling more than 10 trajectories (or whatever our limit is) will
    **not** change the actual dataset, i.e. if we need 10 out of 20
    trajectories, the `load_datasets` method will always load the first 10,
    and not randomly pick 10 out of the 20.
    """
    util.header('=== Phase 0: Sampling trajs from expert policies ===')
    num_trajs = spec['training']['full_dataset_num_trajs']
    util.header('Sampling {} trajectories'.format(num_trajs))
    # Make filenames and check if they're valid first (fails fast if any exist)
    taskname2outfile = gen_taskname2outfile(spec, assert_not_exists=True)
    # Sample trajs for each task
    for task in spec['tasks']:
        # Execute the expert policy referenced by the task spec
        trajbatch, policy, _ = exec_saved_policy(
            task['env'], task['policy'], num_trajs,
            deterministic=spec['training']['deterministic_expert'],
            max_traj_len=None)
        # Quick evaluation: per-trajectory return, mean per-step reward,
        # trajectory lengths, and mean action-distribution entropy.
        returns = trajbatch.r.padded(fill=0.).sum(axis=1)
        avgr = trajbatch.r.stacked.mean()
        lengths = np.array([len(traj) for traj in trajbatch])
        ent = policy._compute_actiondist_entropy(trajbatch.adist.stacked).mean()
        print 'returns.shape: {}'.format(returns.shape)
        print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
        print 'avgr: {}'.format(avgr)
        print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
        print 'ent: {}'.format(ent)
        # Save the trajs to a file. Pad in case uneven lengths, but typically
        # the experts last the full duration so the lengths will be equivalent.
        with h5py.File(taskname2outfile[task['name']], 'w') as f:
            def write(dsetname, a):
                # Max gzip compression keeps the trajectory files small.
                f.create_dataset(dsetname, data=a, compression='gzip', compression_opts=9)
            # Right-padded trajectory data using custom RaggedArray class.
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32))
            # # Also save args to this script
            # argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            # f.attrs['args'] = argstr
        util.header('Wrote {}'.format(taskname2outfile[task['name']]))
def phase0_sampletrajs(spec, specfilename):
    """Phase 0: roll out each task's saved expert policy and write the
    sampled trajectories (obs, actions, rewards, lengths) to h5 files.

    Args:
        spec: parsed pipeline configuration dict (tasks, training options).
        specfilename: path of the YAML spec file (unused here).
    """
    util.header('=== Phase 0: Sampling trajs from expert policies ===')
    num_trajs = spec['training']['full_dataset_num_trajs']
    util.header('Sampling {} trajectories'.format(num_trajs))
    # Make filenames and check if they're valid first (fails fast if any exist)
    taskname2outfile = gen_taskname2outfile(spec, assert_not_exists=True)
    # Sample trajs for each task
    for task in spec['tasks']:
        # Execute the expert policy referenced by the task spec
        trajbatch, policy, _ = exec_saved_policy(
            task['env'], task['policy'], num_trajs,
            deterministic=spec['training']['deterministic_expert'],
            max_traj_len=None)
        # Quick evaluation: per-trajectory return, mean per-step reward,
        # trajectory lengths, and mean action-distribution entropy.
        returns = trajbatch.r.padded(fill=0.).sum(axis=1)
        avgr = trajbatch.r.stacked.mean()
        lengths = np.array([len(traj) for traj in trajbatch])
        ent = policy._compute_actiondist_entropy(
            trajbatch.adist.stacked).mean()
        print('ret: {} +/- {}'.format(returns.mean(), returns.std()))
        print('avgr: {}'.format(avgr))
        print('len: {} +/- {}'.format(lengths.mean(), lengths.std()))
        print('ent: {}'.format(ent))
        # Save the trajs to a file, right-padded in case of uneven lengths
        with h5py.File(taskname2outfile[task['name']], 'w') as f:
            def write(dsetname, a):
                # Max gzip compression keeps the trajectory files small.
                f.create_dataset(dsetname, data=a, compression='gzip', compression_opts=9)
            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32))
            # # Also save args to this script
            # argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            # f.attrs['args'] = argstr
        util.header('Wrote {}'.format(taskname2outfile[task['name']]))
def phase0_sampletrajs(spec, specfilename): util.header('=== Phase 0: Sampling trajs from expert policies ===') num_trajs = spec['training']['full_dataset_num_trajs'] util.header('Sampling {} trajectories'.format(num_trajs)) # Make filenames and check if they're valid first taskname2outfile = gen_taskname2outfile(spec, assert_not_exists=True) # Sample trajs for each task for task in spec['tasks']: # Execute the policy trajbatch, policy, _ = exec_saved_policy( task['env'], task['policy'], num_trajs, deterministic=spec['training']['deterministic_expert'], max_traj_len=None) # Quick evaluation returns = trajbatch.r.padded(fill=0.).sum(axis=1) avgr = trajbatch.r.stacked.mean() lengths = np.array([len(traj) for traj in trajbatch]) ent = policy._compute_actiondist_entropy(trajbatch.adist.stacked).mean() print 'ret: {} +/- {}'.format(returns.mean(), returns.std()) print 'avgr: {}'.format(avgr) print 'len: {} +/- {}'.format(lengths.mean(), lengths.std()) print 'ent: {}'.format(ent) # Save the trajs to a file with h5py.File(taskname2outfile[task['name']], 'w') as f: def write(dsetname, a): f.create_dataset(dsetname, data=a, compression='gzip', compression_opts=9) # Right-padded trajectory data write('obs_B_T_Do', trajbatch.obs.padded(fill=0.)) write('a_B_T_Da', trajbatch.a.padded(fill=0.)) write('r_B_T', trajbatch.r.padded(fill=0.)) # Trajectory lengths write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32)) # # Also save args to this script # argstr = json.dumps(vars(args), separators=(',', ':'), indent=2) # f.attrs['args'] = argstr util.header('Wrote {}'.format(taskname2outfile[task['name']]))
def phase1_train(spec, specfilename):
    """Phase 1: assemble one training command per (algorithm, task, dataset
    size, run) combination — optionally prefixed with Theano GPU flags — and
    run them via `runcmds`, then archive the pipeline spec.

    Args:
        spec: parsed pipeline configuration dict (tasks, algorithms, options).
        specfilename: path of the YAML spec file, copied for provenance.
    """
    util.header('=== Phase 1: training ===')
    # Generate array job that trains all algorithms
    # over all tasks, for all dataset sizes (3 loops)
    taskname2dset = gen_taskname2outfile(spec)
    # Theano GPU command prefix, prepended when spec requests GPU training
    gpu_cmd_prefix = 'THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=gpu'
    # Make checkpoint dir. All outputs go here
    checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
    util.mkdir_p(checkptdir)
    # Make sure checkpoint dir is empty so runs never clobber earlier output
    assert not os.listdir(
        checkptdir), 'Checkpoint directory {} is not empty!'.format(checkptdir)
    # Assemble the commands to run on the cluster
    cmd_templates, outputfilenames, argdicts = [], [], []
    for alg in spec['training']['algorithms']:
        for task in spec['tasks']:
            for num_trajs in spec['training']['dataset_num_trajs']:
                assert num_trajs <= spec['training']['full_dataset_num_trajs']
                for run in range(spec['training']['runs']):
                    # A string identifier. Used in filenames for this run
                    strid = 'alg={},task={},num_trajs={},run={}'.format(
                        alg['name'], task['name'], num_trajs, run)
                    # check if use gpu
                    if spec['training']['use_gpu']:
                        cmd_templates.append(
                            gpu_cmd_prefix + ' ' + alg['cmd'].replace('\n', ' ').strip())
                    else:
                        cmd_templates.append(alg['cmd'].replace('\n', ' ').strip())
                    outputfilenames.append(strid + '.txt')
                    argdicts.append({
                        'env': task['env'],
                        'dataset': taskname2dset[task['name']],
                        'num_trajs': num_trajs,
                        'cuts_off_on_success': int(task['cuts_off_on_success']),
                        'data_subsamp_freq': task['data_subsamp_freq'],
                        'out': os.path.join(checkptdir, strid + '.h5'),
                    })
    pbsopts = spec['options']['pbs']
    # Cluster submission path, disabled in favor of local runcmds below:
    # runpbs(
    #     cmd_templates, outputfilenames, argdicts,
    #     jobname=pbsopts['jobname'], queue=pbsopts['queue'], nodes=1, ppn=pbsopts['ppn'],
    #     job_range=pbsopts['range'] if 'range' in pbsopts else None,
    #     qsub_script_copy=os.path.join(checkptdir, 'qsub_script.sh')
    # )
    # Run locally; logs go into a timestamped directory per job name.
    runcmds(cmd_templates, outputfilenames, argdicts,
            jobname=pbsopts['jobname'],
            outputfile_dir=os.path.join(
                checkptdir, 'logs_%s_%s' % (pbsopts['jobname'], datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S'))))
    # Copy the pipeline yaml file to the output dir too
    shutil.copyfile(specfilename, os.path.join(checkptdir, 'pipeline.yaml'))
def main():
    """Train a policy (Gaussian for continuous, Gibbs for discrete action
    spaces) with TRPO plus a value-function baseline on a Gym environment,
    logging iterations and periodic snapshots to an h5 training log.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)
    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_traj_len', type=int, default=None)
    parser.add_argument('--env_name', type=str, required=True)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--enable_obsnorm', type=int, default=1)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--use_tanh', type=int, default=0)
    # Optimizer
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    # Sampling
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=100000)
    # Saving stuff
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--log', type=str, required=False)
    args = parser.parse_args()
    # NOTE(review): --use_tanh alone (without --tiny_policy) also forces
    # TINY_ARCHITECTURE here — confirm that is intended.
    if args.tiny_policy or args.use_tanh:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    if args.use_tanh:
        # Rewrite every nonlinearity in the architecture spec to tanh.
        arch = json.loads(args.policy_hidden_spec)
        for layer in arch:
            if layer['type'] == 'nonlin':
                layer['func'] = 'tanh'
        args.policy_hidden_spec = json.dumps(arch)
        print 'Modified architecture:', args.policy_hidden_spec
    # Record all args as JSON; also stored in the training log below.
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)
    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))
    # Gaussian policy for continuous action spaces, Gibbs (softmax) otherwise.
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=bool(args.enable_obsnorm))
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            enable_obsnorm=bool(args.enable_obsnorm))
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    util.header('Policy architecture')
    policy.print_trainable_variables()
    # Value function baseline shares the policy's hidden architecture.
    vf = rl.ValueFunc(
        hidden_spec=args.policy_hidden_spec,
        obsfeat_space=mdp.obs_space,
        enable_obsnorm=bool(args.enable_obsnorm),
        enable_vnorm=True,
        max_kl=args.vf_max_kl,
        damping=args.vf_cg_damping,
        time_scale=1./mdp.env_spec.timestep_limit,
        varscope_name='ValueFunc')
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len
    opt = rl.SamplingPolicyOptimizer(
        mdp=mdp,
        discount=args.discount,
        lam=args.lam,
        policy=policy,
        sim_cfg=SimConfig(
            min_num_trajs=-1, min_total_sa=args.min_total_sa,
            batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
        step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
        value_func=vf,
        obsfeat_fn=lambda obs: obs,
    )
    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        # Re-print the column header every 20 iterations for readability.
        log.write(iter_info, print_header=i % 20 == 0)
        # Snapshot the policy periodically (only if a log file was given).
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)
def main():
    """Imitation-learning entry point.

    Loads an expert dataset, builds the MDP and a policy, and trains in one
    of the --mode settings:
      - 'bclone': supervised behavioral cloning on the expert data;
      - 'ga':     adversarial imitation with a learned reward
                  (TransitionClassifier or LinearReward);
      - 'gmmil':  MMD-based imitation reward (gmmil.MMDReward).
    Periodically logs, snapshots the policy, and (optionally) plots the
    learned reward against expert vs. policy samples.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    parser.add_argument('--seed', type=int, default=0)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    parser.add_argument('--use_shared_std_network', type=int, default=0)
    # Generative Moment matching
    parser.add_argument('--kernel_batchsize', type=int, default=1000)
    parser.add_argument('--kernel_reg_weight', type=float, default=0.)
    parser.add_argument('--use_median_heuristic', type=int, default=1)
    # NOTE(review): no default here, so this is None unless passed on the
    # command line; bool(None) is False where it is consumed below.
    parser.add_argument('--use_logscale_reward', type=int)
    parser.add_argument('--reward_epsilon', type=float, default=0.0001)
    # Auto-Encoder Information
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)
    parser.add_argument('--save_reward', type=int, default=0)
    args = parser.parse_args()

    # Initialize the MDP
    # --tiny_policy overwrites the architecture spec, so the spec must still
    # be the default.
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    # Serialized args are printed and stored in the training log below.
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy: Gaussian for continuous action spaces, Gibbs
    # otherwise.
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy', bool(args.use_shared_std_network))
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec, enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy', bool(args.use_shared_std_network))

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))

    # Load expert data and sanity-check its dimensions against the MDP.
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq, args.seed)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq
        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp, policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)
    elif args.mode == 'ga':
        # Adversarial imitation: a learned reward plus TRPO policy steps.
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)
        # --no_vf disables the value-function baseline entirely.
        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')
        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)
    elif args.mode == 'gmmil':
        # Fixed bandwidth mixture when the median heuristic is disabled;
        # empty list otherwise (bandwidths chosen by the heuristic).
        if args.use_median_heuristic == 0:
            bandwidth_params = [
                1.0, 1.0 / 2.0, 1.0 / 5.0, 1.0 / 10.0, 1.0 / 40.0, 1.0 / 80.0
            ]
        else:
            bandwidth_params = []
        if args.reward_type == 'mmd':
            reward = gmmil.MMDReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked,
                kernel_bandwidth_params=bandwidth_params,
                kernel_reg_weight=args.kernel_reg_weight,
                kernel_batchsize=args.kernel_batchsize,
                use_median_heuristic=args.use_median_heuristic,
                use_logscale_reward=bool(args.use_logscale_reward),
                save_reward=bool(args.save_reward),
                epsilon=args.reward_epsilon)
        else:
            raise NotImplementedError(args.reward_type)
        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')
        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization from the expert data, if requested.
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer
    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info,
                  print_header=i % (20 * args.print_freq) == 0,
                  display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)
        if args.plot_freq != 0 and i % args.plot_freq == 0:
            # Scatter-plot the learned reward over two chosen dimensions,
            # overlaying expert data and the latest policy sample batch.
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1)
            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1  # plot the first two (obs, action) dimensions
            range1 = (min(exdata_N_Doa[:, idx1].min(), pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(), pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(), pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(), pdata_M_Doa[:, idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)
            # Plot expert data
            ax.scatter(exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2], color='blue', s=1, label='expert')
            # Plot policy samples
            ax.scatter(pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2], color='red', s=1, label='apprentice')
            ax.legend()
            plt.show()
def main():
    """Imitation-learning entry point with optional sequential (time-window) models.

    Like the plain imitation main, but when --seq_model is set it switches to
    sequence-aware components: Seq* policies, SequentialTransitionClassifier,
    SequentialValueFunc, SeqSimConfig, and the SEQ_* architecture specs.
    Modes: 'bclone' (behavioral cloning), 'ga' (adversarial imitation),
    'sga' (sequential adversarial imitation).
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata')
    # add a spec for transition classifier
    parser.add_argument('--clf_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=100)
    parser.add_argument('--log', type=str, required=False)
    # Sequential model
    parser.add_argument('--seq_model', type=int, default=0)
    parser.add_argument('--time_step', type=int, default=10)
    args = parser.parse_args()

    # Initialize the MDP
    if not args.seq_model:
        # --tiny_policy overwrites the architecture spec, so the spec must
        # still be the default.
        if args.tiny_policy:
            assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
            args.policy_hidden_spec = TINY_ARCHITECTURE
        argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
        print(argstr)
    # Sequential model: use the SEQ_* architecture specs instead.
    else:
        if args.tiny_policy:
            assert args.policy_hidden_spec == SEQ_SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
            args.policy_hidden_spec = SEQ_TINY_ARCHITECTURE
        # # change the default architecture to fit sequential model
        # if args.policy_hidden_spec == SIMPLE_ARCHITECTURE:
        #     args.policy_hidden_spec = SEQ_SIMPLE_ARCHITECTURE
        if args.clf_hidden_spec == SIMPLE_ARCHITECTURE:
            args.clf_hidden_spec = SEQ_SIMPLE_ARCHITECTURE
        argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy: Gaussian/Gibbs for the plain model, their Seq*
    # counterparts (which take a time_step) for the sequential model.
    enable_obsnorm = args.obsnorm_mode != 'none'
    if not args.seq_model:
        if isinstance(mdp.action_space, policyopt.ContinuousSpace):
            policy_cfg = rl.GaussianPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                min_stdev=0.,
                init_logstdev=0.,
                enable_obsnorm=enable_obsnorm)
            policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
        else:
            policy_cfg = rl.GibbsPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                enable_obsnorm=enable_obsnorm)
            policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    # Sequential model
    else:
        if isinstance(mdp.action_space, policyopt.ContinuousSpace):
            policy_cfg = rl.SeqGaussianPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                time_step=args.time_step,  # add time step
                min_stdev=0.,
                init_logstdev=0.,
                enable_obsnorm=enable_obsnorm,
                enable_actnorm=False)  # XXX not implement actnorm yet
            policy = rl.SeqGaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'SeqGaussianPolicy')
        else:
            policy_cfg = rl.SeqGibbsPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                time_step=args.time_step,  # add time step
                enable_obsnorm=enable_obsnorm,
                enable_actnorm=False)  # XXX not implement actnorm yet
            policy = rl.SeqGibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'SeqGibbsPolicy')

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))

    # Load expert data and sanity-check its dimensions against the MDP.
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1
    # print 'Debug: exobs_Bstacked_Do dtype:', exobs_Bstacked_Do.dtype
    # print 'Debug: exa_Bstacked_Da dtype:', exa_Bstacked_Da.dtype
    # print 'Debug: ext_Bstacked dtype:', ext_Bstacked.dtype
    # assert 1 == 0

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print('Max traj len:', max_traj_len)

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        # args.print_freq = args.bclone_eval_freq
        # args.save_freq = args.bclone_eval_freq
        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp, policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len,
                smp_traj_len=-1),
            eval_freq=args.bclone_eval_freq,  # XXX set a value when using bclone
            train_frac=args.bclone_train_frac)
    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)
        # --no_vf disables the value-function baseline entirely.
        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')
        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len,
                                        smp_traj_len=-1),
            step_func=rl.TRPO(max_kl=args.policy_max_kl,
                              damping=args.policy_cg_damping,
                              sequential_model=False),  # non-sequential TRPO step
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)
    # Sequential adversarial imitation
    elif args.mode == 'sga':
        if args.reward_type == 'nn':
            reward = imitation.SequentialTransitionClassifier(
                hidden_spec=args.clf_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                time_step=args.time_step,  # add time step
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='SequentialTransitionClassifier')
        # elif args.reward_type in ['l2ball', 'simplex']:
        #     reward = imitation.LinearReward(
        #         obsfeat_space=mdp.obs_space,
        #         action_space=mdp.action_space,
        #         mode=args.reward_type,
        #         enable_inputnorm=True,
        #         favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
        #         include_time=bool(args.reward_include_time),
        #         time_scale=1./mdp.env_spec.timestep_limit,
        #         exobs_Bex_Do=exobs_Bstacked_Do,
        #         exa_Bex_Da=exa_Bstacked_Da,
        #         ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)
        vf = None if bool(args.no_vf) else rl.SequentialValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            time_step=args.time_step,  # add time step
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='SequentialValueFunc')
        opt = imitation.SequentialImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SeqSimConfig(
                min_num_trajs=-1,
                min_total_sa=args.min_total_sa,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len,
                time_step=args.time_step),  # add time step
            step_func=rl.TRPO(
                max_kl=args.policy_max_kl,
                damping=args.policy_cg_damping,
                sequential_model=False),  # XXX not use sequential trpo
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization from expert data, if requested.
    if args.obsnorm_mode == 'expertdata':
        if not args.seq_model:
            policy.update_obsnorm(exobs_Bstacked_Do)
            if reward is not None:
                reward.update_inputnorm(
                    opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
            if vf is not None:
                vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))
        # Sequential model: the reward consumes (B, T, ...) windows, so
        # truncate the stacked expert data to a multiple of T and reshape.
        else:
            Bstacked, Do, T = exobs_Bstacked_Do.shape[
                0], exobs_Bstacked_Do.shape[1], args.time_step
            exobs_BT_Do = exobs_Bstacked_Do[:T * (Bstacked // T), :]
            exa_BT_Da = exa_Bstacked_Da[:T * (Bstacked // T), :]
            # reshape:(B*T, ...) => (B, T, ...)
            exobs_B_T_Do = np.reshape(
                exobs_BT_Do, (Bstacked // T, T, exobs_Bstacked_Do.shape[1]))
            exa_B_T_Da = np.reshape(
                exa_BT_Da, (Bstacked // T, T, exa_Bstacked_Da.shape[1]))
            print("Debug: exobs_Bstacked_Do:", exobs_Bstacked_Do.shape[0],
                  exobs_Bstacked_Do.shape[1])
            print("Debug: exobs_B_T_Do:", exobs_B_T_Do.shape[0],
                  exobs_B_T_Do.shape[1], exobs_B_T_Do.shape[2])
            # XXX use original policy (not sequential)
            policy.update_obsnorm(exobs_Bstacked_Do)
            if reward is not None:
                reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_B_T_Do), exa_B_T_Da)
            if vf is not None:
                vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer
    # log = nn.TrainingLog(args.log, [('args', argstr)])
    log = nn.BasicTrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        # log.write(iter_info, print_header=i % (20*args.print_freq) == 0, display=i % args.print_freq == 0)
        log.add_log(iter_info,
                    print_header=i % (20 * args.print_freq) == 0,
                    display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            print('%i/%i iters is done. Save snapshot.' % (i, args.max_iter))
            log.write_snapshot(policy, i)
        # Reward plotting is only wired up for the non-sequential 'ga' mode.
        if args.mode == 'ga' and args.plot_freq != 0 and i % args.plot_freq == 0:
            print('%i/%i iters is done. Save plot.' % (i, args.max_iter))
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1)
            # convert dtype to follow theano config
            exdata_N_Doa = exdata_N_Doa.astype(theano.config.floatX)
            pdata_M_Doa = pdata_M_Doa.astype(theano.config.floatX)
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1  # plot the first two (obs, action) dimensions
            range1 = (min(exdata_N_Doa[:, idx1].min(), pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(), pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(), pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(), pdata_M_Doa[:, idx2].max()))
            x, y, z = reward.plot(ax, idx1, idx2, range1, range2, n=100)
            # Store the reward surface plus the expert/policy scatter data in
            # the log rather than displaying interactively.
            plot = [
                x, y, z, exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2],
                pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2]
            ]
            log.write_plot(plot, i)
            # (A large commented-out variant of this plotting block for the
            # 'sga' mode — reshaping (B, T, ...) sample batches before
            # plotting — was elided here; it was never enabled.)

    # write log
    print('Training is done. Save log.')
    log.write_log()
    log.close()
def __init__(self, input_, new_shape):
    """Reshape layer: flatten the leading (batch) axis and reshape the rest.

    input_: symbolic input tensor (must support .reshape).
    new_shape: iterable giving the trailing output dimensions; stored as
        a tuple in self._output_shape.
    """
    shape = tuple(new_shape)
    self._output_shape = shape
    util.header('Reshape(new_shape=%s)' % (str(shape), ))
    # Build the reshape op inside this layer's own variable scope.
    with variable_scope(type(self).__name__) as self.__varscope:
        target = (-1, ) + shape
        self._output = input_.reshape(target)
def main():
    """Train an imitation policy (behavioral cloning or GAIL-style 'ga' mode).

    NOTE! Don't forget that these are effectively called directly from the
    yaml files. They call imitate_mj.py with their own arguments, so check
    there if some of the values differ from the default ones.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata')
    # Behavioral cloning optimizer (ok ... 128 and 0.7 settings are in the paper).
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    # Initialize the MDP
    if args.tiny_policy:
        # --tiny_policy only makes sense if the user did not also override the spec.
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, \
            'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (
        mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy: Gaussian for continuous action spaces, Gibbs
    # (categorical) otherwise.
    print("\n\tNow initializing the policy:")
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))
    print("\tFinished initializing the policy.\n")

    # Load expert data (stacked (state, action, time) transitions).
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        # No reward or value function is learned for behavioral cloning.
        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp, policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            # FYI: this is the GAIL case. Note that it doesn't take in any of
            # the raw expert data, unlike the other reward types. And we call
            # them `reward types` since the optimizer can use their output in
            # some way to improve itself.
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
        elif args.reward_type in ['l2ball', 'simplex']:
            # FEM or game-theoretic apprenticeship learning, respectively.
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        # All three of these 'advanced' IL algorithms use neural network value
        # functions to reduce variance for policy gradient estimates.
        print("\n\tThe **VALUE** function (may have action concatenated):")
        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization from the expert data before training starts.
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer, i.e. {BehavioralCloning,Imitation}Optimizer.
    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info,
                  print_header=i % (20 * args.print_freq) == 0,
                  display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            # Stack (obs, action) pairs for expert (N rows) and policy samples (M rows).
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1)

            # Plot reward over the first two (obs, action) dimensions.
            # NOTE(review): reward is None in bclone mode — plotting presumably
            # is only used with --mode ga; confirm before enabling --plot_freq.
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1
            range1 = (min(exdata_N_Doa[:, idx1].min(), pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(), pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(), pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(), pdata_M_Doa[:, idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2], color='blue', s=1, label='expert')
            # Plot policy samples
            ax.scatter(pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2], color='red', s=1, label='apprentice')
            ax.legend()
            plt.show()
def main():
    """Train an imitation policy (behavioral cloning or 'ga' mode) from expert data.

    Near-duplicate of the other training entry point in this file, without the
    extra progress prints.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (
        mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy: Gaussian for continuous actions, Gibbs otherwise.
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(),))

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp, policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o:o,
            ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            # GAIL-style learned discriminator reward.
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
        elif args.reward_type in ['l2ball', 'simplex']:
            # Linear apprenticeship-learning rewards (FEM / game-theoretic).
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        # Optional neural network baseline for variance reduction.
        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1./mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(
                min_num_trajs=-1,
                min_total_sa=args.min_total_sa,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer
    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info,
                  print_header=i % (20*args.print_freq) == 0,
                  display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)
        if args.plot_freq != 0 and i % args.plot_freq == 0:
            # Stack (obs, action) pairs for expert and for the latest policy batch.
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
            pdata_M_Doa = np.concatenate([opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1)

            # Plot reward
            # NOTE(review): reward is None in bclone mode — presumably plotting
            # is only intended for --mode ga; confirm before enabling --plot_freq.
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0,1
            range1 = (min(exdata_N_Doa[:,idx1].min(), pdata_M_Doa[:,idx1].min()),
                      max(exdata_N_Doa[:,idx1].max(), pdata_M_Doa[:,idx1].max()))
            range2 = (min(exdata_N_Doa[:,idx2].min(), pdata_M_Doa[:,idx2].min()),
                      max(exdata_N_Doa[:,idx2].max(), pdata_M_Doa[:,idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:,idx1], exdata_N_Doa[:,idx2], color='blue', s=1, label='expert')
            # Plot policy samples
            ax.scatter(pdata_M_Doa[:,idx1], pdata_M_Doa[:,idx2], color='red', s=1, label='apprentice')
            ax.legend()
            plt.show()
def main():
    """Train a policy with TRPO (plus a learned value-function baseline) on a Gym MDP."""
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_traj_len', type=int, default=None)
    parser.add_argument('--env_name', type=str, required=True)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--enable_obsnorm', type=int, default=1)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--use_tanh', type=int, default=0)
    # Optimizer
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    # Sampling
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=100000)
    # Saving stuff
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    if args.tiny_policy or args.use_tanh:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    if args.use_tanh:
        # Rewrite every nonlinearity in the architecture spec to tanh.
        arch = json.loads(args.policy_hidden_spec)
        for layer in arch:
            if layer['type'] == 'nonlin':
                layer['func'] = 'tanh'
        args.policy_hidden_spec = json.dumps(arch)
        print('Modified architecture:', args.policy_hidden_spec)

    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (
        mdp.obs_space.dim, mdp.action_space.storage_size))

    # Gaussian policy for continuous actions, Gibbs (categorical) otherwise.
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=bool(args.enable_obsnorm))
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=bool(args.enable_obsnorm))
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    util.header('Policy architecture')
    policy.print_trainable_variables()

    # Value-function baseline trained alongside the policy.
    vf = rl.ValueFunc(hidden_spec=args.policy_hidden_spec,
                      obsfeat_space=mdp.obs_space,
                      enable_obsnorm=bool(args.enable_obsnorm),
                      enable_vnorm=True,
                      max_kl=args.vf_max_kl,
                      damping=args.vf_cg_damping,
                      time_scale=1. / mdp.env_spec.timestep_limit,
                      varscope_name='ValueFunc')

    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print('Max traj len:', max_traj_len)

    opt = rl.SamplingPolicyOptimizer(
        mdp=mdp,
        discount=args.discount,
        lam=args.lam,
        policy=policy,
        # min_num_trajs=-1: sample until min_total_sa state-actions are collected.
        sim_cfg=SimConfig(min_num_trajs=-1,
                          min_total_sa=args.min_total_sa,
                          batch_size=args.sim_batch_size,
                          max_traj_len=max_traj_len),
        step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
        value_func=vf,
        obsfeat_fn=lambda obs: obs,
    )

    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in range(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info, print_header=i % 20 == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)
def main():
    """Load a trained policy from HDF5 and either evaluate it, dump sampled
    trajectories to an HDF5 file (--out), or animate it interactively."""
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int, default=None)  # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP from the environment the policy was trained on.
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (
        mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters.
    # Older snapshots stored 'enable_obsnorm' directly; newer ones store 'obsnorm_mode'.
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    if args.eval_only:
        n = 50
        print 'Evaluating based on {} trajs'.format(n)

        # NOTE(review): dead debug branch deliberately disabled with `if False:`
        # — the multiprocessing evaluation path. The live path below runs a
        # single simulator serially.
        if False:
            eval_trajbatch = mdp.sim_mp(
                policy_fn=lambda obs_B_Do: policy.sample_actions(obs_B_Do, args.deterministic),
                obsfeat_fn=lambda obs:obs,
                cfg=policyopt.SimConfig(
                    min_num_trajs=n, min_total_sa=-1,
                    batch_size=None, max_traj_len=args.max_traj_len))
            returns = eval_trajbatch.r.padded(fill=0.).sum(axis=1)
            avgr = eval_trajbatch.r.stacked.mean()
            lengths = np.array([len(traj) for traj in eval_trajbatch])
            ent = policy._compute_actiondist_entropy(eval_trajbatch.adist.stacked).mean()
            print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
            print 'avgr: {}'.format(avgr)
            print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
            print 'ent: {}'.format(ent)
            print returns
        else:
            returns = []
            lengths = []
            sim = mdp.new_sim()
            for i_traj in xrange(n):
                print i_traj, n
                sim.reset()
                totalr = 0.
                l = 0  # trajectory length (NOTE: shadows nothing here, but `l` is easy to misread)
                while not sim.done:
                    # sample_actions returns (actions, action dists); take action for the single obs row.
                    a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
                    r = sim.step(a)
                    totalr += r
                    l += 1
                returns.append(totalr)
                lengths.append(l)
            # Drop into an interactive shell to inspect returns/lengths.
            import IPython; IPython.embed()

    elif args.out is not None:
        # Sample trajs and write to file
        print 'Saving traj samples to file: {}'.format(args.out)
        assert not os.path.exists(args.out)
        assert args.count > 0
        # Simulate to create a trajectory batch
        util.header('Sampling {} trajectories of maximum length {}'.format(args.count, args.max_traj_len))
        trajs = []
        for i in tqdm.trange(args.count):
            trajs.append(mdp.sim_single(
                lambda obs: policy.sample_actions(obs, args.deterministic),
                lambda obs: obs,
                args.max_traj_len))
        trajbatch = policyopt.TrajBatch.FromTrajs(trajs)
        print
        print 'Average return:', trajbatch.r.padded(fill=0.).sum(axis=1).mean()

        # Save the trajs to a file
        with h5py.File(args.out, 'w') as f:
            def write(name, a):
                # chunks of 128 trajs each
                f.create_dataset(name, data=a, chunks=(min(128, a.shape[0]),)+a.shape[1:], compression='gzip', compression_opts=9)
            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32))
            # Also save args to this script
            argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            f.attrs['args'] = argstr

    else:
        # Animate: run trajectories forever, rendering each step.
        sim = mdp.new_sim()
        raw_obs, normalized_obs = [], []
        while True:
            sim.reset()
            totalr = 0.
            steps = 0
            while not sim.done:
                raw_obs.append(sim.obs[None,:])
                normalized_obs.append(policy.compute_internal_normalized_obsfeat(sim.obs[None,:]))
                a = policy.sample_actions(sim.obs[None,:], args.deterministic)[0][0,:]
                r = sim.step(a)
                totalr += r
                steps += 1
                sim.draw()
                if steps % 1000 == 0:
                    # Periodically dump observation-normalization statistics.
                    tmpraw = np.concatenate(raw_obs, axis=0)
                    tmpnormed = np.concatenate(normalized_obs, axis=0)
                    print 'raw mean, raw std, normed mean, normed std'
                    print np.stack([tmpraw.mean(0), tmpraw.std(0), tmpnormed.mean(0), tmpnormed.std(0)])
            print 'Steps: %d, return: %.5f' % (steps, totalr)
def main():
    """Train an imitation policy with optional checkpoint resume.

    Variant of the other training entry points that can reload policy, reward,
    and value-function parameters from '<checkpoint>_policy.h5' /
    '_reward.h5' / '_vf.h5' files, and writes three parallel training logs.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--resume_training', action='store_true',
                        help="Resume training from a checkpoint: --policy_checkpoint. Currently only supports GAIL with nn policy, reward and vf")
    parser.add_argument('--checkpoint', type=str,
                        help="Load from checkpoint if provided and if --resume_training")
    parser.add_argument('--limit_trajs', type=int, required=True,
                        help="How many expert trajectories to be used for training. If None : full dataset is used.")
    parser.add_argument('--data_subsamp_freq', type=int, required=True,
                        help="A number between 0 and max_traj_len. Rate of subsampling of expert trajectories while creating the dataset of expert transitions (state-action)")
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)
    print "\n\n========== Policy network specifications loaded ===========\n\n"

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (
        mdp.obs_space.dim, mdp.action_space.storage_size))
    print "\n\n========== MDP initialized ===========\n\n"

    # Initialize the policy: Gaussian for continuous actions, Gibbs otherwise.
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>.
    # NOTE(review): `file` shadows the builtin; checkpoint name is assumed to
    # end in '.h5' (the [:-3] slice strips exactly that suffix) — confirm.
    if args.resume_training:
        if args.checkpoint is not None:
            file, policy_key = util.split_h5_name(args.checkpoint)
            policy_file = file[:-3]+'_policy.h5'
            policy.load_h5(policy_file, policy_key)

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(),))
    print "\n\n========== Policy initialized ===========\n\n"

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1
    print "\n\n========== Expert data loaded ===========\n\n"

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None  #There is no role of the reward function or value function in behavioral cloning
        opt = imitation.BehavioralCloningOptimizer(
            mdp, policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o:o,
            ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)
        print "======= Behavioral Cloning optimizer initialized ======="

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(  #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
            #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>.
            if args.resume_training:
                if args.checkpoint is not None:
                    file, reward_key = util.split_h5_name(args.checkpoint)
                    reward_file = file[:-3]+'_reward.h5'
                    print reward_file
                    reward.load_h5(reward_file, reward_key)
        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.ValueFunc(  #Add resume training functionality
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1./mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')
        # NOTE(review): this resume path assumes vf is not None (i.e. --no_vf
        # was not set) — confirm, since vf.load_h5 would fail on None.
        if args.resume_training:
            if args.checkpoint is not None:
                file, vf_key = util.split_h5_name(args.checkpoint)
                vf_file = file[:-3]+'_vf.h5'
                vf.load_h5(vf_file, vf_key)

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(
                min_num_trajs=-1,
                min_total_sa=args.min_total_sa,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))
    print "======== Observation normalization done ========"

    # Run optimizer
    print "======== Optimization begins ========"
    # Trial: make checkpoints for policy, reward and vf
    # NOTE(review): --log is optional (required=False) but is sliced here
    # unconditionally; args.log=None would raise TypeError. Confirm --log is
    # always supplied when using this entry point, and that it ends in '.h5'.
    policy_log = nn.TrainingLog(args.log[:-3]+'_policy.h5', [('args', argstr)])
    reward_log = nn.TrainingLog(args.log[:-3]+'_reward.h5', [('args', argstr)])
    vf_log = nn.TrainingLog(args.log[:-3]+'_vf.h5', [('args', argstr)])

    for i in xrange(args.max_iter):
        #Optimization step
        iter_info = opt.step()
        #Log and plot
        #pdb.set_trace()
        policy_log.write(iter_info,
                         print_header=i % (20*args.print_freq) == 0,
                         display=i % args.print_freq == 0  ## FIXME: AS remove comment
                         )
        reward_log.write(iter_info,
                         print_header=i % (20*args.print_freq) == 0,
                         display=i % args.print_freq == 0  ## FIXME: AS remove comment
                         )
        vf_log.write(iter_info,
                     print_header=i % (20*args.print_freq) == 0,
                     display=i % args.print_freq == 0  ## FIXME: AS remove comment
                     )
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            policy_log.write_snapshot(policy, i)
            reward_log.write_snapshot(reward, i)
            vf_log.write_snapshot(vf, i)
        if args.plot_freq != 0 and i % args.plot_freq == 0:
            # Stack (obs, action) pairs for expert and latest policy samples.
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
            pdata_M_Doa = np.concatenate([opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0,1
            range1 = (min(exdata_N_Doa[:,idx1].min(), pdata_M_Doa[:,idx1].min()),
                      max(exdata_N_Doa[:,idx1].max(), pdata_M_Doa[:,idx1].max()))
            range2 = (min(exdata_N_Doa[:,idx2].min(), pdata_M_Doa[:,idx2].min()),
                      max(exdata_N_Doa[:,idx2].max(), pdata_M_Doa[:,idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:,idx1], exdata_N_Doa[:,idx2], color='blue', s=1, label='expert')
            # Plot policy samples
            ax.scatter(pdata_M_Doa[:,idx1], pdata_M_Doa[:,idx2], color='red', s=1, label='apprentice')
            ax.legend()
            plt.show()
def main():
    """Render a trained policy as a sequence of PNG frames.

    Loads a policy checkpoint from an HDF5 file, rebuilds the matching Gym MDP
    and policy network, then runs rollouts, dumping one image per simulator
    step into ``output_dir`` until ``--max_steps`` frames have been written.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('output_dir', type=str)
    parser.add_argument('--deterministic', default=1, type=int)
    parser.add_argument('--max_steps', type=int, required=True)
    parser.add_argument('--env_name', type=str, default=None)
    args = parser.parse_args()

    # Refuse to overwrite frames from a previous run.
    util.mkdir_p(args.output_dir)
    assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir
    print 'Writing to', args.output_dir

    # Load the saved state. The h5 attrs carry the original training args as JSON.
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP; --env_name overrides the environment recorded at training time.
    env_name = train_args['env_name'] if args.env_name is None else args.env_name
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))
    util.header('Max steps is {}'.format(args.max_steps))

    # Initialize the policy and load its parameters.
    # Older checkpoints stored 'enable_obsnorm' directly; newer ones store 'obsnorm_mode'.
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        # Continuous actions -> Gaussian policy
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        # Discrete actions -> Gibbs (softmax) policy
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Animate: roll out episodes, saving one PNG per step.
    # NOTE: `exit` shadows the builtin; it is only a local loop flag here.
    sim = mdp.new_sim()
    steps = 0
    exit = False
    while not exit:
        sim.reset()
        while not sim.done:
            a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
            sim.step(a)
            sim.draw()
            viewer = sim.env.viewer
            data, w, h = viewer.get_image()
            # Viewer rows come bottom-up; flip vertically to normal image orientation.
            image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:]
            # image[:,:,::-1] swaps RGB -> BGR, the channel order cv2.imwrite expects.
            cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1])
            print steps
            steps += 1
            if steps >= args.max_steps:
                exit = True
                break
def print_trainable_variables(self):
    """Print one line per trainable variable (name and size), then the total parameter count."""
    for var in self.get_trainable_variables():
        line = '- %s (%d parameters)' % (var.name, var.get_value().size)
        util.header(line)
    util.header('Total: %d parameters' % (self.get_num_params(),))
def phase2_eval(spec, specfilename):
    """Phase 2 of the pipeline: evaluate every trained checkpoint in `spec`.

    For each (task, algorithm, dataset size, run) combination, loads the
    corresponding checkpoint, picks a snapshot (best validation accuracy for
    behavioral cloning, last snapshot for ga/fem/simplex), evaluates its true
    return via `eval_snapshot`, and writes all results plus the expert's
    returns into a single pandas HDF5 results file.
    """
    util.header('=== Phase 2: evaluating trained models ===')
    import pandas as pd
    taskname2dset = gen_taskname2outfile(spec)

    # This is where model logs are stored.
    # We will also store the evaluation here.
    checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
    print 'Evaluating results in {}'.format(checkptdir)
    results_full_path = os.path.join(checkptdir, spec['options']['results_filename'])
    print 'Will store results in {}'.format(results_full_path)
    # Never clobber an existing results file.
    if os.path.exists(results_full_path):
        raise RuntimeError('Results file {} already exists'.format(results_full_path))

    # First, pre-determine which evaluations we have to do
    evals_to_do = []
    nonexistent_checkptfiles = []
    for task in spec['tasks']:
        # See how well the algorithms did...
        for alg in spec['training']['algorithms']:
            # ...on various dataset sizes
            for num_trajs in spec['training']['dataset_num_trajs']:
                # for each rerun, for mean / error bars later
                for run in range(spec['training']['runs']):
                    # Make sure the checkpoint file exists (maybe PBS dropped some jobs)
                    strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                    checkptfile = os.path.join(checkptdir, strid + '.h5')
                    if not os.path.exists(checkptfile):
                        nonexistent_checkptfiles.append(checkptfile)
                    evals_to_do.append((task, alg, num_trajs, run, checkptfile))
    # Fail fast with the full list of missing files before any evaluation work.
    if nonexistent_checkptfiles:
        print 'Cannot find checkpoint files:\n', '\n'.join(nonexistent_checkptfiles)
        raise RuntimeError

    # Walk through all saved checkpoints
    collected_results = []
    for i_eval, (task, alg, num_trajs, run, checkptfile) in enumerate(evals_to_do):
        util.header('Evaluating run {}/{}: alg={},task={},num_trajs={},run={}'.format(
            i_eval+1, len(evals_to_do), alg['name'], task['name'], num_trajs, run))

        # Load the task's traj dataset to see how well the expert does
        with h5py.File(taskname2dset[task['name']], 'r') as trajf:
            # Expert's true return and traj lengths
            ex_traj_returns = trajf['r_B_T'][...].sum(axis=1)
            ex_traj_lengths = trajf['len_B'][...]

        # Load the checkpoint file
        with pd.HDFStore(checkptfile, 'r') as f:
            log_df = f['log']
            log_df.set_index('iter', inplace=True)

            # Evaluate true return for the learned policy
            if alg['name'] == 'bclone':
                # Pick the policy with the best validation accuracy
                best_snapshot_idx = log_df['valacc'].argmax()
                alg_traj_returns, alg_traj_lengths = eval_snapshot(
                    task['env'], checkptfile, best_snapshot_idx,
                    spec['options']['eval_num_trajs'], deterministic=True)
            elif any(alg['name'].startswith(s) for s in ('ga', 'fem', 'simplex')):
                # Evaluate the last saved snapshot.
                # NOTE(review): `f.root` reaches into the PyTables handle underlying
                # the HDFStore to enumerate snapshot group names — verify this
                # attribute exists on the pandas version in use.
                snapshot_names = f.root.snapshots._v_children.keys()
                assert all(name.startswith('iter') for name in snapshot_names)
                snapshot_inds = sorted([int(name[len('iter'):]) for name in snapshot_names])
                best_snapshot_idx = snapshot_inds[-1]
                alg_traj_returns, alg_traj_lengths = eval_snapshot(
                    task['env'], checkptfile, best_snapshot_idx,
                    spec['options']['eval_num_trajs'], deterministic=True)
            else:
                raise NotImplementedError('Analysis not implemented for {}'.format(alg['name']))

        collected_results.append({
            # Trial info
            'alg': alg['name'],
            'task': task['name'],
            'num_trajs': num_trajs,
            'run': run,
            # Expert performance
            'ex_traj_returns': ex_traj_returns,
            'ex_traj_lengths': ex_traj_lengths,
            # Learned policy performance
            'alg_traj_returns': alg_traj_returns,
            'alg_traj_lengths': alg_traj_lengths,
        })

    collected_results = pd.DataFrame(collected_results)
    with pd.HDFStore(results_full_path, 'w') as outf:
        outf['results'] = collected_results
def main():
    """Evaluate a saved policy snapshot over 50 rollouts and write results to CSV.

    Picks a snapshot iteration from the checkpoint directory based on keywords
    in the policy path, rolls out the policy, and writes a summary row
    (average/std return) to ``<policy>.csv`` plus per-trajectory rows to
    ``<policy>full.csv``.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)
    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int, default=None)  # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    #filenames = os.listdir(args.policy)
    # Summary CSV (one row) and per-trajectory CSV, named after the policy path.
    csvf = open(args.policy[:-3] + '.csv', 'w')
    csvwriter = csv.writer(csvf)
    dataf = open(args.policy[:-3] + 'full.csv', 'w')
    datawriter = csv.writer(dataf)
    #csvwriter.writerow(['filename', 'average', 'std'])

    # Load the saved state.
    # Snapshot iteration to load is inferred from the environment name embedded
    # in the path. NOTE(review): `.find(...) > 0` misses a match at position 0
    # (i.e. a path that *starts* with 'reacher'/'humanoid') — confirm intended.
    if args.policy.find('reacher') > 0:
        key_iter = 200
    elif args.policy.find('humanoid') > 0:
        key_iter = 1500
    else:
        key_iter = 500
    policy_file, policy_key = util.split_h5_name(args.policy + '/snapshots/iter%07d' % key_iter)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # 'shared1' in the path selects the shared-std network variant.
    if args.policy.find('shared1') > 0:
        sharednet = True
    else:
        sharednet = False

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))
    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters.
    # Older checkpoints stored 'enable_obsnorm'; newer ones store 'obsnorm_mode'.
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy', use_shared_std_network=sharednet)
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy', use_shared_std_network=sharednet)
    policy.load_h5(policy_file, policy_key)

    # Roll out n trajectories and record returns/lengths.
    n = 50
    print 'Evaluating based on {} trajs'.format(n)
    returns = []
    lengths = []  # collected but only summarized via `returns` below
    sim = mdp.new_sim()
    for i_traj in xrange(n):
        iteration = 0
        sim.reset()
        totalr = 0.
        l = 0
        while not sim.done and iteration < args.max_traj_len:
            a = policy.sample_actions(sim.obs[None, :], bool(args.deterministic))[0][0, :]
            r = sim.step(a)
            totalr += r
            l += 1
            iteration += 1
        print i_traj, n, totalr, iteration
        datawriter.writerow([i_traj, n, totalr, iteration])
        returns.append(totalr)
        lengths.append(l)
    avg, std = np.array(returns).mean(), np.array(returns).std()
    print 'Avg Return: ', avg, 'Std: ', std
    csvwriter.writerow([args.policy, avg, std])
    del policy
    #import IPython; IPython.embed()
    csvf.close()
    dataf.close()
def main():
    """Evaluate, sample, or animate a trained policy loaded from an h5 checkpoint.

    Three mutually exclusive modes:
      * ``--eval_only``: roll out 50 trajectories and print return statistics;
      * ``--out``: sample ``--count`` trajectories and save them (padded) to an
        h5 file for later use as an expert dataset;
      * otherwise: animate rollouts while printing observation statistics.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)
    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int, default=None)  # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    # Load the saved state; the h5 attrs carry the original training args as JSON.
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))
    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters.
    # Older checkpoints stored 'enable_obsnorm'; newer ones store 'obsnorm_mode'.
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    if args.eval_only:
        n = 50
        print 'Evaluating based on {} trajs'.format(n)
        if False:
            # Dead branch: multiprocess batch evaluation, kept disabled.
            eval_trajbatch = mdp.sim_mp(
                policy_fn=lambda obs_B_Do: policy.sample_actions(obs_B_Do, args.deterministic),
                obsfeat_fn=lambda obs: obs,
                cfg=policyopt.SimConfig(
                    min_num_trajs=n, min_total_sa=-1,
                    batch_size=None, max_traj_len=args.max_traj_len))
            returns = eval_trajbatch.r.padded(fill=0.).sum(axis=1)
            avgr = eval_trajbatch.r.stacked.mean()
            lengths = np.array([len(traj) for traj in eval_trajbatch])
            ent = policy._compute_actiondist_entropy(eval_trajbatch.adist.stacked).mean()
            print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
            print 'avgr: {}'.format(avgr)
            print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
            print 'ent: {}'.format(ent)
            print returns
        else:
            # Serial single-sim evaluation.
            returns = []
            lengths = []
            sim = mdp.new_sim()
            for i_traj in xrange(n):
                iteration = 0
                sim.reset()
                totalr = 0.
                l = 0
                while not sim.done and iteration < args.max_traj_len:
                    a = policy.sample_actions(sim.obs[None, :], bool(args.deterministic))[0][0, :]
                    r = sim.step(a)
                    totalr += r
                    l += 1
                    iteration += 1
                print i_traj, n, totalr, iteration
                returns.append(totalr)
                lengths.append(l)
            print 'Avg Return: ', np.array(returns).mean()
            print 'Std Return: ', np.array(returns).std()
            #import IPython; IPython.embed()

    elif args.out is not None:
        # Sample trajs and write to file
        print 'Saving traj samples to file: {}'.format(args.out)
        assert not os.path.exists(args.out)
        assert args.count > 0
        # Simulate to create a trajectory batch
        util.header('Sampling {} trajectories of maximum length {}'.format(args.count, args.max_traj_len))
        trajs = []
        for i in tqdm.trange(args.count):
            trajs.append(mdp.sim_single(
                lambda obs: policy.sample_actions(obs, args.deterministic),
                lambda obs: obs,
                args.max_traj_len))
        trajbatch = policyopt.TrajBatch.FromTrajs(trajs)
        print
        print 'Average return:', trajbatch.r.padded(fill=0.).sum(axis=1).mean()
        # Save the trajs to a file
        with h5py.File(args.out, 'w') as f:
            def write(name, a):
                # chunks of 128 trajs each
                f.create_dataset(name, data=a, chunks=(min(128, a.shape[0]),) + a.shape[1:], compression='gzip', compression_opts=9)
            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32))
            # Also save args to this script
            argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            f.attrs['args'] = argstr

    else:
        # Animate: 50 rollouts with on-screen drawing, tracking raw vs. policy-normalized
        # observations so their statistics can be compared.
        sim = mdp.new_sim()
        raw_obs, normalized_obs = [], []
        tret_list = []
        iteration = 0
        while iteration < 50:
            sim.reset()
            totalr = 0.
            steps = 0
            while not sim.done:
                raw_obs.append(sim.obs[None, :])
                normalized_obs.append(policy.compute_internal_normalized_obsfeat(sim.obs[None, :]))
                a = policy.sample_actions(sim.obs[None, :], args.deterministic)[0][0, :]
                r = sim.step(a)
                totalr += r
                steps += 1
                sim.draw()
                # Every max_traj_len steps, dump observation statistics and cut the episode.
                if steps % args.max_traj_len == 0:
                    tmpraw = np.concatenate(raw_obs, axis=0)
                    tmpnormed = np.concatenate(normalized_obs, axis=0)
                    print 'raw mean, raw std, normed mean, normed std'
                    print np.stack([tmpraw.mean(0), tmpraw.std(0), tmpnormed.mean(0), tmpnormed.std(0)])
                    break
            print 'Steps: %d, return: %.5f' % (steps, totalr)
            tret_list.append(totalr)
            iteration += 1
        print 'Avg Return: ', np.array(tret_list).mean()
        print 'Std Return: ', np.array(tret_list).std()
def phase2_eval(spec, specfilename):
    """Phase 2 of the pipeline: evaluate every trained checkpoint in `spec`.

    NOTE(review): this is a byte-for-byte duplicate of an earlier
    ``phase2_eval`` definition in this file; at import time this later
    definition silently shadows the earlier one. Consider removing one copy.

    For each (task, algorithm, dataset size, run) combination, loads the
    corresponding checkpoint, picks a snapshot (best validation accuracy for
    behavioral cloning, last snapshot for ga/fem/simplex), evaluates its true
    return via `eval_snapshot`, and writes all results plus the expert's
    returns into a single pandas HDF5 results file.
    """
    util.header('=== Phase 2: evaluating trained models ===')
    import pandas as pd
    taskname2dset = gen_taskname2outfile(spec)

    # This is where model logs are stored.
    # We will also store the evaluation here.
    checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
    print 'Evaluating results in {}'.format(checkptdir)
    results_full_path = os.path.join(checkptdir, spec['options']['results_filename'])
    print 'Will store results in {}'.format(results_full_path)
    # Never clobber an existing results file.
    if os.path.exists(results_full_path):
        raise RuntimeError('Results file {} already exists'.format(results_full_path))

    # First, pre-determine which evaluations we have to do
    evals_to_do = []
    nonexistent_checkptfiles = []
    for task in spec['tasks']:
        # See how well the algorithms did...
        for alg in spec['training']['algorithms']:
            # ...on various dataset sizes
            for num_trajs in spec['training']['dataset_num_trajs']:
                # for each rerun, for mean / error bars later
                for run in range(spec['training']['runs']):
                    # Make sure the checkpoint file exists (maybe PBS dropped some jobs)
                    strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                    checkptfile = os.path.join(checkptdir, strid + '.h5')
                    if not os.path.exists(checkptfile):
                        nonexistent_checkptfiles.append(checkptfile)
                    evals_to_do.append((task, alg, num_trajs, run, checkptfile))
    # Fail fast with the full list of missing files before any evaluation work.
    if nonexistent_checkptfiles:
        print 'Cannot find checkpoint files:\n', '\n'.join(nonexistent_checkptfiles)
        raise RuntimeError

    # Walk through all saved checkpoints
    collected_results = []
    for i_eval, (task, alg, num_trajs, run, checkptfile) in enumerate(evals_to_do):
        util.header('Evaluating run {}/{}: alg={},task={},num_trajs={},run={}'.format(
            i_eval+1, len(evals_to_do), alg['name'], task['name'], num_trajs, run))

        # Load the task's traj dataset to see how well the expert does
        with h5py.File(taskname2dset[task['name']], 'r') as trajf:
            # Expert's true return and traj lengths
            ex_traj_returns = trajf['r_B_T'][...].sum(axis=1)
            ex_traj_lengths = trajf['len_B'][...]

        # Load the checkpoint file
        with pd.HDFStore(checkptfile, 'r') as f:
            log_df = f['log']
            log_df.set_index('iter', inplace=True)

            # Evaluate true return for the learned policy
            if alg['name'] == 'bclone':
                # Pick the policy with the best validation accuracy
                best_snapshot_idx = log_df['valacc'].argmax()
                alg_traj_returns, alg_traj_lengths = eval_snapshot(
                    task['env'], checkptfile, best_snapshot_idx,
                    spec['options']['eval_num_trajs'], deterministic=True)
            elif any(alg['name'].startswith(s) for s in ('ga', 'fem', 'simplex')):
                # Evaluate the last saved snapshot.
                # NOTE(review): `f.root` reaches into the PyTables handle underlying
                # the HDFStore to enumerate snapshot group names — verify this
                # attribute exists on the pandas version in use.
                snapshot_names = f.root.snapshots._v_children.keys()
                assert all(name.startswith('iter') for name in snapshot_names)
                snapshot_inds = sorted([int(name[len('iter'):]) for name in snapshot_names])
                best_snapshot_idx = snapshot_inds[-1]
                alg_traj_returns, alg_traj_lengths = eval_snapshot(
                    task['env'], checkptfile, best_snapshot_idx,
                    spec['options']['eval_num_trajs'], deterministic=True)
            else:
                raise NotImplementedError('Analysis not implemented for {}'.format(alg['name']))

        collected_results.append({
            # Trial info
            'alg': alg['name'],
            'task': task['name'],
            'num_trajs': num_trajs,
            'run': run,
            # Expert performance
            'ex_traj_returns': ex_traj_returns,
            'ex_traj_lengths': ex_traj_lengths,
            # Learned policy performance
            'alg_traj_returns': alg_traj_returns,
            'alg_traj_lengths': alg_traj_lengths,
        })

    collected_results = pd.DataFrame(collected_results)
    with pd.HDFStore(results_full_path, 'w') as outf:
        outf['results'] = collected_results
def main():
    """Render a trained policy as a sequence of PNG frames (rllab variant).

    Same flow as the Gym-based renderer elsewhere in this codebase, but builds
    the environment through ``rllabenv.RLLabMDP``: load a policy checkpoint,
    reconstruct the policy network, then roll out and dump one image per
    simulator step into ``output_dir`` until ``--max_steps`` frames are written.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('output_dir', type=str)
    parser.add_argument('--deterministic', default=1, type=int)
    parser.add_argument('--max_steps', type=int, required=True)
    parser.add_argument('--env_name', type=str, default=None)
    args = parser.parse_args()

    # Refuse to overwrite frames from a previous run.
    util.mkdir_p(args.output_dir)
    assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir
    print 'Writing to', args.output_dir

    # Load the saved state; the h5 attrs carry the original training args as JSON.
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP; --env_name overrides the environment recorded at training time.
    env_name = train_args['env_name'] if args.env_name is None else args.env_name
    print 'Loading environment', env_name
    mdp = rllabenv.RLLabMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))
    util.header('Max steps is {}'.format(args.max_steps))

    # Initialize the policy and load its parameters.
    # Older checkpoints stored 'enable_obsnorm'; newer ones store 'obsnorm_mode'.
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        # Continuous actions -> Gaussian policy
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        # Discrete actions -> Gibbs (softmax) policy
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Animate: roll out episodes, saving one PNG per step.
    # NOTE: `exit` shadows the builtin; it is only a local loop flag here.
    sim = mdp.new_sim()
    steps = 0
    exit = False
    while not exit:
        sim.reset()
        while not sim.done:
            a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
            sim.step(a)
            sim.draw()
            viewer = sim.env.viewer
            data, w, h = viewer.get_image()
            # Viewer rows come bottom-up; flip vertically to normal image orientation.
            image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:]
            # image[:,:,::-1] swaps RGB -> BGR, the channel order cv2.imwrite expects.
            cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1])
            print steps
            steps += 1
            if steps >= args.max_steps:
                exit = True
                break
def main(): np.set_printoptions(suppress=True, precision=5, linewidth=1000) parser = argparse.ArgumentParser() parser.add_argument('--mode', choices=MODES, required=True) # Expert dataset parser.add_argument('--data', type=str, required=True) parser.add_argument( '--resume_training', action='store_true', help= "Resume training from a checkpoint: --policy_checkpoint. Currently only supports GAIL with nn policy, reward and vf" ) parser.add_argument( '--checkpoint', type=str, help="Load from checkpoint if provided and if --resume_training") parser.add_argument( '--limit_trajs', type=int, required=True, help= "How many expert trajectories to be used for training. If None : full dataset is used." ) parser.add_argument( '--data_subsamp_freq', type=int, required=True, help= "A number between 0 and max_traj_len. Rate of subsampling of expert trajectories while creating the dataset of expert transitions (state-action)" ) # MDP options parser.add_argument('--env_name', type=str, required=True) parser.add_argument('--max_traj_len', type=int, default=None) # Policy architecture parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE) parser.add_argument('--tiny_policy', action='store_true') parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata') # Behavioral cloning optimizer parser.add_argument('--bclone_lr', type=float, default=1e-3) parser.add_argument('--bclone_batch_size', type=int, default=128) # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100) parser.add_argument('--bclone_eval_ntrajs', type=int, default=20) parser.add_argument('--bclone_eval_freq', type=int, default=1000) parser.add_argument('--bclone_train_frac', type=float, default=.7) # Imitation optimizer parser.add_argument('--discount', type=float, default=.995) parser.add_argument('--lam', type=float, default=.97) parser.add_argument('--max_iter', type=int, default=1000000) parser.add_argument('--policy_max_kl', type=float, default=.01) 
parser.add_argument('--policy_cg_damping', type=float, default=.1, help="TRPO parameter") parser.add_argument('--no_vf', type=int, default=0) parser.add_argument('--vf_max_kl', type=float, default=.01) parser.add_argument('--vf_cg_damping', type=float, default=.1) parser.add_argument('--policy_ent_reg', type=float, default=0.) parser.add_argument('--reward_type', type=str, default='nn') # parser.add_argument('--linear_reward_bin_features', type=int, default=0) parser.add_argument('--reward_max_kl', type=float, default=.01, help="TRPO parameter") parser.add_argument('--reward_lr', type=float, default=.01) parser.add_argument('--reward_steps', type=int, default=1) parser.add_argument('--reward_ent_reg_weight', type=float, default=.001) parser.add_argument('--reward_include_time', type=int, default=0) parser.add_argument('--sim_batch_size', type=int, default=None) parser.add_argument('--min_total_sa', type=int, default=50000) parser.add_argument('--favor_zero_expert_reward', type=int, default=0) # Saving stuff parser.add_argument('--print_freq', type=int, default=1) parser.add_argument('--save_freq', type=int, default=20) parser.add_argument('--plot_freq', type=int, default=0) parser.add_argument('--log', type=str, required=False) # CVaR parameters parser.add_argument('--useCVaR', action='store_true') parser.add_argument('--CVaR_alpha', type=float, default=0.9) parser.add_argument('--CVaR_beta', type=float, default=0.) parser.add_argument('--CVaR_lr', type=float, default=0.01) # !!! The following argument --disc_CVaR_weight is not of use and should be removed parser.add_argument( '--disc_CVaR_weight', type=float, default=1., help= "Weight given to CVaR loss for the discriminator. Added by Anirban for smooth convergence." 
) parser.add_argument('--CVaR_Lambda_not_trainable', action='store_false') parser.add_argument('--CVaR_Lambda_val_if_not_trainable', type=float, default=0.5) #Filtering expert trajectories parser.add_argument('--use_expert_traj_filtering', action='store_true') parser.add_argument('--expert_traj_filt_percentile_threshold', type=float, default=20) # Additive state prior formulation parser.add_argument('--use_additiveStatePrior', action='store_true') parser.add_argument('--additiveStatePrior_weight', type=float, default=1.) parser.add_argument('--n_gmm_components', type=int, default=5) parser.add_argument('--cov_type_gmm', type=str, default='diag') parser.add_argument('--familiarity_alpha', type=float, default=10000000) parser.add_argument('--familiarity_beta', type=float, default=100) parser.add_argument('--kickThreshold_percentile', type=float, default=100.0) parser.add_argument('--appendFlag', action='store_true') args = parser.parse_args() if args.useCVaR: print ">>>>>>>>>>>>>>>>>>> TRAINING RAIL <<<<<<<<<<<<<<<<<<<" elif args.use_additiveStatePrior: print ">>>>>>>>>>>>>>>>>>> USING ADDITIVE STATE PRIOR <<<<<<<<<<<<<<<<<<<" else: print ">>>>>>>>> TRAINING GAIL <<<<<<<<<<" # Initialize the MDP if args.tiny_policy: assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set' args.policy_hidden_spec = TINY_ARCHITECTURE argstr = json.dumps(vars(args), separators=(',', ':'), indent=2) print(argstr) print "\n\n========== Policy network specifications loaded ===========\n\n" mdp = rlgymenv.RLGymMDP(args.env_name) util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)) print "\n\n========== MDP initialized ===========\n\n" # Initialize the policy enable_obsnorm = args.obsnorm_mode != 'none' if isinstance(mdp.action_space, policyopt.ContinuousSpace): policy_cfg = rl.GaussianPolicyConfig( hidden_spec=args.policy_hidden_spec, min_stdev=0., 
init_logstdev=0., enable_obsnorm=enable_obsnorm) policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy', args.useCVaR) else: policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec, enable_obsnorm=enable_obsnorm) policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy', args.useCVaR) offset = 0 #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>. if args.resume_training: if args.checkpoint is not None: file, policy_key = util.split_h5_name(args.checkpoint) offset = int(policy_key.split('/')[-1][4:]) print '\n**************************************************' print 'Resuming from checkpoint : %d of %s' % (offset, file) print '**************************************************\n' if args.appendFlag and file != args.log: raise RuntimeError( 'Log file and checkpoint should have the same name if appendFlag is on. %s vs %s' % file, args.log) policy_file = file[:-3] + '_policy.h5' # Because we're naming the file as *_policy.h5 itself policy.load_h5(policy_file, policy_key) util.header('Policy architecture') for v in policy.get_trainable_variables(): util.header('- %s (%d parameters)' % (v.name, v.get_value().size)) util.header('Total: %d parameters' % (policy.get_num_params(), )) print "\n\n========== Policy initialized ===========\n\n" # Load expert data exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset( args.data, args.limit_trajs, args.data_subsamp_freq, len_filtering=args.use_expert_traj_filtering, len_filter_threshold=args.expert_traj_filt_percentile_threshold) assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size assert ext_Bstacked.ndim == 1 print "\n\n========== Expert data loaded ===========\n\n" print '\n==================== Hyperparams ====================' print '\texpert_traj_filt_percentile_threshold = %f' % args.expert_traj_filt_percentile_threshold print 
'\tfamiliarity_alpha = %f' % args.familiarity_alpha print '\tfamiliarity_beta = %f' % args.familiarity_beta print '\tkickThreshold_percentile = %f' % args.kickThreshold_percentile print '==============================================\n' # Start optimization max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit print 'Max traj len:', max_traj_len if args.mode == 'bclone': # For behavioral cloning, only print output when evaluating args.print_freq = args.bclone_eval_freq args.save_freq = args.bclone_eval_freq reward, vf = None, None #There is no role of the reward function or value function in behavioral cloning opt = imitation.BehavioralCloningOptimizer( mdp, policy, lr=args.bclone_lr, batch_size=args.bclone_batch_size, obsfeat_fn=lambda o: o, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, eval_sim_cfg=policyopt.SimConfig( min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), eval_freq=args.bclone_eval_freq, train_frac=args.bclone_train_frac) print "======= Behavioral Cloning optimizer initialized =======" elif args.mode == 'ga': if args.reward_type == 'nn': reward = imitation.TransitionClassifier( #Add resume training functionality hidden_spec=args.policy_hidden_spec, obsfeat_space=mdp.obs_space, action_space=mdp.action_space, max_kl=args.reward_max_kl, adam_lr=args.reward_lr, adam_steps=args.reward_steps, ent_reg_weight=args.reward_ent_reg_weight, enable_inputnorm=True, include_time=bool(args.reward_include_time), time_scale=1. / mdp.env_spec.timestep_limit, favor_zero_expert_reward=bool(args.favor_zero_expert_reward), varscope_name='TransitionClassifier', useCVaR=args.useCVaR, CVaR_loss_weightage=args.disc_CVaR_weight) #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>. 
if args.resume_training: if args.checkpoint is not None: file, reward_key = util.split_h5_name(args.checkpoint) reward_file = file[:-3] + '_reward.h5' print reward_file reward.load_h5(reward_file, reward_key) elif args.reward_type in ['l2ball', 'simplex']: reward = imitation.LinearReward( obsfeat_space=mdp.obs_space, action_space=mdp.action_space, mode=args.reward_type, enable_inputnorm=True, favor_zero_expert_reward=bool(args.favor_zero_expert_reward), include_time=bool(args.reward_include_time), time_scale=1. / mdp.env_spec.timestep_limit, exobs_Bex_Do=exobs_Bstacked_Do, exa_Bex_Da=exa_Bstacked_Da, ext_Bex=ext_Bstacked) else: raise NotImplementedError(args.reward_type) vf = None if bool( args.no_vf) else rl.ValueFunc( #Add resume training functionality hidden_spec=args.policy_hidden_spec, obsfeat_space=mdp.obs_space, enable_obsnorm=args.obsnorm_mode != 'none', enable_vnorm=True, max_kl=args.vf_max_kl, damping=args.vf_cg_damping, time_scale=1. / mdp.env_spec.timestep_limit, varscope_name='ValueFunc') if args.resume_training: if args.checkpoint is not None: file, vf_key = util.split_h5_name(args.checkpoint) vf_file = file[:-3] + '_vf.h5' vf.load_h5(vf_file, vf_key) if args.useCVaR: opt = imitation.ImitationOptimizer_CVaR( mdp=mdp, discount=args.discount, lam=args.lam, policy=policy, sim_cfg=policyopt.SimConfig(min_num_trajs=-1, min_total_sa=args.min_total_sa, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping, useCVaR=True), reward_func=reward, value_func=vf, policy_obsfeat_fn=lambda obs: obs, reward_obsfeat_fn=lambda obs: obs, policy_ent_reg=args.policy_ent_reg, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, ex_t=ext_Bstacked, #For CVaR CVaR_alpha=args.CVaR_alpha, CVaR_beta=args.CVaR_beta, CVaR_lr=args.CVaR_lr, CVaR_Lambda_trainable=args.CVaR_Lambda_not_trainable, CVaR_Lambda_val_if_not_trainable=args. 
CVaR_Lambda_val_if_not_trainable, offset=offset + 1) elif args.use_additiveStatePrior: opt = imitation.ImitationOptimizer_additiveStatePrior( mdp=mdp, discount=args.discount, lam=args.lam, policy=policy, sim_cfg=policyopt.SimConfig(min_num_trajs=-1, min_total_sa=args.min_total_sa, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping, useCVaR=False), reward_func=reward, value_func=vf, policy_obsfeat_fn=lambda obs: obs, reward_obsfeat_fn=lambda obs: obs, policy_ent_reg=args.policy_ent_reg, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, ex_t=ext_Bstacked, n_gmm_components=args.n_gmm_components, cov_type_gmm=args.cov_type_gmm, additiveStatePrior_weight=args.additiveStatePrior_weight, alpha=args.familiarity_alpha, beta=args.familiarity_beta, kickThreshold_percentile=args.kickThreshold_percentile, offset=offset + 1) else: opt = imitation.ImitationOptimizer( mdp=mdp, discount=args.discount, lam=args.lam, policy=policy, sim_cfg=policyopt.SimConfig(min_num_trajs=-1, min_total_sa=args.min_total_sa, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping, useCVaR=False), reward_func=reward, value_func=vf, policy_obsfeat_fn=lambda obs: obs, reward_obsfeat_fn=lambda obs: obs, policy_ent_reg=args.policy_ent_reg, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, ex_t=ext_Bstacked) # Set observation normalization if args.obsnorm_mode == 'expertdata': policy.update_obsnorm(exobs_Bstacked_Do) if reward is not None: reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da) if vf is not None: vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do)) print "======== Observation normalization done ========" # Run optimizer print "======== Optimization begins ========" # Trial: make checkpoints for policy, reward and vf policy_log = nn.TrainingLog(args.log[:-3] + '_policy.h5', [('args', argstr)], 
args.appendFlag) reward_log = nn.TrainingLog(args.log[:-3] + '_reward.h5', [('args', argstr)], args.appendFlag) vf_log = nn.TrainingLog(args.log[:-3] + '_vf.h5', [('args', argstr)], args.appendFlag) kickStatesData = [] print '\n**************************************' print 'Running iterations from %d to %d' % (offset + 1, args.max_iter) for i in xrange(offset + 1, args.max_iter): # for i in range(1): #FIXME: this is just for studying the insides of the training algo # All training a.k.a. optimization happens in the next line!!! -_- # pdb.set_trace() iter_info = opt.step( i, kickStatesData) if args.use_additiveStatePrior else opt.step(i) #========= The rest is fluff ============= #Log and plot #pdb.set_trace() policy_log.write( iter_info, print_header=i % (20 * args.print_freq) == 0, # display=False display=i % args.print_freq == 0 ## FIXME: AS remove comment ) # reward_log.write(iter_info, # print_header=i % (20*args.print_freq) == 0, # display=False # # display=i % args.print_freq == 0 ## FIXME: AS remove comment # ) # vf_log.write(iter_info, # print_header=i % (20*args.print_freq) == 0, # display=False # # display=i % args.print_freq == 0 ## FIXME: AS remove comment # ) #FIXME: problem running this on 211 and 138. 
No problem on 151 if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None: policy_log.write_snapshot(policy, i) reward_log.write_snapshot(reward, i) vf_log.write_snapshot(vf, i) # analysisFile=open(args.log[:-3]+'_kickedStates' + str(i) + '.pkl', 'wb') analysisFile = open(args.log[:-3] + '_kickedStates.pkl', 'wb') pkl.dump({'kickStatesData': kickStatesData}, analysisFile, protocol=2) analysisFile.close() if args.plot_freq != 0 and i % args.plot_freq == 0: exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1) pdata_M_Doa = np.concatenate( [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1) # Plot reward import matplotlib.pyplot as plt _, ax = plt.subplots() idx1, idx2 = 0, 1 range1 = (min(exdata_N_Doa[:, idx1].min(), pdata_M_Doa[:, idx1].min()), max(exdata_N_Doa[:, idx1].max(), pdata_M_Doa[:, idx1].max())) range2 = (min(exdata_N_Doa[:, idx2].min(), pdata_M_Doa[:, idx2].min()), max(exdata_N_Doa[:, idx2].max(), pdata_M_Doa[:, idx2].max())) reward.plot(ax, idx1, idx2, range1, range2, n=100) # Plot expert data ax.scatter(exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2], color='blue', s=1, label='expert') # Plot policy samples ax.scatter(pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2], color='red', s=1, label='apprentice') ax.legend() plt.show()