Example #1
def expert(expert_policy_file,
           envname,
           render=False,
           max_timesteps=None,
           num_rollouts=20):
    # import argparse
    # parser = argparse.ArgumentParser()
    # parser.add_argument('expert_policy_file', type=str)
    # parser.add_argument('envname', type=str)
    # parser.add_argument('--render', action='store_true')
    # parser.add_argument("--max_timesteps", type=int)
    # parser.add_argument('--num_rollouts', type=int, default=20,
    #                     help='Number of expert roll outs')
    # args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        # import gym
        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                # if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        # print('returns', returns)
        # print('mean return', np.mean(returns))
        # print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }
        return expert_data
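
These listings are excerpts and omit their module-level imports. Below is a minimal usage sketch of what Example #1 assumes plus a sample call; the `load_policy`/`tf_util` modules and the expert-policy path follow the Berkeley DeepRL homework layout and are assumptions, not guaranteed to match your setup.

# Minimal usage sketch (assumptions: TF 1.x, an old gym with env.spec.timestep_limit,
# and load_policy.py / tf_util.py importable from the homework repo).
import gym
import numpy as np
import tensorflow as tf

import load_policy
import tf_util

if __name__ == '__main__':
    data = expert('experts/Hopper-v1.pkl',  # placeholder policy path
                  'Hopper-v1',
                  render=False,
                  num_rollouts=5)
    print(data['observations'].shape, data['actions'].shape)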
Example #2
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts',
                        type=int,
                        default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
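    # Note: here the "expert policy file" is expected to be a Python source
    # file (e.g. a RoboSchool zoo policy) that defines SmallReactivePolicy;
    # it is exec'd under a dummy __name__ so any `if __name__ == '__main__'`
    # block inside it does not run.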
    with open(args.expert_policy_file, 'r') as f:
        globals()['__name__'] = 'foo'
        exec(f.read(), globals())
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        policy = SmallReactivePolicy(env.observation_space, env.action_space)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy.act(obs)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }
Example #3
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int, default=1000)
    parser.add_argument('--num_rollouts', type=int, default=100)
    parser.add_argument('--store_data', action='store_true')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }

        if args.store_data:
            with open(os.path.join('expert_data', args.envname + '.pkl'),
                      'wb') as f:
                pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)
Example #4
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            this_obs=[]
            this_act=[]
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                this_obs.append(obs)
                this_act.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)
            observations.append(this_obs)
            actions.append(this_act)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
        print( (np.array(observations)).shape)
        print( (np.array(actions)).shape)
        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions), 'returns': np.array(returns)}
        with open('/Users/joker/imitation_learning/hopper_policy.pickle', 'wb') as handle:
            pickle.dump(expert_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #5
def get_data(args, init_observations=None, render=True):
    # if init_observations is None ---> generates expert data
    # if init_observations are fed ---> returns expert actions
    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')
    if init_observations is not None:
        print('initial observations: ', init_observations.shape)
    else:
        print('No initial observations: ')
    with tf.Session():
        tf_util.initialize()
        import gym
        env = gym.make(args.envname)
        obs = env.reset()
        max_steps = args.max_timesteps or env.spec.timestep_limit
        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            done = False
            totalr = 0.
            steps = 0
            while not done:
                if init_observations is not None:
                    obs = init_observations[steps]
                action = policy_fn(np.array(obs[None, :]))
                # print(action.shape)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                # print(r)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if init_observations is not None:
                    done = False
                    if steps == len(init_observations):
                        break
                else:
                    if steps >= max_steps:
                        break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }

        return expert_data
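
Example #5 doubles as a data generator and a DAgger relabeller, depending on whether init_observations is passed. A hedged usage sketch follows; the argument names mirror the argparse flags used elsewhere in these examples and the policy path is a placeholder.

from argparse import Namespace

args = Namespace(expert_policy_file='experts/Hopper-v1.pkl',  # placeholder path
                 envname='Hopper-v1',
                 max_timesteps=None,
                 num_rollouts=2)

# Mode 1: no initial observations -> roll out the expert and collect data.
expert_data = get_data(args, init_observations=None, render=False)

# Mode 2: feed previously collected observations -> get expert action labels
# for them (the DAgger relabelling step).
labels = get_data(args, init_observations=expert_data['observations'],
                  render=False)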
Example #6
def run_expert(envname,
               render,
               expert_policy_file,
               max_timesteps,
               num_rollouts,
               store=False):
    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions),
            'returns': np.array(returns)
        }

        if store:
            with open(os.path.join('expert_data',
                                   '{}-{}.pkl'.format(envname, num_rollouts)),
                      'wb') as f:
                pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)

        return returns, expert_data
Example #7
def run_expert_on_observations(observations, expert_policy_file):
    policy_fn = load_policy.load_policy(expert_policy_file)
    with tf.Session():
        tf_util.initialize()
        actions = []
        for obs in observations:
            action = policy_fn(obs[None, :])
            actions.append(action)
    return np.array(actions)
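
A possible DAgger aggregation step built on this helper. `student_obs` (states visited by the learned policy) and the existing `dataset` dict are assumptions, as is the policy path.

# Relabel the student's states with expert actions and grow the dataset,
# assuming both stores use the same (N, 1, act_dim) action layout.
new_actions = run_expert_on_observations(student_obs, 'experts/Hopper-v1.pkl')
dataset['observations'] = np.concatenate([dataset['observations'], student_obs])
dataset['actions'] = np.concatenate([dataset['actions'], new_actions])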
Example #8
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=50,
                        help='Number of expert roll outs')
    parser.add_argument('--verbose', type=int, choices=[0,1,2], default=1,
                        help='Verbose')
    args = parser.parse_args()
    env = gym.make(args.envname)

    max_steps = args.max_timesteps or env.spec.timestep_limit

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()
        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            # print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                observations.append(obs)
                actions.append(action[0])
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                # if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}

    student = Network(env)
 
    print("Behavior Cloning....")
    student.train(expert_data['observations'], expert_data['actions'], 300, 128, args.verbose)

    print("Generating rollouts from new model..")
    generate_rollouts(env, student, max_steps, args.num_rollouts, args.render, args.verbose)
Example #9
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_data_file', type=str)
    parser.add_argument('expert_norm_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_norm_file)
    print('loaded and built')

    data = load_data(args.expert_data_file)
    X, y = data['observations'], data['actions']
    norm_data = load_data(args.expert_norm_file)
    normed_X = norm_obs(X[None, :], norm_data)

    with tf.Session():
        tf_util.initialize()

        # import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        model = train(normed_X.squeeze(), y.squeeze(), env, norm_data, policy_fn)
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                x_input = norm_obs(obs[None, :], norm_data)
                action = model.predict(x_input)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
Example #10
def behavior_cloning(env_name=None,
                     expert_policy_file=None,
                     num_rollouts=10,
                     max_timesteps=None,
                     num_epochs=100,
                     save=None):

    tf.reset_default_graph()

    env = gym.make(env_name)
    max_steps = max_timesteps or env.spec.timestep_limit

    print('[BA] Loading and building expert policy')
    expert_policy_fn = load_policy.load_policy(expert_policy_file)

    print('[BA] Gather experience...')
    data = gather_expert_experience(num_rollouts, env, expert_policy_fn,
                                    max_steps)

    print('[BA] Expert\'s reward mean: {:4f}({:4f})'.format(
        np.mean(data['returns']), np.std(data['returns'])))

    print('[BA] Building cloning policy')
    policy = Policy(env, data['observations'])

    with tf.Session():
        tf_util.initialize()

        for epoch in tqdm(range(num_epochs)):
            num_samples = data['observations'].shape[0]
            perm = np.random.permutation(num_samples)

            obs_samples = data['observations'][perm]
            action_samples = data['actions'][perm]

            loss = 0.
            for k in range(0, obs_samples.shape[0], BATCH_SIZE):
                loss += policy.update(obs_samples[k:k + BATCH_SIZE],
                                      action_samples[k:k + BATCH_SIZE])

            new_exp = policy.test_run(env, max_steps)
            tqdm.write('[BA] Epoch {:3d}, Loss {:4f}, Reward {:4f}'.format(
                epoch, loss / num_samples, new_exp['reward']))

        if save is not None:
            env = wrappers.Monitor(env, save, force=True)

        results = []
        for _ in tqdm(range(num_rollouts)):
            results.append(policy.test_run(env, max_steps)['reward'])

        print('[BA] Reward mean & std of cloned policy: {:4f}({:4f})'.format(
            np.mean(results), np.std(results)))

    return np.mean(data['returns']), np.std(
        data['returns']), np.mean(results), np.std(results)
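
A hedged call sketch: the function returns reward statistics for the expert and for the cloned policy. The policy path is a placeholder; Policy, gather_expert_experience, BATCH_SIZE, tqdm and wrappers are assumed to be defined at module level as in the example above.

exp_mean, exp_std, bc_mean, bc_std = behavior_cloning(
    env_name='Hopper-v1',
    expert_policy_file='experts/Hopper-v1.pkl',
    num_rollouts=10,
    num_epochs=50,
    save=None)
print('expert {:.1f}+/-{:.1f} vs clone {:.1f}+/-{:.1f}'.format(
    exp_mean, exp_std, bc_mean, bc_std))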
Example #11
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_data', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts',
                        type=int,
                        default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--train_epochs', type=int, default=10)
    args = parser.parse_args()

    with tf.Session():
        tf_util.initialize()

        with open(os.path.join('expert_data', args.envname + '.pkl'),
                  "rb") as file:
            expert_data = pickle.load(file)

        # clone the observations
        observations, actions = expert_data['observations'], expert_data[
            'actions']
        print('actions', actions.shape)
        model = build_model(num_actions=actions.shape[-1])
        model.fit(observations, actions[:, 0, :], epochs=args.train_epochs)

        # rollout the cloned model
        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                obs = np.expand_dims(obs, 0)
                action = model.predict(obs)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break

            returns.append(totalr)

        print('returns', returns)
        print('mean', np.mean(returns), 'std', np.std(returns))
Example #12
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                observations.append(obs)
                actions.append(action)
                # print(action, action.shape)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
		#print "done",steps
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}
        with open("/home/sumuk/Desktop/sumuk/homework-master/hw1/data.pkl", "wb") as file1:
            pickle.dump(expert_data, file1)
Example #13
def expert(envname,
           dagger_step,
           num_rollouts,
           max_timesteps=None,
           render=False):
    print('generating expert data ...')
    imitator = tf_reset()
    input_ph, output_ph, output_pred = create_model()
    saver = tf.train.Saver()
    saver.restore(imitator, "dagger/%s.ckpt" % envname)
    policy_fn = load_policy.load_policy('experts/' + envname + '.pkl')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
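                # DAgger step: the environment is advanced with the imitator's
                # action, while the expert's action is stored as the label for
                # the state the imitator actually visited.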
                imitation = imitator.run(output_pred,
                                         feed_dict={input_ph: obs[None, :]})
                obs, r, done, _ = env.step(imitation)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps >= max_steps:
                    break
            returns.append(totalr)
        mean = np.mean(returns)
        std = np.std(returns)
        print("mean %d, std %d" % (mean, std))
        with open('expert_data/' + envname + '_' + str(dagger_step) + '.pkl',
                  'rb') as f:
            expert_data = pickle.loads(f.read())
        expert_data['observations'] = np.concatenate(
            (expert_data['observations'], np.array(observations)))
        expert_data['actions'] = np.concatenate(
            (expert_data['actions'], np.array(actions)))
        with open(
                'expert_data/' + envname + '_' + str(dagger_step + 1) + '.pkl',
                'wb') as f:
            pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)
        return mean, std
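
The function above performs a single DAgger iteration: it reads expert_data/<env>_<k>.pkl, aggregates relabelled on-policy states, and writes expert_data/<env>_<k+1>.pkl. A hedged sketch of the outer loop that might drive it, where `train_imitator` is a hypothetical helper that refits dagger/<env>.ckpt on the aggregated file:

means, stds = [], []
for k in range(5):  # number of DAgger iterations (arbitrary)
    train_imitator('Hopper-v1', dagger_step=k)  # hypothetical: fit dagger/Hopper-v1.ckpt on expert_data/Hopper-v1_<k>.pkl
    m, s = expert('Hopper-v1', dagger_step=k, num_rollouts=5)
    means.append(m)
    stds.append(s)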
Example #14
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('rollout_data', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--num_epochs', type=int, default=50)
    parser.add_argument('--log_dir', type=str)
    args = parser.parse_args()

    print('loading rollout data')
    with open(args.rollout_data, 'rb') as f:
        data = pickle.loads(f.read())
    observation_data = np.array(data['observations'])
    action_data = np.array(data['actions'])

    print('training supervised model')
    import gym
    env = gym.make(args.envname)
    cloning_policy = Model(env)
    cloning_policy.train(observation_data,action_data,args.num_epochs)

    print('running policy')
    with tf.Session():
        tf_util.initialize()
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = cloning_policy.predict(obs)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
Example #15
def get_expert_data(args):
    # TODO: docstring
    if args.load_expert_data:
        expert_data = load_expert_data(args.load_expert_data)
    else:
        env, _ = util.get_env(args.env_name)
        with tf.Session():
            tf_util.initialize()
            expert_data = ExpertPolicy(env, args).run_expert()
    return expert_data
Example #16
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--expert_policy_file', type=str, default="experts/Hopper-v1.pkl")
    parser.add_argument('--envname', type=str, default="Hopper-v1")
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=1000,
                        help='Number of expert roll outs')
    parser.add_argument('--out_file', type=str, help='save expert data to file')
    args = vars(parser.parse_args())

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args['expert_policy_file'])
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        env = gym.make(args['envname'])
        max_steps = args['max_timesteps'] or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args['num_rollouts']):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args['render']:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}

        if args['out_file'] is not None:
            pickle.dump(expert_data, open(args['out_file'], 'wb'))
Example #17
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                observations.append(obs)
                actions.append(action)
                # obs: object, an environment-specific object representing the observation of the environment
                # r: float, reward obtained from action
                # done: boolean, whether it's time to reset the environment
                # _: the info dict with diagnostic information (unused here)
                obs, r, done, _ = env.step(action) 
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}
Example #18
def label_observations(observations, policy_fn):
    with tf.Session():
        tf_util.initialize()

        actions = []

        for obs in observations:
            action = policy_fn(obs[None, :])
            actions.append(action)

    return np.array(actions)
Example #19
def load_model(envname):
    env = gym.make(envname)
    # Gather expert experiences
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)):
        with tf.device('/gpu:0'):
            tf_util.initialize()
            max_steps = max_timesteps or env.spec.timestep_limit  # max_timesteps is assumed to be a module-level global
            obs = env.reset()
            exp_action = expert(obs[None, :])[0]  # expert() is the globally loaded expert policy function
            model = build_model([obs], [exp_action])
    return model
Example #20
def view_expert(policy, data_path):

    policy_fn = policy
    their_data_path = data_path

    with tf.Session():
        tf_util.initialize()

        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        steps_numbers = []

        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])

                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break

            steps_numbers.append(steps)
            returns.append(totalr)

        observations = np.array(observations)
        actions = np.array(actions)

        global observations_shape, actions_shape
        observations_shape = observations.shape[1]
        actions_shape = actions.shape[2]
        expert_data = {
            'observations': observations,
            'actions': actions,
            'returns': np.array(returns),
            'steps': np.array(steps_numbers)
        }
        # print('expert_data', expert_data)
    pickle.dump(expert_data, open(their_data_path, 'wb'))
Example #21
def run_exp_on_ours(env_name, obs, render=False):
    with tf.Session():
        tf_util.initialize()
        actions = []
        policy_fn = load_policy_fn(env_name)
        print("Running expert policy on our observations")

        for ob in tqdm.tqdm(obs):
            action = policy_fn(ob[None, :])
            actions.append(action)
        return actions
Example #22
def main():
    # Load the expert policy. ['GaussianPolicy', 'nonlin_type']
    expert_policy_file = 'experts/Humanoid-v2.pkl'

    envname = 'Humanoid-v2'

    num_rollouts = 20
    max_timesteps = None
    render = True

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        # get data. ['observations', 'actions']
        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}
        # storing data
        with open(os.path.join('expert_data', envname + '.pkl'), 'wb') as f:
            pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)
Example #23
def test_model(behavior_clone,
               envname,
               render=True,
               max_timesteps=1000,
               num_rollouts=20,
               get_expert_data=False):
    with tf.Session():
        tf_util.initialize()
        behavior_clone.model = load_model(behavior_clone.get_file_name())
        return helper(behavior_clone.predict, envname, render, max_timesteps,
                      num_rollouts, get_expert_data)
Example #24
File: eval.py  Project: Theling/OptionGAN
def one_rollout(sess, env, file_):

    max_steps = env.spec.timestep_limit
    print(max_steps)
    policy = GaussianMLPPolicy(env,
                               hidden_sizes=args.policy_size,
                               activation=tf.nn.tanh)
    tf_util.initialize()
    policy_params = joblib.load(file_)
    # print([x.name for x in policy.get_params()])
    policy.set_param_values(sess, policy_params)

    ret = []

    policy_fn = lambda x: policy.act(x, sess, eval=True)[0]

    for i in range(num_rollouts):
        print('iter', i)
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        returns = []
        rewards = []
        observations = []
        actions = []
        while not done:
            action = policy_fn(obs)
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            # print(done)
            rewards.append(r)
            totalr += r
            steps += 1
            if True:
                env.render()
            if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
            if steps >= max_steps:
                break
        returns.append(totalr)
        ret.append({
            'observations': np.array(observations),
            'actions': np.array(actions),
            'rewards': np.array(rewards),
            'mean_return': np.mean(returns),
            'std_return': np.std(returns)
        })
    returns = [ele['mean_return'] for ele in ret]
    print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))
    return ret
Example #25
def gather_expert_data(expert_policy_file, envname, render, num_rollouts):

    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(envname)
        max_steps = env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        steps_numbers = []

        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                # if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break

            steps_numbers.append(steps)
            returns.append(totalr)

        # print('returns', returns)
        # print('mean return', np.mean(returns))
        # print('std of return', np.std(returns))

        # collect the expert data here
        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions),
            'returns': np.array(returns),
            'steps': np.array(steps_numbers)
        }

        return expert_data, np.mean(returns), np.std(returns)
Example #26
def generate_expert_data(envname,
                         max_timesteps,
                         expert_policy_file,
                         num_rollouts,
                         save=True):
    with tf.Session():
        tf_util.initialize()
        import gym
        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        print('loading and building expert policy')
        policy_fn = load_policy.load_policy(expert_policy_file)
        print('loaded and built')

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                #print("action", action)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

    #print('returns', returns)
    #print('mean return', np.mean(returns))
    #print('std of return', np.std(returns))

    expert_data = {
        'observations': np.array(observations),
        'actions': np.array(actions),
        'returns': np.array(returns)
    }

    if save:
        with open("experts/" + envname + '.meta', 'wb') as f:
            pickle.dump(expert_data, f)

    return expert_data
Example #27
def generate_rollout_data(expert_policy_file, env_name, num_rollouts, render, output_dir=None, save=False, max_timesteps=None):
    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        env = gym.make(env_name)
        max_steps = max_timesteps or env.spec.timestep_limit

        if save:
            expert_results_dir = os.path.join(os.getcwd(), 'results', env_name, 'expert')
            env = wrappers.Monitor(env, expert_results_dir, force=True)
        
        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions),
                       'mean_return': np.mean(returns),
                        'std_return': np.std(returns)}

        if output_dir is not None and output_dir != 'None':
            output_dir = os.path.join(os.getcwd(), output_dir)
            filename = '{}_data_{}_rollouts.pkl'.format(env_name, num_rollouts)
            with open(os.path.join(output_dir, filename), 'wb') as f:
                pickle.dump(expert_data, f)
Example #28
def run_expert(expert_policy_file,
               envname,
               in_jupyter=False,
               render=True,
               max_timesteps=None,
               num_rollouts=20):
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('---------------training ' + envname + '---------------')
    with tf.Session():
        tf_util.initialize()

        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            obs = env.reset()
            done = False
            totalr = 0.
            frames = []
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    if in_jupyter:
                        frames.append(env.render(mode='rgb_array'))
                    else:
                        env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            if render and in_jupyter:
                env.render(close=True)
                gym_util.display_frames_as_gif(frames)
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }
        save_expert_data(envname, expert_data, returns)
    print('--------------------------------------------\n')
Example #29
    def setup(self):
        """Setup environment and expert. Use as context."""
        import tf_util
        import load_tf_policy

        self.expert = load_tf_policy.load_policy(
            data_path("experts/" + self.envname + "-v1.pkl"))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config):
            tf_util.initialize()
            yield self
Example #30
def labeling(args, observations, policy_fn):
    print("Begin Labeling")
    with tf.Session():
        tf_util.initialize()
        actions = []
        for ob in observations:
            action = policy_fn(ob[None,:])
            actions.append(action)

    assert len(observations) == len(actions)
    new_dataset = tf.data.Dataset.from_tensor_slices((np.array(observations), np.array(actions).squeeze()))
    print('End Labeling')
    return new_dataset
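
A sketch of how the returned tf.data.Dataset might be folded into an existing (observation, action) dataset and batched for training; `dataset`, `student_observations`, and the batch size are assumptions.

new_data = labeling(args, student_observations, policy_fn)  # states visited by the cloned policy
dataset = dataset.concatenate(new_data).shuffle(buffer_size=10000).batch(64)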
Example #31
def main():
    """ Entry point for the program.
    """
    args = get_args()
    # Build inference graph
    # Build training graph
    with tf.Session() as sess:
        tf_util.initialize()
        expert_data = run_expert(args)
        next_data = gen_input_graph(expert_data)
        for i in range(10):
            print(sess.run(next_data)[0].shape)
            print(sess.run(next_data)[1].shape)
Example #32
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('expert_policy_data', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--num_epochs', type=int, default=50, help='Number of epochs for training')
    #need number of epoch
    args = parser.parse_args()
    
    print('loading expert policy data for training')
    with open(args.expert_policy_data, 'rb') as handle:
        expert_data = pickle.load(handle)
    
    #train the network
    torch.manual_seed(25)
    o_expert=expert_data['observations']
    (N,N_step,N_obs)=o_expert.shape
    a_expert=expert_data['actions']
    (N,N_step,_,N_action)=a_expert.shape
    import gym
    env = gym.make(args.envname)
    max_steps = args.max_timesteps or env.spec.timestep_limit
    net=CNN(N_obs, N_action)
    
    #todo:initilize network parameters
    net.apply(init_weights)

    import torch.optim as optim
    optimizer=optim.Adam(net.parameters(),lr=1e-3, weight_decay=5e-9)
    criterion=nn.MSELoss()
    loss_history=[]
    reward_mean_history=[]
    reward_std_history=[]
    for j in range(args.num_epochs):
        print("epoch %i"%j)
        net.train()
        (N,N_step,N_obs)=o_expert.shape
        (N,N_step,_,N_action)=a_expert.shape
        for k in range(max_steps):
            optimizer.zero_grad()
            index=k
            o=Variable(torch.from_numpy(o_expert[:,index,:]).reshape(N,1,N_obs))
            o=o.float()
            a_out=net.forward(o)
            a_label=torch.from_numpy(a_expert[:,index,:].reshape(N,N_action,1))
            loss=criterion(a_out.float(), a_label.float())
            loss.backward()
            optimizer.step()
        print("No DAGGER")
        print(loss/N)
        loss_history.append(loss/N)
        
        #test the network
        with tf.Session():
            tf_util.initialize()

            import gym
            env = gym.make(args.envname)
            max_steps = args.max_timesteps or env.spec.timestep_limit
            net.eval()

            r_new=[]
            for i in range (int(args.num_rollouts)//4):
                totalr=0
                obs=env.reset()
                done=False
                steps=0
                while not done:
                    obs=Variable(torch.Tensor(obs).reshape(1,1,N_obs))
                    action_new=net.forward(obs).detach().numpy()
                    obs,r,done,_=env.step(action_new.reshape(N_action))
                    totalr+=r
                    steps+=1
                    if steps >= max_steps:
                        break
                r_new.append(totalr)
            u=np.average(np.array(r_new))
            sigma=np.std(np.array(r_new))
            reward_mean_history.append(u)
            reward_std_history.append(sigma)
            print('current reward mean', u)
            print('current reward std', sigma)
    fig0=plt.figure(0)
    plt.plot(loss_history, '-o')
    plt.xlabel('iteration')
    plt.ylabel('loss')
    fig0.savefig('/Users/joker/imitation_learning/hopper.png')
    
    reward_mean_history=np.array(reward_mean_history)
    reward_std_history=np.array(reward_std_history)
    #print(reward_mean_history.shape)
    #print(reward_std_history.shape)
    print('mean:', reward_mean_history)
    print('std:', reward_std_history)
    
    fig1=plt.figure(1)
    plt.errorbar(np.arange(args.num_epochs),reward_mean_history, reward_std_history, marker="s", mfc='blue', mec='yellow')
    fig1.savefig('/Users/joker/imitation_learning/hopper_reward.png')
Example #33
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--num_epochs', type=int, default=5, help='Number of epochs for training')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            this_obs=[]
            this_act=[]
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                this_obs.append(obs)
                this_act.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)
            observations.append(this_obs)
            actions.append(this_act)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
        print( (np.array(observations)).shape)
        print( (np.array(actions)).shape)
        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}
    
    #train the network
    o_expert=expert_data['observations']
    (N,N_step,N_obs)=o_expert.shape
    a_expert=expert_data['actions']
    (N,N_step,_,N_action)=a_expert.shape
    net=CNN(N_obs, N_action)
    
    #todo:initilize network parameters
    net.apply(init_weights)

    import torch.optim as optim
    optimizer=optim.Adam(net.parameters(),lr=1e-3, weight_decay=5e-12)
    criterion=nn.MSELoss()
    loss_history=[]
    for j in range(args.num_epochs):
        print("epoch %i"%j)
        (N,N_step,N_obs)=o_expert.shape
        (N,N_step,_,N_action)=a_expert.shape
        for k in range(max_steps):
            optimizer.zero_grad()  # reset gradients before this update
            index=k
            o=Variable(torch.from_numpy(o_expert[:,index,:]).reshape(N,1,N_obs))
            o=o.float()
            a_out=net.forward(o)
            a_label=torch.from_numpy(a_expert[:,index,:].reshape(N,N_action,1))
            loss=criterion(a_out.float(), a_label.float())
            loss.backward()
            loss_history.append(loss)
            optimizer.step()
        print("before DAGGER")
        print(loss) 

        #implement dagger
        with tf.Session():
            tf_util.initialize()

            import gym
            env = gym.make(args.envname)
            max_steps = args.max_timesteps or env.spec.timestep_limit

            o_new_expert=[]
            a_new_expert=[]
            for i in range (int(args.num_rollouts)//2):
                this_o_new=[]
                this_a_new=[]
                obs=env.reset()
                done=False
                steps=0
                while not done:
                    action = policy_fn(obs[None, :])
                    this_o_new.append(obs)
                    this_a_new.append(action)
                    obs=Variable(torch.Tensor(obs).reshape(1,1,N_obs))
                    action_new=net.forward(obs).detach().numpy()
                    obs,r,done,_=env.step(action_new.reshape(N_action))
                    steps+=1
                    if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                    if steps >= max_steps:
                        break

                #if terminates early, we pad 0 to both observation and actions lists
                while steps<max_steps:
                    steps+=1
                    this_o_new.append(np.zeros(N_obs))
                    this_a_new.append(np.zeros((1,N_action)))
                o_new_expert.append(this_o_new)
                a_new_expert.append(this_a_new)
            o_new=np.array(o_new_expert)
            a_new=np.array(a_new_expert)
            o_expert=np.concatenate((o_expert,o_new), axis=0)
            a_expert=np.concatenate((a_expert,a_new), axis=0)
        
    plt.plot(loss_history, '-o')
    plt.xlabel('iteration')
    plt.ylabel('loss')
    plt.savefig('/Users/joker/imitation_learning/humanoid_dagger.png')
    plt.show()
Example #34
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('expert_policy_data', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--num_epochs', type=int, default=50, help='Number of epochs for training')
    #need number of epoch
    args = parser.parse_args()
    
    print('loading expert policy data for training')
    with open(args.expert_policy_data, 'rb') as handle:
        expert_data = pickle.load(handle)
    
    #train the network
    torch.manual_seed(25)
    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')
    o_expert=torch.Tensor(expert_data['observations'])
    (N,N_obs)=o_expert.size()
    a_expert=torch.Tensor(expert_data['actions'])
    (N,_,N_action)=a_expert.size()
    a_expert=a_expert.view(N,N_action)
    import gym
    env = gym.make(args.envname)
    max_steps = args.max_timesteps or env.spec.timestep_limit
    net=CNN(N_obs, N_action)
    
    #todo:initilize network parameters
    net.apply(init_weights)

    import torch.optim as optim
    optimizer=optim.Adam(net.parameters(),lr=1e-4, weight_decay=5e-9)
    criterion=nn.MSELoss()
    loss_history=[]
    reward_mean_history=[]
    reward_std_history=[]
    for j in range(args.num_epochs):
        print("epoch %i"%j)
        net.train()
        N=o_expert.shape[0]
        print(N)
        train_set=data_utils.TensorDataset(o_expert, a_expert)
        train_loader=data_utils.DataLoader(dataset=train_set,batch_size=BATCH_SIZE,shuffle=True)
        epoch_train_loss=0
        for i, (X_train, y_train) in enumerate(train_loader):
            net.zero_grad()
            y_pred=net.forward(X_train)
            loss=criterion(y_pred,y_train)
            loss.backward()
            optimizer.step()
            epoch_train_loss+=loss.item()/N
        print("Before DAGGER")
        print(epoch_train_loss)
        loss_history.append(epoch_train_loss)
        
        #implement dagger
        with tf.Session():
            tf_util.initialize()

            import gym
            env = gym.make(args.envname)
            max_steps = args.max_timesteps or env.spec.timestep_limit
            net.eval()

            o_new_expert=[]
            a_new_expert=[]
            reward_new=[]
            for i in range (int(args.num_rollouts)//4):
                obs=env.reset()
                done=False
                steps=0
                totalr=0
                while not done:
                    action = policy_fn(obs[None, :])
                    o_new_expert.append(obs)
                    a_new_expert.append(action)
                    obs=Variable(torch.Tensor(obs).reshape(1,N_obs))
                    action_new=net.forward(obs).detach().numpy()
                    obs,r,done,_=env.step(action_new.reshape(N_action))
                    totalr+=r
                    steps+=1
                    if steps >= max_steps:
                        break
                reward_new.append(totalr)
            o_new=torch.Tensor(np.array(o_new_expert))
            a_new=torch.Tensor(np.array(a_new_expert).reshape(-1,N_action))
            o_expert=torch.cat((o_expert,o_new), 0)
            a_expert=torch.cat((a_expert,a_new), 0)
            reward_new=np.array(reward_new)
            #print(reward_new.shape)
            u=np.average(reward_new)
            sigma=np.std(reward_new)
            print('current reward mean', u)
            print('current reward std', sigma)
            reward_mean_history.append(u)
            reward_std_history.append(sigma)
    fig0=plt.figure(0)
    plt.plot(loss_history, '-o')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    fig0.savefig('/Users/joker/imitation_learning/hopper_basic_DAGGER.png')
    
    reward_mean_history=np.array(reward_mean_history)
    reward_std_history=np.array(reward_std_history)
    #print(reward_mean_history.shape)
    #print(reward_std_history.shape)
    print('mean:', reward_mean_history)
    print('std:', reward_std_history)
    
    fig1=plt.figure(1)
    plt.errorbar(np.arange(args.num_epochs),reward_mean_history, reward_std_history, marker="s", mfc='blue', mec='yellow')
    fig1.savefig('/Users/joker/imitation_learning/hopper__basic_DAGGERreward.png')