def load_policy(path):
    with open(path, "rb") as f:
        checkpoint = pickle.load(f)

    variant = checkpoint["variant"]
    env_params = variant["environment_params"]["training"]
    alice_params = variant["alice"]
    bob_params = variant["bob"]
    num_skills = alice_params["algorithm_params"]["discriminator_params"][
        "num_skills"]

    # bob policy
    env = get_environment_from_params(env_params)
    bob_policy = get_policy_from_variant(bob_params, env)
    bob_policy.set_weights(checkpoint["policy_weights"]["bob"])
    bob_policy._deterministic = True

    # alice policy
    env._observation_space.spaces["diayn"] = gym.spaces.Box(
        low=np.repeat(0, num_skills),
        high=np.repeat(1, num_skills),
    )
    env.observation_keys += ("diayn", )

    alice_policy = get_policy_from_variant(alice_params, env)
    alice_policy.set_weights(checkpoint["policy_weights"]["alice"])
    alice_policy._deterministic = True

    return env, alice_policy, bob_policy, num_skills
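A minimal usage sketch for the helper above (the checkpoint path is a placeholder, and only the return values of load_policy are relied on):

import numpy as np

def demo_load_policy():
    # Placeholder path, not taken from the original project.
    env, alice_policy, bob_policy, num_skills = load_policy('/path/to/checkpoint.pkl')

    # Sample a random one-hot DIAYN skill in the format Alice's observation space expects.
    skill = np.zeros(num_skills)
    skill[np.random.randint(num_skills)] = 1.0

    print('num_skills:', num_skills)
    print('observation keys:', env.observation_keys)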
Example #2
    def _build(self):
        variant = copy.deepcopy(self._variant)
        print(variant)
        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(variant,
                                                   training_environment)
        policy = self.policy = get_policy_from_variant(variant,
                                                       training_environment,
                                                       Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session)

        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example #3
def simulate_policy(args):
    gpu_options = tf.GPUOptions(allow_growth=True)
    session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.keras.backend.set_session(session)

    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.json')
    with open(variant_path, 'r') as f:
        variant = json.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    env = picklable['env']
    policy = (get_policy_from_variant(variant, env))
    policy.set_weights(picklable['policy_weights'])

    with policy.set_deterministic(args.deterministic):
        paths = my_rollouts(env=env,
                            policy=policy,
                            path_length=args.max_path_length,
                            n_paths=args.num_rollouts,
                            render_mode=args.render_mode)

    return paths
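A hedged sketch of the argument parser a simulate_policy script like the one above typically pairs with; the flag names and defaults are assumptions inferred from the attributes read in the function (checkpoint_path, max_path_length, num_rollouts, render_mode, deterministic), not the original project's CLI:

import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('checkpoint_path', type=str,
                        help='Path to a checkpoint directory containing checkpoint.pkl.')
    parser.add_argument('--max-path-length', type=int, default=1000)
    parser.add_argument('--num-rollouts', type=int, default=10)
    parser.add_argument('--render-mode', type=str, default='human')
    parser.add_argument('--deterministic', type=lambda x: x.lower() == 'true',
                        default=True)
    return parser.parse_args()

# paths = simulate_policy(parse_args())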
Example #4
def init_policy():
    session = tf.keras.backend.get_session()
    checkpoint_path = CHECKPOINT_PATH.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (variant['environment_params']['evaluation']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])
    environment_params['n_parallel_envs'] = 1
    evaluation_environment = get_environment_from_params(environment_params)
    policy = get_policy_from_variant(variant, evaluation_environment)
    policy.set_weights(picklable['policy_weights'])

    Qs = get_Q_function_from_variant(variant, evaluation_environment)
    for i, Q in enumerate(Qs):
        Qs[i].load_weights(os.path.join(checkpoint_path, 'Qs_{}'.format(i)))

    return policy, Qs
Example #5
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.json')
    with open(variant_path, 'r') as f:
        variant = json.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    env = picklable['env']
    policy = (get_policy_from_variant(variant, env, Qs=[None]))
    policy.set_weights(picklable['policy_weights'])

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(env,
                         policy,
                         path_length=args.max_path_length,
                         n_paths=args.num_rollouts,
                         render_mode=args.render_mode)

    if args.render_mode != 'human':
        from pprint import pprint
        import pdb
        pdb.set_trace()
        pass

    return paths
Example #6
File: main.py, Project: Haffon/synergyDRL
    def _build(self):
        variant = copy.deepcopy(self._variant)

        env = self.env = get_environment_from_variant(variant)
        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, env))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(variant, env)
        #policy = self.policy = get_policy_from_variant(variant, env, Qs)
        policy = self.policy = get_policy_from_variant(variant, env)

        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', env))

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            env=self.env,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session)

        print([
            x for x in tf.get_default_graph().get_operations()
            if x.type == "Placeholder"
        ])
        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example #7
    def _build(self):
        variant = copy.deepcopy(self._variant)

        env = self.env = get_environment_from_variant(variant)
        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, env))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(variant, env)
        policy = self.policy = get_policy_from_variant(variant, env, Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', env))

        self.algorithm = get_algorithm_from_variant(
            variant=variant,
            env=env,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session,
        )

        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example #8
    def _restore(self, checkpoint_dir):
        assert isinstance(checkpoint_dir, str), checkpoint_dir

        checkpoint_dir = checkpoint_dir.rstrip('/')

        with self._session.as_default():
            pickle_path = self._pickle_path(checkpoint_dir)
            with open(pickle_path, 'rb') as f:
                pickleable = pickle.load(f)

        variant_diff = DeepDiff(self._variant, pickleable['variant'])

        if variant_diff:
            print("Your current variant is different from the checkpointed"
                  " variable. Please make sure that the differences are"
                  " expected. Differences:")
            pprint(variant_diff)

            if not strtobool(
                    input("Continue despite the variant differences?\n")):
                sys.exit(0)

        env = self.env = pickleable['env']
        replay_pool = self.replay_pool = pickleable['replay_pool']
        sampler = self.sampler = pickleable['sampler']
        Qs = self.Qs = pickleable['Qs']
        # policy = self.policy = pickleable['policy']
        policy = self.policy = (get_policy_from_variant(
            self._variant, env, Qs))
        self.policy.set_weights(pickleable['policy_weights'])
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', env))

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            env=self.env,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session)
        self.algorithm.__setstate__(pickleable['algorithm'].__getstate__())

        tf_checkpoint = self._get_tf_checkpoint()
        status = tf_checkpoint.restore(
            tf.train.latest_checkpoint(
                os.path.split(self._tf_checkpoint_prefix(checkpoint_dir))[0]))

        status.assert_consumed().run_restore_ops(self._session)
        initialize_tf_variables(self._session, only_uninitialized=True)

        # TODO(hartikainen): target Qs should either be checkpointed
        # or pickled.
        for Q, Q_target in zip(self.algorithm._Qs, self.algorithm._Q_targets):
            Q_target.set_weights(Q.get_weights())

        self._built = True
Example #9
    def _restore(self, checkpoint_dir):
        assert isinstance(checkpoint_dir, str), checkpoint_dir

        checkpoint_dir = checkpoint_dir.rstrip('/')

        with self._session.as_default():
            pickle_path = self._pickle_path(checkpoint_dir)
            with open(pickle_path, 'rb') as f:
                picklable = pickle.load(f)

        training_environment = self.training_environment = picklable[
            'training_environment']
        evaluation_environment = self.evaluation_environment = picklable[
            'evaluation_environment']

        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            self._variant, training_environment))

        if self._variant['run_params'].get('checkpoint_replay_pool', False):
            self._restore_replay_pool(checkpoint_dir)

        sampler = self.sampler = picklable['sampler']
        Qs = self.Qs = get_Q_function_from_variant(self._variant,
                                                   training_environment)
        self._restore_value_functions(checkpoint_dir)
        policy = self.policy = (get_policy_from_variant(
            self._variant, training_environment))
        self.policy.set_weights(picklable['policy_weights'])
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy_from_params(self._variant['exploration_policy_params'],
                                   training_environment))

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session)
        self.algorithm.__setstate__(picklable['algorithm'].__getstate__())

        tf_checkpoint = self._get_tf_checkpoint()
        status = tf_checkpoint.restore(
            tf.train.latest_checkpoint(
                os.path.split(self._tf_checkpoint_prefix(checkpoint_dir))[0]))

        status.assert_consumed().run_restore_ops(self._session)
        initialize_tf_variables(self._session, only_uninitialized=True)

        # TODO(hartikainen): target Qs should either be checkpointed or pickled.
        for Q, Q_target in zip(self.algorithm._Qs, self.algorithm._Q_targets):
            Q_target.set_weights(Q.get_weights())

        self._built = True
Example #10
def run_experiment(variant, reporter):
    env = get_environment('gym', 'MultiGoal', 'Default', {
        'actuation_cost_coeff': 1,
        'distance_cost_coeff': 0.1,
        'goal_reward': 1,
        'init_sigma': 0.1,
    })

    pool = SimpleReplayPool(
        observation_space=env.observation_space,
        action_space=env.action_space,
        max_size=1e6)

    sampler = SimpleSampler(
        max_path_length=30, min_pool_size=100, batch_size=64)

    Qs = get_Q_function_from_variant(variant, env)
    policy = get_policy_from_variant(variant, env, Qs)
    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

    algorithm = SAC(
        sampler=sampler,
        reparameterize=True,
        epoch_length=100,
        n_epochs=1000,
        n_train_repeat=1,
        eval_render_mode=None,
        eval_n_episodes=10,
        eval_deterministic=False,

        env=env,
        policy=policy,
        initial_exploration_policy=None,
        pool=pool,
        Qs=Qs,
        plotter=plotter,

        lr=3e-4,
        target_entropy=-2.0,
        discount=0.99,
        tau=1e-4,

        save_full_state=True,
    )

    initialize_tf_variables(algorithm._session, only_uninitialized=True)

    for train_result in algorithm.train():
        reporter(**train_result)
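A hedged sketch of driving run_experiment above without Ray Tune: any callable that accepts the per-epoch diagnostics as keyword arguments can stand in for the reporter, assuming the variant passed in carries the Q_params/policy_params entries the *_from_variant helpers expect:

def print_reporter(**diagnostics):
    # Stand-in for Tune's reporter; just echo whatever the algorithm yields.
    print(diagnostics)

# run_experiment(variant, print_reporter)  # construction of `variant` omitted here.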
Example #11
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (variant['environment_params']['evaluation']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    evaluation_environment.seed(variant['run_params']['seed'])

    if args.record_video:
        video_dir = os.path.join(experiment_path, 'test-video')
        evaluation_environment._env = wrappers.Monitor(
            evaluation_environment._env, video_dir, force=True)

    policy = (get_policy_from_variant(variant, evaluation_environment))
    policy.set_weights(picklable['policy_weights'])

    render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_kwargs=render_kwargs)

        if not args.record_video:
            evaluation_metrics = evaluate_rollouts(paths,
                                                   evaluation_environment)
            evaluation_file_path = os.path.join(experiment_path,
                                                'final_eval.csv')
            with open(evaluation_file_path, 'w') as f:
                w = csv.DictWriter(f, evaluation_metrics.keys())
                w.writeheader()
                w.writerow(evaluation_metrics)

    if args.render_kwargs.get('mode') == 'rgb_array':
        fps = 1 // getattr(evaluation_environment, 'dt', 1 / 30)
        for i, path in enumerate(paths):
            video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
            video_save_path = os.path.join(video_save_dir, f'episode_{i}.mp4')
            save_video(path['images'], video_save_path, fps=fps)

    return paths
Example #12
    def _build(self):
        """
        called by tune to build algorithm 
        """
        variant = copy.deepcopy(self._variant)

        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        mjc_model_environment = self.mjc_model_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(variant,
                                                   training_environment)
        policy = self.policy = get_policy_from_variant(variant,
                                                       training_environment,
                                                       Qs, self._session)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        #### get termination function
        domain = environment_params['training']['domain']
        static_fns = mbpo.static[domain.lower()]
        ####

        #### build algorithm
        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            mjc_model_environment=mjc_model_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            static_fns=static_fns,
            sampler=sampler,
            session=self._session)

        initialize_tf_variables(self._session, only_uninitialized=True)

        # add graph since ray doesn't seem to automatically add that
        graph_writer = tf.summary.FileWriter(self.logdir, self._session.graph)
        graph_writer.flush()
        graph_writer.close()

        #### finalize graph
        # tf.get_default_graph().finalize() ### good for debugging, but interferes with Qs on SAC
        self._built = True
Example #13
def load_policy_and_environment(picklable, variant):
    environment_params = (variant['environment_params']['evaluation']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])

    environment = get_environment_from_params(environment_params)

    policy = get_policy_from_variant(variant, environment)
    policy.set_weights(picklable['policy_weights'])

    return policy, environment
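A small companion sketch that loads (picklable, variant) the way the surrounding examples do, so it can be fed straight into load_policy_and_environment above; the params.json / checkpoint.pkl layout is taken from the other examples on this page:

import json
import os
import pickle

def load_checkpoint(checkpoint_path):
    checkpoint_path = checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    with open(os.path.join(experiment_path, 'params.json'), 'r') as f:
        variant = json.load(f)

    with open(os.path.join(checkpoint_path, 'checkpoint.pkl'), 'rb') as f:
        picklable = pickle.load(f)

    return picklable, variant

# picklable, variant = load_checkpoint('/path/to/checkpoint_123')
# policy, environment = load_policy_and_environment(picklable, variant)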
Example #14
    def _build(self):
        variant = copy.deepcopy(self._variant)

        training_environment = self.training_environment = (
            get_goal_example_environment_from_variant(variant))
        evaluation_environment = self.evaluation_environment = (
            get_goal_example_environment_from_variant(variant))
        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(variant)
        # Build the Q-function networks (Dense): inputs [state, action], output of size 1
        Qs = self.Qs = get_Q_function_from_variant(variant,
                                                   training_environment)
        policy = self.policy = get_policy_from_variant(variant,
                                                       training_environment,
                                                       Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        algorithm_kwargs = {
            'variant': self._variant,
            'training_environment': self.training_environment,
            'evaluation_environment': self.evaluation_environment,
            'policy': policy,
            'initial_exploration_policy': initial_exploration_policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler,
            'session': self._session,
        }

        if self._variant['algorithm_params']['type'] in [
                'SACClassifier', 'RAQ', 'VICE', 'VICEGAN', 'VICERAQ'
        ]:
            reward_classifier = self.reward_classifier \
                = get_reward_classifier_from_variant(self._variant, training_environment)
            algorithm_kwargs['classifier'] = reward_classifier

            goal_examples_train, goal_examples_validation = \
                get_goal_example_from_variant(variant)
            algorithm_kwargs['goal_examples'] = goal_examples_train
            algorithm_kwargs['goal_examples_validation'] = \
                goal_examples_validation

        self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)

        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example #15
def run_experiment(variant, reporter):
    training_environment = (
        get_environment('gym', 'MultiGoal', 'Default-v0', {
            'actuation_cost_coeff': 30,
            'distance_cost_coeff': 1,
            'goal_reward': 10,
            'init_sigma': 0.1,
        }))
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(
        environment=training_environment,
        max_size=1e6)

    sampler = SimpleSampler(max_path_length=30)

    Qs = get_Q_function_from_variant(variant, training_environment)
    policy = get_policy_from_variant(variant, training_environment)
    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

    algorithm = get_algorithm_from_variant(
        variant=variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        Qs=Qs,
        pool=pool,
        sampler=sampler,
        min_pool_size=100,
        batch_size=46,
        plotter=plotter,
    )

    initialize_tf_variables(algorithm._session, only_uninitialized=True)

    for train_result in algorithm.train():
        reporter(**train_result)
Example #16
def get_policy(checkpoint_path):
    checkpoint_path = checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.json')
    with open(variant_path, 'r') as f:
        variant = json.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    policy = (get_policy_from_variant(variant, evaluation_environment, Qs=[None]))
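    # NOTE: unlike most examples on this page, no checkpointed weights are
    # restored here; the returned policy keeps its freshly initialized parameters.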
    training_environment = get_environment_from_params_custom(environment_params)

    return policy, training_environment
Example #17
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.json')
    with open(variant_path, 'r') as f:
        variant = json.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (variant['environment_params']['evaluation']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    policy = (get_policy_from_variant(variant,
                                      evaluation_environment,
                                      Qs=[None]))
    policy.set_weights(picklable['policy_weights'])

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_mode=args.render_mode)

    #### print rewards
    rewards = [path['rewards'].sum() for path in paths]
    print('Rewards: {}'.format(rewards))
    print('Mean: {}'.format(np.mean(rewards)))
    ####

    if args.render_mode != 'human':
        from pprint import pprint
        import pdb
        pdb.set_trace()
        pass

    return paths
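A small follow-up sketch that aggregates return statistics across the collected paths, using only the path['rewards'] arrays the reward printout above already relies on:

def summarize_returns(paths):
    # Pure-numpy aggregation over the rollouts returned by simulate_policy above.
    returns = np.array([path['rewards'].sum() for path in paths])
    print('return mean/std/min/max: {:.2f}/{:.2f}/{:.2f}/{:.2f}'.format(
        returns.mean(), returns.std(), returns.min(), returns.max()))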
Example #18
    def _build(self):
        variant = copy.deepcopy(self._variant)

        environment_params = variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params
            else training_environment)
        
        seed = variant['run_params']['seed']
        
        training_environment.seed(seed)
        
        # Set a different seed for the evaluation env
        # to ensure the policy is not just memorizing action sequences for seen initial states
        evaluation_environment.seed(seed + 10)

        replay_pool = self.replay_pool = (
            get_replay_pool_from_variant(variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(
            variant, training_environment)
        policy = self.policy = get_policy_from_variant(
            variant, training_environment, Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session)

        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example #19
    def build(self):
        environment_params = self.variant['environment_params']
        training_environment = self.training_environment = (
            get_environment_from_params(environment_params['training']))
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation'])
            if 'evaluation' in environment_params else training_environment)

        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            self.variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(self.variant)
        Qs = self.Qs = get_Q_function_from_variant(self.variant,
                                                   training_environment)
        policy = self.policy = get_policy_from_variant(self.variant,
                                                       training_environment,
                                                       Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        #### get termination function
        domain = environment_params['training']['domain']
        static_fns = static[domain.lower()]
        ####

        log_path = './log/%s' % (self.variant['algorithm_params']['domain'])
        if (not os.path.exists(log_path)):
            os.makedirs(log_path)

        self.algorithm = get_algorithm_from_variant(
            variant=self.variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            static_fns=static_fns,
            sampler=sampler,
            session=self._session,
            log_file='./log/%s/%d.log' %
            (self.variant['algorithm_params']['domain'], time.time()))

        initialize_tf_variables(self._session, only_uninitialized=True)
Example #20
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    policy = (
        get_policy_from_variant(variant, evaluation_environment))
    policy.set_weights(picklable['policy_weights'])

    render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_kwargs=render_kwargs)

    if args.render_kwargs.get('mode') == 'rgb_array':
        for i, path in enumerate(paths):
            video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
            video_save_path = os.path.join(video_save_dir, f'episode_{i}.avi')
            save_video(path['images'], video_save_path)

    return paths
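A hedged sketch of how args.render_kwargs is commonly supplied on the command line for scripts like the one above; the flag name and JSON encoding are assumptions, not the original project's interface:

import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument('--render-kwargs', type=json.loads, default='{}',
                    help='JSON dict, e.g. \'{"mode": "rgb_array", "width": 480}\'')
# parse_args() then hands simulate_policy a plain dict under args.render_kwargs.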
Example #21
    def _build(self):
        variant = copy.deepcopy(self._variant)
        print(variant.keys())
        env = self.env = get_environment_from_params(
            variant['environment_params']['training'])
        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, env))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(variant, env)
        policy = self.policy = get_policy_from_variant(variant, env, Qs)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', env))

        algorithm_kwargs = {
            'variant': self._variant,
            'env': self.env,
            'policy': policy,
            'initial_exploration_policy': initial_exploration_policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler,
            'session': self._session,
        }

        if self._variant['algorithm_params']['type'] in CLASSIFIER_RL_ALGS:
            reward_classifier = self.reward_classifier \
                = get_reward_classifier_from_variant(self._variant, env)
            algorithm_kwargs['classifier'] = reward_classifier

            goal_examples_train, goal_examples_validation = \
                get_goal_example_from_variant(variant)
            algorithm_kwargs['goal_examples'] = goal_examples_train
            algorithm_kwargs['goal_examples_validation'] = \
                goal_examples_validation

        self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)

        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example #22
def simulate_policy(args):
    gpu_options = tf.GPUOptions(allow_growth=True)
    session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.keras.backend.set_session(session)
 
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.json')
    with open(variant_path, 'r') as f:
        variant = json.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    env = picklable['env']
    policy = (
        get_policy_from_variant(variant, env))
    policy.set_weights(picklable['policy_weights'])
    #env = wrappers.Monitor(env, '/home/jzchai/PycharmProjects/softlearning/examples/plotting/Synergy', force=True)

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(env=env,
                         policy=policy,
                         path_length=args.max_path_length,
                         n_paths=args.num_rollouts,
                         render_mode=args.render_mode)

    if args.render_mode != 'human':
        from pprint import pprint; import pdb; pdb.set_trace()
        pass

    return paths
Example #23
File: main.py, Project: xionghuichen/mopo
def main():
    import sys
    example_args = get_parser().parse_args(sys.argv[1:])

    variant_spec = get_variant_spec(example_args)
    command_line_args = example_args
    print('variant spec: {}'.format(variant_spec))
    params = variant_spec.get('algorithm_params')
    local_dir = os.path.join(params.get('log_dir'), params.get('domain'))

    resources_per_trial = _normalize_trial_resources(
        command_line_args.resources_per_trial, command_line_args.trial_cpus,
        command_line_args.trial_gpus, command_line_args.trial_extra_cpus,
        command_line_args.trial_extra_gpus)
    experiment_id = params.get('exp_name')

    #### add pool_load_max_size to experiment_id
    if 'pool_load_max_size' in variant_spec['algorithm_params']['kwargs']:
        max_size = variant_spec['algorithm_params']['kwargs'][
            'pool_load_max_size']
        experiment_id = '{}_{}e3'.format(experiment_id, int(max_size / 1000))
    ####

    variant_spec = add_command_line_args_to_variant_spec(
        variant_spec, command_line_args)

    if command_line_args.video_save_frequency is not None:
        assert 'algorithm_params' in variant_spec
        variant_spec['algorithm_params']['kwargs']['video_save_frequency'] = (
            command_line_args.video_save_frequency)

    variant = variant_spec
    # init
    set_seed(variant['run_params']['seed'])
    gpu_options = tf.GPUOptions(allow_growth=True)
    session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.keras.backend.set_session(session)

    # build
    variant = copy.deepcopy(variant)

    tester.set_hyper_param(**variant)
    tester.add_record_param(['run_params.seed', 'info'])
    tester.configure(task_name='policy_learn',
                     private_config_path=os.path.join(get_package_path(),
                                                      'rla_config.yaml'),
                     run_file='main.py',
                     log_root=get_package_path())
    tester.log_files_gen()
    tester.print_args()

    environment_params = variant['environment_params']
    training_environment = (get_environment_from_params(
        environment_params['training']))
    evaluation_environment = (get_environment_from_params(
        environment_params['evaluation'](variant)) if 'evaluation'
                              in environment_params else training_environment)

    replay_pool = (get_replay_pool_from_variant(variant, training_environment))
    sampler = get_sampler_from_variant(variant)
    Qs = get_Q_function_from_variant(variant, training_environment)
    policy = get_policy_from_variant(variant, training_environment, Qs)
    initial_exploration_policy = (get_policy('UniformPolicy',
                                             training_environment))

    #### get termination function
    domain = environment_params['training']['domain']
    static_fns = mopo.static[domain.lower()]
    ####
    print("[ DEBUG ] KWARGS: {}".format(variant['algorithm_params']['kwargs']))

    algorithm = get_algorithm_from_variant(
        variant=variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        static_fns=static_fns,
        sampler=sampler,
        session=session)
    print('[ DEBUG ] finish construct model, start training')
    # train
    list(algorithm.train())
Example #24
    def _build(self):
        variant = copy.deepcopy(self._variant)

        #training_environment = self.training_environment = (
        #    get_goal_example_environment_from_variant(
        #        variant['task'], gym_adapter=False))

        training_environment = self.training_environment = (GymAdapter(
            domain=variant['domain'],
            task=variant['task'],
            **variant['env_params']))

        #evaluation_environment = self.evaluation_environment = (
        #    get_goal_example_environment_from_variant(
        #        variant['task_evaluation'], gym_adapter=False))
        evaluation_environment = self.evaluation_environment = (GymAdapter(
            domain=variant['domain'],
            task=variant['task_evaluation'],
            **variant['env_params']))

        # training_environment = self.training_environment = (
        #     flatten_multiworld_env(self.training_environment))
        # evaluation_environment = self.evaluation_environment = (
        #     flatten_multiworld_env(self.evaluation_environment))
        #training_environment = self.training_environment = (
        #        GymAdapter(env=training_environment))
        #evaluation_environment = self.evaluation_environment = (
        #        GymAdapter(env=evaluation_environment))

        # make sure this is her replay pool
        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            variant, training_environment))
        sampler = self.sampler = get_sampler_from_variant(variant)
        Qs = self.Qs = get_Q_function_from_variant(variant,
                                                   training_environment)
        policy = self.policy = get_policy_from_variant(variant,
                                                       training_environment)
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy_from_params(variant['exploration_policy_params'],
                                   training_environment))

        algorithm_kwargs = {
            'variant': self._variant,
            'training_environment': self.training_environment,
            'evaluation_environment': self.evaluation_environment,
            'policy': policy,
            'initial_exploration_policy': initial_exploration_policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler,
            'session': self._session,
        }

        if self._variant['algorithm_params']['type'] in [
                'VICEGoalConditioned', 'VICEGANGoalConditioned'
        ]:
            reward_classifier = self.reward_classifier = (
                get_reward_classifier_from_variant(self._variant,
                                                   training_environment))
            algorithm_kwargs['classifier'] = reward_classifier

            # goal_examples_train, goal_examples_validation = \
            #     get_goal_example_from_variant(variant)
            algorithm_kwargs['goal_examples'] = np.empty((1, 1))
            algorithm_kwargs['goal_examples_validation'] = np.empty((1, 1))

        # RND
        if variant['algorithm_params']['rnd_params']:
            from softlearning.rnd.utils import get_rnd_networks_from_variant
            rnd_networks = get_rnd_networks_from_variant(
                variant, training_environment)
        else:
            rnd_networks = ()
        algorithm_kwargs['rnd_networks'] = rnd_networks

        self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)

        initialize_tf_variables(self._session, only_uninitialized=True)

        self._built = True
Example #25
    def restore_mbpo(self, checkpoint_dir):
        checkpoint_dir = checkpoint_dir.rstrip('/')

        with self._session.as_default():
            pickle_path = self._pickle_path(checkpoint_dir)
            with open(pickle_path, 'rb') as f:
                picklable = pickle.load(f)

        training_environment = self.training_environment = picklable[
            'training_environment']
        evaluation_environment = self.evaluation_environment = picklable[
            'evaluation_environment']
        mjc_model_environment = self.mjc_model_environment = picklable.get(
            'mjc_model_environment', None)

        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            self._variant, training_environment))

        if self._variant['run_params'].get('checkpoint_replay_pool', False):
            self._restore_replay_pool(checkpoint_dir)

        sampler = self.sampler = picklable['sampler']
        Qs = self.Qs = picklable['Qs']
        # policy = self.policy = picklable['policy']
        policy = self.policy = (get_policy_from_variant(
            self._variant, training_environment, Qs))
        self.policy.set_weights(picklable['policy_weights'])
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        #### get termination function
        environment_params = self._variant['environment_params']
        domain = environment_params['training']['domain']
        static_fns = mbpo.static[domain.lower()]
        ####

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            mjc_model_environment=mjc_model_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            static_fns=static_fns,
            sampler=sampler,
            session=self._session)
        self.algorithm.__setstate__(picklable['algorithm'].__getstate__())

        tf_checkpoint = self._get_tf_checkpoint()
        status = tf_checkpoint.restore(
            tf.train.latest_checkpoint(
                os.path.split(self._tf_checkpoint_prefix(checkpoint_dir))[0]))

        status.assert_consumed().run_restore_ops(self._session)
        initialize_tf_variables(self._session, only_uninitialized=True)

        # TODO(hartikainen): target Qs should either be checkpointed or pickled.
        for Q, Q_target in zip(self.algorithm._Qs, self.algorithm._Q_targets):
            Q_target.set_weights(Q.get_weights())
Example #26
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    import ipdb
    ipdb.set_trace()
    environment_params = (variant['environment_params']['evaluation']
                          if 'evaluation' in variant['environment_params'] else
                          variant['environment_params']['training'])
    if args.use_state_estimator:
        environment_params['kwargs'].update({
            'pixel_wrapper_kwargs': {
                'pixels_only': False,
                'normalize': False,
                'render_kwargs': {
                    'width': 32,
                    'height': 32,
                    'camera_id': -1,
                }
            },
            'camera_settings': {
                'azimuth': 180,
                'distance': 0.35,
                'elevation': -55,
                'lookat': (0, 0, 0.03),
            },
        })
        # obs_keys = environment_params['kwargs'].pop('observation_keys')
        # non_object_obs_keys = [obs_key for obs_key in obs_keys if 'object' not in obs_key]
        # non_object_obs_keys.append('pixels')
        # environment_params['kwargs']['observation_keys'] = tuple(non_object_obs_keys)

    # if args.render_mode == 'human':
    #     if 'has_renderer' in environment_params['kwargs'].keys():
    #         environment_params['kwargs']['has_renderer'] = True

    # variant['environment_params']['evaluation']['task'] = 'TurnFreeValve3ResetFree-v0'
    # variant['environment_params']['evaluation']['kwargs']['reset_from_corners'] = True
    #     'reward_keys': (
    #         'object_to_target_position_distance_cost',
    #         'object_to_target_orientation_distance_cost',
    #     ),
    #     'swap_goal_upon_completion': False,
    # }
    evaluation_environment = get_environment_from_params(environment_params)

    policy = (get_policy_from_variant(variant, evaluation_environment))
    policy.set_weights(picklable['policy_weights'])
    dump_path = os.path.join(checkpoint_path, 'policy_params.pkl')
    with open(dump_path, 'wb') as f:
        pickle.dump(picklable['policy_weights'], f)

    render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

    from softlearning.preprocessors.utils import get_state_estimator_preprocessor
    state_estimator = get_state_estimator_preprocessor(
        state_estimator_path=
        '/home/justinvyu/dev/softlearning-vice/softlearning/models/state_estimators/state_estimator_fixed_antialias.h5',
        num_hidden_units=256,
        num_hidden_layers=2)
    sampler_kwargs = {
        'state_estimator': state_estimator,
        'replace_state': True,
    }

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_kwargs=render_kwargs,
                         sampler_kwargs=sampler_kwargs)

    if args.render_kwargs.get('mode') == 'rgb_array':
        fps = 2 // getattr(evaluation_environment, 'dt', 1 / 30)
        for i, path in enumerate(paths):
            video_save_dir = args.checkpoint_path
            # video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
            video_save_path = os.path.join(video_save_dir, f'episode_{i}.mp4')
            save_video(path['images'], video_save_path, fps=fps)

    return paths
Example #27
def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    checkpoint_paths = [
        checkpoint_dir for checkpoint_dir in sorted(
            glob.iglob(os.path.join(experiment_path, 'checkpoint_*')),
            key=lambda d: float(d.split("checkpoint_")[1]))
    ]

    dump_dir = os.path.join(experiment_path, 'evaluations/')
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)
    all_paths = []
    for checkpoint_dir in checkpoint_paths[::2]:

        with session.as_default():
            pickle_path = os.path.join(checkpoint_dir, 'checkpoint.pkl')
            with open(pickle_path, 'rb') as f:
                picklable = pickle.load(f)

        environment_params = (variant['environment_params']['evaluation']
                              if 'evaluation' in variant['environment_params']
                              else variant['environment_params']['training'])

        environment_params['kwargs']['device_path'] = '/dev/ttyUSB0'
        environment_params['kwargs']['camera_config'] = {
            'topic': '/kinect2_001144463747/qhd/image_color',
            'image_shape': (256, 256, 3)
        }
        environment_params['kwargs']['init_pos_range'] = list((np.array([
            0, -np.pi / 4, -np.pi / 2, -3 * np.pi / 4, -np.pi, np.pi /
            4, np.pi / 2, np.pi * 3 / 4
        ]) + (-75 * np.pi / 180)) % (2 * np.pi) - np.pi)
        environment_params['kwargs']['target_pos_range'] = [-75 * np.pi / 180]
        environment_params['kwargs']['cycle_inits'] = True

        evaluation_environment = get_environment_from_params(
            environment_params)

        policy = (get_policy_from_variant(variant, evaluation_environment))

        policy_weights = picklable['policy_weights']
        if variant['algorithm_params']['type'] in ['MultiSAC', 'MultiVICEGAN']:
            policy_weights = policy_weights[0]
        policy.set_weights(policy_weights)
        # dump_path = os.path.join(checkpoint_path, 'policy_params.pkl')
        # with open(dump_path, 'wb') as f:
        #     pickle.dump(picklable['policy_weights'], f)

        render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

        with policy.set_deterministic(args.deterministic):
            paths = rollouts(args.num_rollouts,
                             evaluation_environment,
                             policy,
                             path_length=args.max_path_length,
                             render_kwargs=render_kwargs)

        if render_kwargs.get('mode') == 'rgb_array':
            fps = 2 // getattr(evaluation_environment, 'dt', 1 / 30)
            for i, path in enumerate(paths):
                # video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
                video_save_path = os.path.join(checkpoint_dir,
                                               f'episode_{i}.mp4')

                save_video(path['images'], video_save_path, fps=fps)
        all_paths.append(paths)

    with open(os.path.join(dump_dir, 'evaluation_paths.pkl'), 'wb') as f:
        pickle.dump(all_paths, f)
    return paths
Example #28
    def _restore(self, checkpoint_dir):
        assert isinstance(checkpoint_dir, str), checkpoint_dir

        checkpoint_dir = checkpoint_dir.rstrip('/')

        with self._session.as_default():
            pickle_path = self._pickle_path(checkpoint_dir)
            with open(pickle_path, 'rb') as f:
                picklable = pickle.load(f)

        training_environment = self.training_environment = picklable[
            'training_environment']
        evaluation_environment = self.evaluation_environment = picklable[
            'evaluation_environment']

        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            self._variant, training_environment))

        if self._variant['run_params'].get('checkpoint_replay_pool', False):
            self._restore_replay_pool(checkpoint_dir)

        sampler = self.sampler = picklable['sampler']
        Qs = self.Qs = picklable['Qs']
        # policy = self.policy = picklable['policy']
        policy = self.policy = (get_policy_from_variant(
            self._variant, training_environment, Qs))
        self.policy.set_weights(picklable['policy_weights'])
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        algorithm_kwargs = {
            'variant': self._variant,
            'training_environment': self.training_environment,
            'evaluation_environment': self.evaluation_environment,
            'policy': policy,
            'initial_exploration_policy': initial_exploration_policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler,
            'session': self._session,
        }

        if self._variant['algorithm_params']['type'] in [
                'SACClassifier', 'RAQ', 'VICE', 'VICEGAN', 'VICERAQ'
        ]:
            reward_classifier = self.reward_classifier = picklable[
                'reward_classifier']
            algorithm_kwargs['classifier'] = reward_classifier

            goal_examples_train, goal_examples_validation = \
                get_goal_example_from_variant(self._variant)
            algorithm_kwargs['goal_examples'] = goal_examples_train
            algorithm_kwargs['goal_examples_validation'] = \
                goal_examples_validation

        self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)
        self.algorithm.__setstate__(picklable['algorithm'].__getstate__())

        tf_checkpoint = self._get_tf_checkpoint()
        status = tf_checkpoint.restore(
            tf.train.latest_checkpoint(
                os.path.split(self._tf_checkpoint_prefix(checkpoint_dir))[0]))

        status.assert_consumed().run_restore_ops(self._session)
        initialize_tf_variables(self._session, only_uninitialized=True)

        # TODO(hartikainen): target Qs should either be checkpointed or pickled.
        for Q, Q_target in zip(self.algorithm._Qs, self.algorithm._Q_targets):
            Q_target.set_weights(Q.get_weights())

        self._built = True