# Example #1 (score: 0)
def experiment(log_dir, variant_overwrite, cpu=False):
    """Reload a saved experiment from `log_dir` and continue training it.

    Restores the environment, networks and variant from disk (with
    `variant_overwrite` applied), rebuilds the SAC algorithm, re-attaches
    the SMM / historical-policy hooks, and runs `train()`.
    """
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Restore environment, snapshot data and merged variant from disk.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    goal_prior = variant['env_kwargs']['goal_prior']
    assert all(a == b for a, b in zip(env.sampled_goal, goal_prior))

    # Build a descriptive experiment id and point the logger at it.
    algo_kwargs = variant['algo_kwargs']
    hp_kwargs = variant['historical_policies_kwargs']
    exp_id = create_exp_name('eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        algo_kwargs['num_episodes'],
        algo_kwargs['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        algo_kwargs['reward_scale'],
        hp_kwargs['num_historical_policies'],
    ))
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Rebuild the SAC algorithm around the restored networks.
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=data['policy'],
        qf=data['qf'],
        vf=data['vf'],
        **algo_kwargs,
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        SMMHook(base_algorithm=algorithm,
                discriminator=data['discriminator'],
                density_model=data['density_model'],
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if hp_kwargs['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **hp_kwargs,
        )

    algorithm.to(ptu.device)
    algorithm.train()
# Example #2 (score: 0)
def experiment(variant):
    """Train SAC from scratch with the intrinsic reward chosen in `variant`.

    Builds the environment and the Q/V/policy networks, then optionally
    wraps the algorithm with an SMM, ICM, count, or pseudocount hook before
    training.
    """
    intrinsic_reward = variant['intrinsic_reward']

    # SMM augments observations with a one-hot skill vector; other
    # intrinsic rewards use the raw observation space.
    if intrinsic_reward == 'smm':
        num_skills = variant['smm_kwargs']['num_skills']
    else:
        num_skills = 0
    env, training_env = create_env(
        variant['env_id'], variant['env_kwargs'], num_skills)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    # Initialize networks: Q(s, a), V(s), and a tanh-Gaussian policy.
    net_size = variant['net_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        hidden_sizes=[net_size, net_size],
        output_size=1,
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        hidden_sizes=[net_size, net_size],
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        hidden_sizes=[net_size, net_size],
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'])

    # Each hook overwrites the appropriate functions of `algorithm`.
    if intrinsic_reward == 'smm':
        discriminator = FlattenMlp(
            input_size=obs_dim - num_skills,
            hidden_sizes=[net_size, net_size],
            output_size=num_skills,
        )
        density_model = VAEDensity(
            input_size=obs_dim,
            num_skills=num_skills,
            code_dim=128,
            **variant['vae_density_kwargs'],
        )
        SMMHook(base_algorithm=algorithm,
                discriminator=discriminator,
                density_model=density_model,
                **variant['smm_kwargs'])
    elif intrinsic_reward == 'icm':
        embedding_model = FlattenMlp(
            input_size=obs_dim,
            hidden_sizes=[net_size, net_size],
            output_size=net_size,
        )
        forward_model = FlattenMlp(
            input_size=net_size + action_dim,
            hidden_sizes=[net_size, net_size],
            output_size=net_size,
        )
        inverse_model = FlattenMlp(
            input_size=net_size + net_size,
            hidden_sizes=[],
            output_size=action_dim,
        )
        ICMHook(base_algorithm=algorithm,
                embedding_model=embedding_model,
                forward_model=forward_model,
                inverse_model=inverse_model,
                **variant['icm_kwargs'])
    elif intrinsic_reward == 'count':
        CountHook(base_algorithm=algorithm, **variant['count_kwargs'])
    elif intrinsic_reward == 'pseudocount':
        density_model = VAEDensity(
            input_size=obs_dim,
            num_skills=0,
            code_dim=128,
            **variant['vae_density_kwargs'],
        )
        PseudocountHook(base_algorithm=algorithm,
                        density_model=density_model,
                        **variant['pseudocount_kwargs'])

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(log_dir, variant_overwrite, cpu=False):
    """Load a trained model from `log_dir` and dump evaluation rollouts.

    Unlike the training-time `experiment` variants, this does NOT call
    `algorithm.train()`: it collects evaluation paths, saves each path's
    observations to ``./outtem/out<i>.npy``, and finally rolls out the
    deterministic eval policy with a fresh `InPlacePathSampler`.
    """
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Load experiment from file.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    # NOTE(review): the goal-consistency check used in the training-time
    # script is deliberately disabled here:
    # assert all(a == b for a, b in
    #            zip(env.sampled_goal, variant['env_kwargs']['goal_prior']))

    # Set log directory.
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        variant['algo_kwargs']['num_episodes'],
        variant['algo_kwargs']['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        variant['algo_kwargs']['reward_scale'],
        variant['historical_policies_kwargs']['num_historical_policies'],
    )
    exp_id = create_exp_name(exp_id)
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file and rebuild the SAC algorithm around it.
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=data['policy'],
        qf=data['qf'],
        vf=data['vf'],
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        SMMHook(base_algorithm=algorithm,
                discriminator=data['discriminator'],
                density_model=data['density_model'],
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)

    # Collect evaluation rollouts (no training) and dump each path's
    # observations to disk.
    samples = algorithm.get_eval_paths()
    print(env.reset())
    print(samples[0]['observations'])
    save_dir = './outtem'
    # BUG FIX: np.save raises FileNotFoundError if the directory is missing.
    os.makedirs(save_dir, exist_ok=True)
    for i, path in enumerate(samples):
        np.save(os.path.join(save_dir, 'out%i.npy' % i),
                path['observations'])

    # Roll out the deterministic eval policy with a fresh sampler.
    from rlkit.samplers.in_place import InPlacePathSampler
    eval_sampler = InPlacePathSampler(
        env=env,
        policy=algorithm.eval_policy,
        max_samples=100,
        max_path_length=50,
    )
    # BUG FIX: the original called `algorithm.eval_sampler.obtain_samples()`,
    # silently ignoring the sampler constructed just above.
    path = eval_sampler.obtain_samples()
    print(path[0]['observations'])
# Example #4 (score: 0)
def experiment(args):
    """Evaluate a trained model from `args.logdir` on a fixed test goal.

    Rebuilds the saved experiment with evaluation-time overrides (episodic
    collection, no additional training, SMM reward terms switched off),
    re-attaches the hooks, and runs the algorithm.
    """
    if not args.cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Evaluate model on num_episodes, without additional training.
    algo_overrides = dict(
        reward_scale=args.reward_scale,
        collection_mode='episodic',
        num_episodes=args.num_episodes,
        max_path_length=args.max_path_length,
        render=args.render,
        # Evaluate without additional training.
        num_updates_per_episode=0,
        min_num_steps_before_training=(
            args.max_path_length * args.num_episodes + 1),
    )

    # Environment settings.
    env_overrides = dict(
        sample_goal=False,
        goal_prior=args.test_goal,
        shaped_rewards=[
            'object_off_table', 'object_goal_indicator',
            'object_gripper_indicator', 'action_penalty'
        ],
        terminate_upon_success=False,
        terminate_upon_failure=False,
    )

    # SMM settings: posterior adaptation of latent skills p(z), with the
    # SMM reward itself turned off.
    smm_overrides = dict(
        update_p_z_prior_coeff=args.update_p_z_prior_coeff,
        state_entropy_coeff=0,
        latent_entropy_coeff=0,
        latent_conditional_entropy_coeff=0,
        discriminator_lr=0,
    )

    variant_overwrite = dict(
        algo_kwargs=algo_overrides,
        env_kwargs=env_overrides,
        smm_kwargs=smm_overrides,
    )

    # Load experiment from file and sanity-check the goal.
    env, _, data, variant = load_experiment(args.logdir, variant_overwrite)
    assert all(a == b for a, b in zip(env.sampled_goal, args.test_goal))
    variant.update(test_goal=list(env.sampled_goal))
    if args.num_historical_policies > 0:
        variant.update(historical_policies_kwargs=dict(
            log_dir=args.logdir,
            num_historical_policies=args.num_historical_policies,
            sample_strategy=args.sample_strategy,
            on_policy_prob=args.on_policy_prob,
        ))

    # Set log directory.
    exp_id = create_exp_name('eval/ne{}-mpl{}-{}-rs{}/nhp{}-{}-opp{}'.format(
        args.num_episodes,
        args.max_path_length,
        ','.join(env_overrides['shaped_rewards']),
        args.reward_scale,
        args.num_historical_policies,
        args.sample_strategy,
        args.on_policy_prob,
    ))
    log_dir = os.path.join(args.logdir, exp_id)
    print('Logging to:', log_dir)
    setup_logger(
        log_dir=log_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Rebuild the SAC algorithm around the saved networks.
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=data['policy'],
        qf=data['qf'],
        vf=data['vf'],
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if 'smm_kwargs' in variant:
        SMMHook(base_algorithm=algorithm,
                discriminator=data['discriminator'],
                density_model=data['density_model'],
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if args.num_historical_policies > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()