Example #1
        'trainer_kwargs.awr_sample_actions': [
            False,
        ],
        'trainer_kwargs.clip_score': [
            2,
        ],
        'trainer_kwargs.awr_min_q': [
            True,
        ],
        'trainer_kwargs.reward_transform_kwargs': [
            None,
        ],
        'trainer_kwargs.terminal_transform_kwargs': [
            dict(m=0, b=0),
        ],
        'qf_kwargs.output_activation': [Clamp(max=0)],
    }
    # Enumerate every combination of the search-space values on top of the
    # default variant (a deterministic grid sweep).
    sweeper = hyp.DeterministicHyperparameterSweeper(
        search_space,
        default_parameters=variant,
    )

    variants = []
    for variant in sweeper.iterate_hyperparameters():
        env_type = variant['env_type']
        # Evaluate against the presampled goals that match this environment type.
        eval_goals = 'sasha/presampled_goals/affordances/combined/{0}_goals.pkl'.format(
            env_type)
        variant['presampled_goal_kwargs']['eval_goals'] = eval_goals

        if env_type in ['top_drawer', 'bottom_drawer']:
            variant['env_class'] = SawyerRigAffordancesV0
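
Both examples pass Clamp(max=0) as the Q-function's output activation so that predicted Q-values stay non-positive, matching environments whose rewards are never greater than zero. A minimal sketch of such an activation, assuming it is simply a module that forwards its keyword arguments to torch.clamp (the exact rlkit definition may differ):

import torch
from torch import nn

class Clamp(nn.Module):
    """Elementwise clamp used as a network output activation (illustrative sketch)."""

    def __init__(self, **kwargs):
        super().__init__()
        self.kwargs = kwargs  # e.g. max=0 caps Q-value predictions at zero

    def forward(self, x):
        return torch.clamp(x, **self.kwargs)

With max=0 the clamp only bounds values from above, so the network can still output arbitrarily negative Q-values.
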
Example #2
 policy_class=GaussianPolicy,
 policy_kwargs=dict(
     hidden_sizes=[
         256,
         256,
     ],
     max_log_std=0,  # bounds on the Gaussian policy's log standard deviation
     min_log_std=-6,
     std_architecture="values",
 ),
 qf_kwargs=dict(
     hidden_sizes=[
         256,
         256,
     ],
     output_activation=Clamp(max=0),  # rewards are <= 0
 ),
 version="normal",
 collection_mode="batch",
 trainer_kwargs=dict(
     discount=0.99,
     soft_target_tau=5e-3,  # Polyak averaging rate for the target Q-networks
     target_update_period=1,
     policy_lr=3e-4,
     qf_lr=3e-4,
     reward_scale=1,
     beta=1,  # advantage-weighting temperature
     use_automatic_entropy_tuning=False,
     alpha=0,  # entropy term disabled
     compute_bc=False,
     awr_min_q=True,