    'trainer_kwargs.awr_sample_actions': [False],
    'trainer_kwargs.clip_score': [2],
    'trainer_kwargs.awr_min_q': [True],
    'trainer_kwargs.reward_transform_kwargs': [None],
    'trainer_kwargs.terminal_transform_kwargs': [dict(m=0, b=0)],
    'qf_kwargs.output_activation': [Clamp(max=0)],
}

sweeper = hyp.DeterministicHyperparameterSweeper(
    search_space, default_parameters=variant,
)

variants = []
for variant in sweeper.iterate_hyperparameters():
    env_type = variant['env_type']
    eval_goals = 'sasha/presampled_goals/affordances/combined/{0}_goals.pkl'.format(
        env_type)
    variant['presampled_goal_kwargs']['eval_goals'] = eval_goals
    if env_type in ['top_drawer', 'bottom_drawer']:
        variant['env_class'] = SawyerRigAffordancesV0
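# Illustrative sketch (an assumption, not the rlkit implementation): the sweeper
# above is assumed to take the Cartesian product of the search_space value lists
# and overlay each combination onto a deep copy of the default variant, with
# dotted keys such as 'trainer_kwargs.clip_score' addressing nested dict entries.
# The helper name expand_search_space is hypothetical.
import copy
import itertools

def expand_search_space(search_space, default_variant):
    keys, value_lists = zip(*search_space.items())
    for values in itertools.product(*value_lists):
        new_variant = copy.deepcopy(default_variant)
        for key, value in zip(keys, values):
            node = new_variant
            *parents, leaf = key.split('.')
            for parent in parents:
                node = node.setdefault(parent, {})
            node[leaf] = value
        yield new_variant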
    policy_class=GaussianPolicy,
    policy_kwargs=dict(
        hidden_sizes=[256, 256],
        max_log_std=0,
        min_log_std=-6,
        std_architecture="values",
    ),
    qf_kwargs=dict(
        hidden_sizes=[256, 256],
        output_activation=Clamp(max=0),  # rewards are <= 0
    ),
    version="normal",
    collection_mode="batch",
    trainer_kwargs=dict(
        discount=0.99,
        soft_target_tau=5e-3,
        target_update_period=1,
        policy_lr=3e-4,
        qf_lr=3e-4,
        reward_scale=1,
        beta=1,
        use_automatic_entropy_tuning=False,
        alpha=0,
        compute_bc=False,
        awr_min_q=True,