def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()

    # DDPG agent
    # env = cached_make_env(params['make_env'])
    # env.reset()
    ddpg_params.update({'input_dims': input_dims,  # agent takes observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'bc_loss': params['bc_loss'],
                        'q_filter': params['q_filter'],
                        'num_demo': params['num_demo'],
                        'demo_batch_size': params['demo_batch_size'],
                        'prm_loss_weight': params['prm_loss_weight'],
                        'aux_loss_weight': params['aux_loss_weight'],
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    try:
        policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    except Exception:
        # Construction failed (e.g. the variable scope already exists); retry with reuse enabled.
        policy = DDPG(reuse=True, **ddpg_params, use_mpi=use_mpi)
    return policy
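# --- Usage sketch (illustrative only, not part of any variant in this file) ---
# A minimal example of how configure_ddpg is typically driven, assuming the
# standard baselines.her.experiment.config helpers (DEFAULT_PARAMS,
# prepare_params, configure_dims) are available alongside it; the environment
# name is an arbitrary, hypothetical choice.
def _example_configure_ddpg_usage():
    from baselines.her.experiment import config

    params = config.DEFAULT_PARAMS.copy()
    params['env_name'] = 'FetchReach-v1'    # hypothetical example environment
    params = config.prepare_params(params)  # resolves make_env, T, gamma, ddpg_params, ...
    dims = config.configure_dims(params)    # e.g. {'o': ..., 'u': ..., 'g': ...}

    # clip_return=True bounds the Q-targets by 1 / (1 - gamma), as in the code above.
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=True)
    return policy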
def configure_policy(dims, params):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']
    reuse = params['reuse']
    use_mpi = params['use_mpi']

    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({'input_dims': input_dims,  # agent takes observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if params['clip_return'] else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'reuse': reuse,
                        'use_mpi': use_mpi,
                        # 'n_preds': 0,
                        # 'h_level': 0,
                        # 'subgoal_scale': [1, 1, 1, 1],
                        # 'subgoal_offset': [0, 0, 0, 0],
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(**ddpg_params)
    return policy
def configure_ddpg(dims, params, buffers, reuse=False, use_mpi=True, clip_return=True, t_id=None):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({'input_dims': input_dims,  # agent takes observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'task_replay': params['task_replay'],
                        'structure': params['structure'],
                        'tasks_ag_id': params['tasks_ag_id'],
                        'tasks_g_id': params['tasks_g_id'],
                        'eps_task': params['eps_task'],
                        })
    if t_id is not None:
        # Give the task id to the rollout worker when multiple task-experts are used.
        ddpg_params.update({'t_id': t_id})
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, buffers=buffers, use_mpi=use_mpi)
    return policy
def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']
    temperature = params['temperature']
    prioritization = params['prioritization']
    env_name = params['env_name']
    max_timesteps = params['max_timesteps']
    rank_method = params['rank_method']

    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({'input_dims': input_dims,  # agent takes observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'temperature': temperature,
                        'prioritization': prioritization,
                        'env_name': env_name,
                        'max_timesteps': max_timesteps,
                        'rank_method': rank_method,
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): """ configure a DDPG model from parameters :param dims: ({str: int}) the dimensions :param params: (dict) the DDPG parameters :param reuse: (bool) whether or not the networks should be reused :param use_mpi: (bool) whether or not to use MPI :param clip_return: (float) clip returns to be in [-clip_return, clip_return] :return: (her.DDPG) the ddpg model """ sample_her_transitions = configure_her(params) # Extract relevant parameters. gamma = params['gamma'] rollout_batch_size = params['rollout_batch_size'] ddpg_params = params['ddpg_params'] input_dims = dims.copy() # DDPG agent env = cached_make_env(params['make_env']) env.reset() ddpg_params.update({ 'input_dims': input_dims, # agent takes an input observations 'time_horizon': params['time_horizon'], 'clip_pos_returns': True, # clip positive returns 'clip_return': (1. / (1. - gamma)) if clip_return else np.inf, # max abs of return 'rollout_batch_size': rollout_batch_size, 'subtract_goals': simple_goal_subtract, 'sample_transitions': sample_her_transitions, 'gamma': gamma, }) ddpg_params['info'] = { 'env_name': params['env_name'], } policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi) return policy
def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    # print('rollout_batch_size', rollout_batch_size)
    ddpg_params = params['ddpg_params']
    # print('ddpg_params is', ddpg_params)

    input_dims = dims.copy()
    # print('input_dims is', input_dims)

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({'input_dims': input_dims,  # agent takes observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    print('ddpg_params is', ddpg_params)
    print('use_mpi is', use_mpi)
    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=1, env=None, to_goal=None, logger=None):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()

    # DDPG agent
    # env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({'input_dims': input_dims,  # agent takes observations as input
                        'T': params['T'],
                        'clip_pos_returns': False,  # do not clip positive returns
                        # 'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        # clip_return == 1: clip to the goal weight;
                        # clip_return == 2: clip to the goal weight scaled by the max discounted return;
                        # otherwise: no clipping.
                        'clip_return': (params['goal_weight'] if clip_return == 1
                                        else (1. / (1. - gamma)) * params['goal_weight'] if clip_return == 2
                                        else np.inf),
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'env': env,
                        'to_goal': to_goal,
                        })
    if 'sample_expert' in params:
        ddpg_params.update({'sample_expert': params['sample_expert'],
                            'expert_batch_size': params['expert_batch_size'],
                            'bc_loss': params['bc_loss'],
                            'anneal_bc': params['anneal_bc'],
                            })
    if 'nearby_action_penalty' in params:
        ddpg_params.update({'nearby_action_penalty': params['nearby_action_penalty'],
                            'nearby_penalty_weight': params['nearby_penalty_weight'],
                            })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
def configure_ddpg(dims, params, pretrain_weights, reuse=False, use_mpi=True, clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']
    env_name = params['env_name']
    max_timesteps = params['max_timesteps']
    num_objective = params['num_objective']

    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({'input_dims': input_dims,  # agent takes observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) * num_objective if clip_return else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'env_name': env_name,
                        'max_timesteps': max_timesteps,
                        'r_scale': params['r_scale'],
                        'mi_r_scale': params['mi_r_scale'],
                        'mi_end_epoch': params['mi_end_epoch'],
                        'sk_r_scale': params['sk_r_scale'],
                        'et_r_scale': params['et_r_scale'],
                        'pretrain_weights': pretrain_weights,
                        'finetune_pi': params['finetune_pi'],
                        'mi_prioritization': params['mi_prioritization'],
                        'sac': params['sac'],
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
def configure_ddpg(dims, params, FLAGS, agent_params, reuse=False, use_mpi=True, clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()
    # print("DEBUG, ddpg_params={}".format(params))
    print("DEBUG, input_dims={}".format(input_dims))

    # DDPG agent -> TD3 agent
    env = cached_make_env(params['make_env'])
    env.reset()

    # The following setup was cut out here and moved into the agent:
    # sess = tf.Session()
    # subgoal_test_perc = agent_params["subgoal_test_perc"]
    # layers = [Layer(i, FLAGS, env, sess, agent_params) for i in range(FLAGS.layers)]
    # goal_array = [None for i in range(FLAGS.layers)]
    # steps_taken = 0

    ddpg_params.update({'input_dims': input_dims,  # agent takes observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'bc_loss': params['bc_loss'],
                        'q_filter': params['q_filter'],
                        'num_demo': params['num_demo'],
                        'demo_batch_size': params['demo_batch_size'],
                        'prm_loss_weight': params['prm_loss_weight'],
                        'aux_loss_weight': params['aux_loss_weight'],
                        # TD3-specific parameters:
                        'td3_policy_freq': params['td3_policy_freq'],
                        'td3_policy_noise': params['td3_policy_noise'],
                        'td3_noise_clip': params['td3_noise_clip'],
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    print(ddpg_params)

    # Reference: the TD3-augmented constructor is
    #   DDPG(FLAGS, input_dims, ..., sample_transitions, gamma,
    #        td3_policy_freq, td3_policy_noise, td3_noise_clip, reuse=False, *agent_params, **kwargs)
    # policy = DDPG(FLAGS, ddpg_params, reuse, agent_params, use_mpi=use_mpi)
    # Create the DDPG instance; FLAGS and agent_params are forwarded alongside ddpg_params.
    policy = DDPG(FLAGS, reuse=reuse, **agent_params, **ddpg_params, use_mpi=use_mpi)
    return policy
def configure_all(dims, params, reuse=False, policy_pkl=None):
    env = cached_make_env(params['make_env'])
    env.reset(reset_goal=False)  # get_reset_obs()
    params['T'] = env.spec.max_episode_steps
    params['gamma'] = 1. - 1. / params['T']
    params['max_u'] = env.action_space.high
    # params['goal_range'] = env.goal_range
    # params['goal_center'] = env.goal_center

    # Extract relevant parameters.
    prepare_ve_params(params)
    ddpg_sample_transitions, ve_sample_transitions = configure_ve_her(params)

    # DDPG agent
    if policy_pkl is not None:
        # Load a frozen policy.
        import joblib
        logger.info('loading policy...')
        data = joblib.load(policy_pkl)
        policy = data['policy']
    else:
        policy = DDPG(
            reuse=reuse,
            input_dims=dims.copy(),
            scope='ddpg',
            T=params['T'],
            gamma=params['gamma'],
            rollout_batch_size=params['rollout_batch_size'],
            sample_transitions=ddpg_sample_transitions,
            subtract_goals=simple_goal_subtract,
            **params['ddpg_params'],
        )

    value_ensemble = ValueEnsemble(
        reuse=reuse,
        input_dims=dims.copy(),
        scope='ve' if policy_pkl is None else 've-trainable',
        T=params['T'],
        gamma=params['gamma'],
        rollout_batch_size=params['rollout_batch_size'],
        sample_transitions=ve_sample_transitions,
        subtract_goals=simple_goal_subtract,
        **params['ve_params'])

    if False:  # disabled goal-sampler-factory path
        goal_presampler = configure_goal_presampler(params)
        goal_params = params['goal_params']
        goal_sampler_factory = make_goal_sampler_factory(
            init_ob=env.init_ob,
            goal_presampler=goal_presampler,
            value_ensemble=value_ensemble,
            policy=policy,
            presample_size=goal_params['presample_size'],
            disagreement_str=goal_params['disagreement_str'],
            n_reused_states=goal_params['n_reused_states'],
        )

        # For evaluation: sample from grid intersections with uniform probability;
        # the number of grids is determined by sampling_res.
        feasible_grid_goal_presampler, _ = make_grid_goal_presampler(
            env=env, sampling_res=3, uniform_noise=False, feasible=True)
        feasible_uniform_grid_goal_sampler, _ = make_uniform_goal_sampler(
            feasible_grid_goal_presampler)
        # TODO: plot all goals here
        return policy, value_ensemble, goal_sampler_factory, feasible_uniform_grid_goal_sampler

    # Goal-sampling functions to be passed to the vector env.
    from baselines.her.experiment.config import configure_disagreement
    params['gs_params'] = dict(
        n_candidates=params['presample_size'],
        disagreement_fun_name=params['disagreement_str'])
    print(params['disagreement_str'])
    sample_disagreement_goals_fun, sample_uniform_goals_fun = configure_disagreement(
        params, value_ensemble=value_ensemble, policy=policy)
    return policy, value_ensemble, sample_disagreement_goals_fun, sample_uniform_goals_fun
def configure_ve_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True, policy_pkl=None):
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    # env = cached_make_env(params['make_env'])
    # env.get_reset_obs()
    # env.reset()
    ddpg_sample_transitions, ve_sample_transitions = configure_ve_her(params)

    # DDPG agent
    if policy_pkl is not None:
        # Load a frozen policy.
        import joblib
        logger.info('loading policy...')
        data = joblib.load(policy_pkl)
        policy = data['policy']
    else:
        ddpg_params = params['ddpg_params']
        ddpg_params.update({'input_dims': dims.copy(),  # agent takes observations as input
                            'T': params['T'],
                            'scope': 'ddpg',
                            'clip_pos_returns': True,  # clip positive returns
                            'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                            'rollout_batch_size': rollout_batch_size,
                            'subtract_goals': simple_goal_subtract,
                            'sample_transitions': ddpg_sample_transitions,
                            'gamma': gamma,
                            'bc_loss': params['bc_loss'],
                            'q_filter': params['q_filter'],
                            'num_demo': params['num_demo'],
                            'demo_batch_size': params['demo_batch_size'],
                            'prm_loss_weight': params['prm_loss_weight'],
                            'aux_loss_weight': params['aux_loss_weight'],
                            })
        ddpg_params['info'] = {
            'env_name': params['env_name'],
        }
        policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)

    ve_params = params['ve_params']
    ve_params.update({'input_dims': dims.copy(),
                      'T': params['T'],
                      'scope': 've' if policy_pkl is None else 've-trainable',  # a hack to avoid duplicate vars when policy_pkl is loaded
                      'rollout_batch_size': rollout_batch_size,
                      'subtract_goals': simple_goal_subtract,
                      'clip_pos_returns': True,  # following the DDPG configuration
                      'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # following the DDPG configuration
                      'sample_transitions': ve_sample_transitions,
                      'gamma': gamma,
                      # TODO: temporary hack; read polyak from params so it is defined even when a policy pickle is loaded
                      'polyak': params['ddpg_params']['polyak'],
                      })
    value_ensemble = ValueEnsemble(reuse=reuse, **ve_params)

    # Goal-sampling functions to be passed to the vector env.
    sample_disagreement_goals_fun, sample_uniform_goals_fun = configure_disagreement(
        params, value_ensemble=value_ensemble, policy=policy)
    return policy, value_ensemble, sample_disagreement_goals_fun, sample_uniform_goals_fun