Example #1
def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()

    # DDPG agent
    # env = cached_make_env(params['make_env'])
    # env.reset()
    ddpg_params.update({'input_dims': input_dims,  # the agent takes the observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'bc_loss': params['bc_loss'],
                        'q_filter': params['q_filter'],
                        'num_demo': params['num_demo'],
                        'demo_batch_size': params['demo_batch_size'],
                        'prm_loss_weight': params['prm_loss_weight'],
                        'aux_loss_weight': params['aux_loss_weight'],
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    try:
        policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    except Exception:
        # fall back to reusing the existing variables if building a fresh graph fails
        policy = DDPG(reuse=True, **ddpg_params, use_mpi=use_mpi)
    return policy
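
A note on the recurring 'clip_return': (1. / (1. - gamma)) setting (commented as "max abs of return" in several of these examples): with per-step rewards bounded by 1 in magnitude, as for HER's sparse {-1, 0} rewards, the discounted return can never exceed the geometric-series sum 1 / (1 - gamma), so clipping at that value loses nothing. The check below is a self-contained illustration of the bound, not code from any of the repositories shown here:

import numpy as np

gamma = 0.98
horizon = 10_000                                         # long horizon approximates the infinite sum
worst_case_return = np.sum(gamma ** np.arange(horizon))  # every reward at its bound of 1
bound = 1. / (1. - gamma)

print(worst_case_return, bound)                          # both close to 50.0
assert worst_case_return <= bound + 1e-6
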
Example #2
def configure_policy(dims, params):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']
    reuse = params['reuse']
    use_mpi = params['use_mpi']
    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({'input_dims': input_dims,  # the agent takes the observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if params['clip_return'] else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'reuse': reuse,
                        'use_mpi': use_mpi,
                        # 'n_preds' : 0,
                        # 'h_level' : 0,
                        # 'subgoal_scale': [1,1,1,1],
                        # 'subgoal_offset': [0, 0, 0, 0],
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(**ddpg_params)

    return policy
Example #3
def configure_ddpg(dims,
                   params,
                   buffers,
                   reuse=False,
                   use_mpi=True,
                   clip_return=True,
                   t_id=None):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({
        'input_dims': input_dims,  # the agent takes the observations as input
        'T': params['T'],
        'clip_pos_returns': True,  # clip positive returns
        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
        'rollout_batch_size': rollout_batch_size,
        'subtract_goals': simple_goal_subtract,
        'sample_transitions': sample_her_transitions,
        'gamma': gamma,
        'task_replay': params['task_replay'],
        'structure': params['structure'],
        'tasks_ag_id': params['tasks_ag_id'],
        'tasks_g_id': params['tasks_g_id'],
        'eps_task': params['eps_task'],
    })

    if t_id is not None:
        # give task id to rollout worker in the case of multiple task-experts
        ddpg_params.update({'t_id': t_id})

    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, buffers=buffers, use_mpi=use_mpi)
    return policy
Example #4
def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']
    temperature = params['temperature']
    prioritization = params['prioritization']
    env_name = params['env_name']
    max_timesteps = params['max_timesteps']
    rank_method = params['rank_method']

    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({
        'input_dims': input_dims,  # the agent takes the observations as input
        'T': params['T'],
        'clip_pos_returns': True,  # clip positive returns
        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
        'rollout_batch_size': rollout_batch_size,
        'subtract_goals': simple_goal_subtract,
        'sample_transitions': sample_her_transitions,
        'gamma': gamma,
        'temperature': temperature,
        'prioritization': prioritization,
        'env_name': env_name,
        'max_timesteps': max_timesteps,
        'rank_method': rank_method,
    })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
Example #5
def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True):
    """
    configure a DDPG model from parameters

    :param dims: ({str: int}) the dimensions
    :param params: (dict) the DDPG parameters
    :param reuse: (bool) whether or not the networks should be reused
    :param use_mpi: (bool) whether or not to use MPI
    :param clip_return: (bool) whether to clip the return to its maximum possible magnitude, 1. / (1. - gamma)
    :return: (her.DDPG) the ddpg model
    """
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({
        'input_dims': input_dims,  # the agent takes the observations as input
        'time_horizon': params['time_horizon'],
        'clip_pos_returns': True,  # clip positive returns
        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
        'rollout_batch_size': rollout_batch_size,
        'subtract_goals': simple_goal_subtract,
        'sample_transitions': sample_her_transitions,
        'gamma': gamma,
    })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
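
The docstring above only says that dims maps strings to ints. As a hedged illustration (not taken from the repository behind this example), such a dictionary is typically derived from a goal-conditioned Gym environment using the HER key convention 'o'/'u'/'g' for observation, action, and goal sizes; the environment id is just an example and the snippet assumes the classic gym API in which reset() returns the observation dict directly:

import gym

env = gym.make('FetchReach-v1')     # any goal-conditioned environment (needs gym's robotics extras)
obs = env.reset()                   # dict with 'observation', 'achieved_goal', 'desired_goal'
dims = {
    'o': obs['observation'].shape[0],   # observation size
    'u': env.action_space.shape[0],     # action size
    'g': obs['desired_goal'].shape[0],  # goal size
}
print(dims)                         # e.g. {'o': 10, 'u': 4, 'g': 3}
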
Example #6
def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']

    rollout_batch_size = params['rollout_batch_size']
    # print('rollout_batch_size', rollout_batch_size)

    ddpg_params = params['ddpg_params']
    # print('ddpg_params is', ddpg_params)

    input_dims = dims.copy()
    # print('input_dims is', input_dims)

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({
        'input_dims': input_dims,  # the agent takes the observations as input
        'T': params['T'],
        'clip_pos_returns': True,  # clip positive returns
        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
        'rollout_batch_size': rollout_batch_size,
        'subtract_goals': simple_goal_subtract,
        'sample_transitions': sample_her_transitions,
        'gamma': gamma,
    })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }

    print('ddpg_params is', ddpg_params)
    print('use_mpi is', use_mpi)

    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
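
Every example passes simple_goal_subtract as the subtract_goals hook. In the upstream baselines HER configuration this helper is, roughly, element-wise subtraction with a shape check; the sketch below reproduces that behaviour for reference:

import numpy as np

def simple_goal_subtract(a, b):
    # element-wise goal difference used by HER; both goals must share a shape
    assert a.shape == b.shape
    return a - b

print(simple_goal_subtract(np.array([1.0, 2.0, 3.0]), np.array([0.5, 2.0, 3.0])))  # [0.5 0.  0. ]
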
Example #7
def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=1, env=None, to_goal=None, logger=None):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()

    # DDPG agent
    # env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({'input_dims': input_dims,  # the agent takes the observations as input
                        'T': params['T'],
                        'clip_pos_returns': False,  # do not clip positive returns
                        # 'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        'clip_return': (params['goal_weight'] if clip_return == 1 else
                                        (1. / (1. - gamma)) * params['goal_weight'] if clip_return == 2 else
                                        np.inf),  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'env': env,
                        'to_goal': to_goal,

                        })
    if 'sample_expert' in params:
        ddpg_params.update({
            'sample_expert': params['sample_expert'],
            'expert_batch_size': params['expert_batch_size'],
            'bc_loss': params['bc_loss'],
            'anneal_bc': params['anneal_bc'],
        })
    if 'nearby_action_penalty' in params:
        ddpg_params.update({
            'nearby_action_penalty': params['nearby_action_penalty'],
            'nearby_penalty_weight': params['nearby_penalty_weight'],
        })

    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
Example #8
def configure_ddpg(dims,
                   params,
                   pretrain_weights,
                   reuse=False,
                   use_mpi=True,
                   clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']
    env_name = params['env_name']
    max_timesteps = params['max_timesteps']
    num_objective = params['num_objective']

    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({
        'input_dims': input_dims,  # the agent takes the observations as input
        'T': params['T'],
        'clip_pos_returns': True,  # clip positive returns
        'clip_return': (1. / (1. - gamma)) * num_objective if clip_return else np.inf,  # max abs of return
        'rollout_batch_size': rollout_batch_size,
        'subtract_goals': simple_goal_subtract,
        'sample_transitions': sample_her_transitions,
        'gamma': gamma,
        'env_name': env_name,
        'max_timesteps': max_timesteps,
        'r_scale': params['r_scale'],
        'mi_r_scale': params['mi_r_scale'],
        'mi_end_epoch': params['mi_end_epoch'],
        'sk_r_scale': params['sk_r_scale'],
        'et_r_scale': params['et_r_scale'],
        'pretrain_weights': pretrain_weights,
        'finetune_pi': params['finetune_pi'],
        'mi_prioritization': params['mi_prioritization'],
        'sac': params['sac'],
    })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
Example #9
def configure_ddpg(dims, params, FLAGS, agent_params, reuse=False, use_mpi=True, clip_return=True):
# def configure_ddpg(params, FLAGS, agent_params, dims, reuse=False, use_mpi=True, clip_return=True):##

    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()

    # print("DEBUG, ddpg_params={}".format(params))
    print("DEBUG, input_dims={}".format(input_dims))


    # DDPG agent -> TD3 agent
    env = cached_make_env(params['make_env'])
    env.reset()

    ## this part was cut out here and handed over to the agent instead
    # sess = tf.Session()
    # subgoal_test_perc = agent_params["subgoal_test_perc"]
    # agent_params = agent_params


    # layers = [Layer(i,FLAGS,env, sess,agent_params) for i in range(FLAGS.layers)]
    # goal_array = [None for i in range(FLAGS.layers)]
    # steps_taken = 0
    ##

    ddpg_params.update({'input_dims': input_dims,  # the agent takes the observations as input
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'bc_loss': params['bc_loss'],
                        'q_filter': params['q_filter'],
                        'num_demo': params['num_demo'],
                        'demo_batch_size': params['demo_batch_size'],
                        'prm_loss_weight': params['prm_loss_weight'],
                        'aux_loss_weight': params['aux_loss_weight'],
                        
                        'td3_policy_freq': params['td3_policy_freq'], ##
                        'td3_policy_noise': params['td3_policy_noise'], ##
                        'td3_noise_clip': params['td3_noise_clip'] ##
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    print(ddpg_params)
    # layers = [Layer(i,FLAGS,env, sess,agent_params) for i in range(FLAGS.layers)]
    # goal_array = [None for i in range(FLAGS.layers)]
    # reuse=reuse, **ddpg_params, use_mpi=use_mpi
        # def __init__(self, FLAGS, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size,
        #          Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T,
        #          rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return,
        #          bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight,
                 
        #         #  sample_transitions, gamma, reuse=False, **kwargs):
        #          sample_transitions, gamma, td3_policy_freq, td3_policy_noise, td3_noise_clip, reuse=False, *agent_params, **kwargs): ##
    # policy = DDPG(FLAGS, ddpg_params, reuse, agent_params, use_mpi=use_mpi)  ## create a DDPG instance named policy
    policy = DDPG(FLAGS, reuse=reuse, **agent_params, **ddpg_params, use_mpi=use_mpi)
    return policy
Example #10
def configure_all(dims, params, reuse=False, policy_pkl=None):
    env = cached_make_env(params['make_env'])
    env.reset(reset_goal=False)  #get_reset_obs()

    params['T'] = env.spec.max_episode_steps
    params['gamma'] = 1. - 1. / params['T']
    params['max_u'] = env.action_space.high
    # params['goal_range'] = env.goal_range
    # params['goal_center'] = env.goal_center

    # Extract relevant parameters.
    prepare_ve_params(params)
    ddpg_sample_transitions, ve_sample_transitions = configure_ve_her(params)

    # DDPG agent
    if policy_pkl is not None:
        # load frozen policy
        import joblib
        logger.info('loading policy...')
        data = joblib.load(policy_pkl)
        policy = data['policy']

    else:
        policy = DDPG(
            reuse=reuse,
            input_dims=dims.copy(),
            scope='ddpg',
            T=params['T'],
            gamma=params['gamma'],
            rollout_batch_size=params['rollout_batch_size'],
            sample_transitions=ddpg_sample_transitions,
            subtract_goals=simple_goal_subtract,
            **params['ddpg_params'],
        )

    value_ensemble = ValueEnsemble(
        reuse=reuse,
        input_dims=dims.copy(),
        scope='ve' if policy_pkl is None else 've-trainable',
        T=params['T'],
        gamma=params['gamma'],
        rollout_batch_size=params['rollout_batch_size'],
        sample_transitions=ve_sample_transitions,
        subtract_goals=simple_goal_subtract,
        **params['ve_params'])

    if False:
        goal_presampler = configure_goal_presampler(params)
        goal_params = params['goal_params']
        goal_sampler_factory = make_goal_sampler_factory(
            init_ob=env.init_ob,
            goal_presampler=goal_presampler,
            value_ensemble=value_ensemble,
            policy=policy,
            presample_size=goal_params['presample_size'],
            disagreement_str=goal_params['disagreement_str'],
            n_reused_states=goal_params['n_reused_states'],
        )

        # for evaluation: sample from grid intersections with uniform probability
        # number of grids determined by sampling_res
        feasible_grid_goal_presampler, _ = make_grid_goal_presampler(
            env=env, sampling_res=3, uniform_noise=False, feasible=True)
        feasible_uniform_grid_goal_sampler, _ = make_uniform_goal_sampler(
            feasible_grid_goal_presampler)
        # TODO: plot all goals here
        return policy, value_ensemble, goal_sampler_factory, feasible_uniform_grid_goal_sampler

    # goal sampling function to be passed in vector env
    from baselines.her.experiment.config import configure_disagreement
    params['gs_params'] = dict(
        n_candidates=params['presample_size'],
        disagreement_fun_name=params['disagreement_str'])
    print(params['disagreement_str'])
    sample_disagreement_goals_fun, sample_uniform_goals_fun = configure_disagreement(
        params, value_ensemble=value_ensemble, policy=policy)
    return policy, value_ensemble, sample_disagreement_goals_fun, sample_uniform_goals_fun
Example #11
def configure_ve_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True, policy_pkl=None):
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    # env = cached_make_env(params['make_env'])
    # env.get_reset_obs()
    # env.reset()

    ddpg_sample_transitions, ve_sample_transitions = configure_ve_her(params)

    # DDPG agent
    if policy_pkl is not None:
        # load frozen policy
        import joblib
        logger.info('loading policy...')
        data = joblib.load(policy_pkl)
        policy = data['policy']

    else:
        ddpg_params = params['ddpg_params']
        ddpg_params.update({'input_dims': dims.copy(),  # the agent takes the observations as input
                            'T': params['T'],
                            'scope': 'ddpg',
                            'clip_pos_returns': True,  # clip positive returns
                            'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                            'rollout_batch_size': rollout_batch_size,
                            'subtract_goals': simple_goal_subtract,
                            'sample_transitions': ddpg_sample_transitions,
                            'gamma': gamma,
                            'bc_loss': params['bc_loss'],
                            'q_filter': params['q_filter'],
                            'num_demo': params['num_demo'],
                            'demo_batch_size': params['demo_batch_size'],
                            'prm_loss_weight': params['prm_loss_weight'],
                            'aux_loss_weight': params['aux_loss_weight'],
                            })
        ddpg_params['info'] = {
            'env_name': params['env_name'],
        }
        policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)

    ve_params = params['ve_params']
    ve_params.update({
        'input_dims': dims.copy(),
        'T': params['T'],
        'scope': 've' if policy_pkl is None else 've-trainable',  # a hack to avoid duplicate vars when policy_pkl is loaded
        'rollout_batch_size': rollout_batch_size,
        'subtract_goals': simple_goal_subtract,
        'clip_pos_returns': True,  # following ddpg configuration
        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # following ddpg configuration
        'sample_transitions': ve_sample_transitions,
        'gamma': gamma,
        # TODO: tmp hack below
        'polyak': ddpg_params['polyak'],
    })
    value_ensemble = ValueEnsemble(reuse=reuse, **ve_params)

    # goal sampling function to be passed in vector env
    sample_disagreement_goals_fun, sample_uniform_goals_fun = configure_disagreement(
        params,
        value_ensemble=value_ensemble,
        policy=policy
    )

    return policy, value_ensemble, sample_disagreement_goals_fun, sample_uniform_goals_fun