import time
from collections import deque

import numpy as np

# NOTE: the project-specific helpers used below (get_env_from_name, get_policy,
# Disturber, Pool, logger, evaluation, evaluate_training_rollouts) are assumed
# to be imported from the surrounding package; their module paths are not shown
# in this listing.


def trained_disturber(variant):
    env_name = variant["env_name"]
    env = get_env_from_name(env_name)
    env_params = variant["env_params"]

    eval_params = variant["eval_params"]
    policy_params = variant["alg_params"]
    disturber_params = variant["disturber_params"]
    build_func = get_policy(variant["algorithm_name"])
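    # Goal-based robotics envs ("Fetch*"/"Hand*") return dict observations, so
    # the state dimension is the combined length of the observation,
    # achieved_goal and desired_goal fields; other envs use a flat observation.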
    if "Fetch" in env_name or "Hand" in env_name:
        s_dim = (env.observation_space.spaces["observation"].shape[0] +
                 env.observation_space.spaces["achieved_goal"].shape[0] +
                 env.observation_space.spaces["desired_goal"].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    d_dim = env_params["disturbance dim"]
    policy = build_func(a_dim, s_dim, d_dim, policy_params)
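    # Only the channels with a non-zero entry in disturbance_magnitude are
    # perturbed by the disturber.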
    disturbance_chanel_list = np.nonzero(
        disturber_params["disturbance_magnitude"])[0]
    disturber_params["disturbance_chanel_list"] = disturbance_chanel_list
    disturber = Disturber(d_dim, s_dim, disturber_params)
    disturber.restore(eval_params["path"])

    log_path = variant["log_path"] + "/eval/trained_disturber"
    variant["eval_params"].update({"magnitude": 0})
    logger.configure(dir=log_path, format_strs=["csv"])

    diagnostic_dict, _ = evaluation(variant, env, policy, disturber)

    string_to_print = []
    for key in diagnostic_dict.keys():
        string_to_print.extend(
            [key, ":", str(round(diagnostic_dict[key], 2)), "|"])
    print("".join(string_to_print))

    for key in diagnostic_dict.keys():
        logger.logkv(key, diagnostic_dict[key])
    logger.dumpkvs()


def train_v2(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_build_fun = get_policy(variant['algorithm_name'])
    policy_params = variant['alg_params']
    disturber_params = variant['disturber_params']
    iter_of_actor_train = policy_params['iter_of_actor_train_per_epoch']
    iter_of_disturber_train = policy_params[
        'iter_of_disturber_train_per_epoch']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for the Lyapunov critic

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0]\
                + env.observation_space.spaces['achieved_goal'].shape[0]+ \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']
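    # The disturbance dimension is the number of non-zero entries in
    # disturbance_magnitude; those indices are the channels the disturber
    # acts on.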
    d_dim = np.nonzero(disturber_params['disturbance_magnitude'])[0].shape[0]
    disturbance_chanel_list = np.nonzero(
        disturber_params['disturbance_magnitude'])[0]
    disturber_params['disturbance_chanel_list'] = disturbance_chanel_list
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fun(a_dim, s_dim, d_dim, policy_params)
    disturber = Disturber(d_dim, s_dim, disturber_params)

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': d_dim,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'finite_horizon': policy_params['finite_horizon'],
    }
    pool_params['value_horizon'] = policy_params.get('value_horizon')
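    # The replay pool stores (s, a, disturbance, raw_disturbance, r, terminal, s_)
    # tuples and is sampled by both the policy and the disturber updates.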
    pool = Pool(pool_params)
    # Whether to render rollouts for visual analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0

    last_actor_training_paths = deque(maxlen=store_last_n_paths)
    last_disturber_training_paths = deque(maxlen=store_last_n_paths)
    actor_training_started = False
    disturber_training_started = False
    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('ita', policy_params['ita'])
    logger.logkv('energy_decay_rate', disturber_params['energy_decay_rate'])
    logger.logkv('magnitude', disturber_params['disturbance_magnitude'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)
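    # Each outer iteration alternates a block of actor-training episodes with a
    # block of disturber-training episodes; global_step only advances once the
    # corresponding learner has started training.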
    for episode in range(max_episodes):

        for i in range(iter_of_actor_train):

            current_path = {
                'rewards': [],
                'disturbance_mag': [],
                'a_loss': [],
                'alpha': [],
                'lyapunov_error': [],
                'labda': [],
                'critic_error': [],
                'entropy': [],
            }

            if global_step > max_global_steps:
                break

            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            for j in range(max_ep_steps):
                if Render:
                    env.render()
                a = policy.choose_action(s, True)
                action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                    a_lowerbound) / 2
                disturbance, raw_disturbance = disturber.choose_action(s, j)
                # Run in simulator
                # disturbance = np.array([0])
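                # Scatter the disturbance onto its active channels inside a
                # zero vector spanning all action and state channels, then
                # inject it into the environment as process noise.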
                disturbance_input = np.zeros([a_dim + s_dim])
                disturbance_input[disturbance_chanel_list] = disturbance
                s_, r, done, info = env.step(action,
                                             process_noise=disturbance_input)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                    if info['done'] > 0:
                        done = True

                if actor_training_started:
                    global_step += 1

                if j == max_ep_steps - 1:
                    done = True

                terminal = 1. if done else 0.
                pool.store(s, a, disturbance, raw_disturbance, r, terminal, s_)
                # policy.store_transition(s, a, disturbance, r,0, terminal, s_)
                # Learn

                if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                    actor_training_started = True

                    for _ in range(train_per_cycle):
                        batch = pool.sample(batch_size)
                        labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                            lr_a_now, lr_c_now, lr_l_now, batch)

                if actor_training_started:
                    current_path['rewards'].append(r)
                    current_path['labda'].append(labda)
                    current_path['critic_error'].append(min(c1_loss, c2_loss))
                    current_path['lyapunov_error'].append(l_loss)
                    current_path['alpha'].append(alpha)

                    current_path['entropy'].append(entropy)
                    current_path['a_loss'].append(a_loss)
                    current_path['disturbance_mag'].append(
                        np.linalg.norm(disturbance))

                if actor_training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                    logger.logkv("total_timesteps", global_step)

                    training_diagnotic = evaluate_training_rollouts(
                        last_actor_training_paths)
                    if training_diagnotic is not None:

                        for key in training_diagnotic.keys():
                            logger.logkv(key, training_diagnotic[key])
                        logger.logkv('lr_a', lr_a_now)
                        logger.logkv('lr_c', lr_c_now)
                        logger.logkv('lr_l', lr_l_now)
                        string_to_print = [
                            'Actor training!time_step:',
                            str(global_step), '|'
                        ]

                        for key in training_diagnotic.keys():
                            string_to_print.extend([
                                key, ':',
                                str(round(training_diagnotic[key], 2)), '|'
                            ])

                        print(''.join(string_to_print))

                    logger.dumpkvs()
                # Update the state
                s = s_

                # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
                if done:
                    if actor_training_started:
                        last_actor_training_paths.appendleft(current_path)

                    # Linearly decay all learning rates to zero at max_global_steps.
                    frac = 1.0 - (global_step - 1.0) / max_global_steps
                    lr_a_now = lr_a * frac  # learning rate for actor
                    lr_c_now = lr_c * frac  # learning rate for critic
                    lr_l_now = lr_l * frac  # learning rate for the Lyapunov critic

                    break
        if global_step > max_global_steps:
            break
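        # Disturber-training block: same rollout structure as above, but the
        # disturbance is passed directly to env.step() and only the disturber
        # is updated.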
        for i in range(iter_of_disturber_train):

            current_path = {
                'rewards': [],
                'disturbance_mag': [],
                'd_loss': [],
                'alpha': [],
                'disturber_critic_error': [],
                'entropy': [],
            }

            if global_step > max_global_steps:
                break

            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            for j in range(max_ep_steps):
                if Render:
                    env.render()
                a = policy.choose_action(s, True)
                action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                    a_lowerbound) / 2
                disturbance, raw_disturbance = disturber.choose_action(s, j)
                # Run in simulator
                # disturbance = np.array([0])
                s_, r, done, info = env.step(action, disturbance)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                    if info['done'] > 0:
                        done = True

                if disturber_training_started:
                    global_step += 1

                if j == max_ep_steps - 1:
                    done = True

                terminal = 1. if done else 0.
                pool.store(s, a, disturbance, raw_disturbance, r, terminal, s_)
                # policy.store_transition(s, a, disturbance, r,0, terminal, s_)
                # Learn

                if pool.memory_pointer > min_memory_size and global_step % disturber_params[
                        'steps_per_cycle'] == 0:
                    disturber_training_started = True

                    for _ in range(disturber_params['train_per_cycle']):
                        batch = pool.sample(disturber_params['batch_size'])
                        d_alpha, d_c1_loss, d_c2_loss, d_entropy, d_loss = disturber.learn(
                            lr_a_now, lr_c_now, batch)
                # d_c1_loss = 0
                # d_c2_loss = 0
                # d_loss=0
                if disturber_training_started:
                    current_path['rewards'].append(r)

                    current_path['disturber_critic_error'].append(
                        min(d_c1_loss, d_c2_loss))
                    current_path['d_loss'].append(d_loss)
                    current_path['alpha'].append(d_alpha)

                    current_path['entropy'].append(d_entropy)

                    current_path['disturbance_mag'].append(
                        np.linalg.norm(disturbance))

                if disturber_training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                    logger.logkv("total_timesteps", global_step)

                    training_diagnotic = evaluate_training_rollouts(
                        last_disturber_training_paths)
                    if training_diagnotic is not None:
                        for key in training_diagnotic.keys():
                            logger.logkv(key, training_diagnotic[key])
                        logger.logkv('lr_a', lr_a_now)
                        logger.logkv('lr_c', lr_c_now)
                        logger.logkv('lr_l', lr_l_now)
                        string_to_print = [
                            'Disturber training!time_step:',
                            str(global_step), '|'
                        ]

                        for key in training_diagnotic.keys():
                            string_to_print.extend([
                                key, ':',
                                str(round(training_diagnotic[key], 2)), '|'
                            ])

                        print(''.join(string_to_print))

                    logger.dumpkvs()
                # Update the state
                s = s_

                # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
                if done:
                    if disturber_training_started:
                        last_disturber_training_paths.appendleft(current_path)

                    frac = 1.0 - (global_step - 1.0) / max_global_steps
                    lr_a_now = lr_a * frac  # learning rate for actor
                    lr_c_now = lr_c * frac  # learning rate for critic
                    lr_l_now = lr_l * frac  # learning rate for the Lyapunov critic

                    break
        if global_step > max_global_steps:
            break
    policy.save_result(log_path)
    disturber.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
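

# ---------------------------------------------------------------------------
# A minimal sketch of the `variant` configuration consumed by train_v2() above.
# Only the keys actually read inside train_v2() are listed; every value is an
# illustrative placeholder, not a recommended or original setting, and the
# env/algorithm names are hypothetical. The policy and Disturber constructors
# may require additional parameters beyond those shown here.
# ---------------------------------------------------------------------------
example_variant = {
    'env_name': 'oscillator',        # hypothetical; must be known to get_env_from_name
    'algorithm_name': 'RLAC',        # hypothetical; must be known to get_policy
    'log_path': './log/example_run',
    'store_last_n_paths': 10,
    'evaluation_frequency': 2048,
    'env_params': {
        'max_episodes': 100000,
        'max_ep_steps': 500,
        'max_global_steps': 1000000,
        'eval_render': False,
    },
    'alg_params': {
        'iter_of_actor_train_per_epoch': 50,
        'iter_of_disturber_train_per_epoch': 50,
        'min_memory_size': 1000,
        'steps_per_cycle': 100,
        'train_per_cycle': 80,
        'batch_size': 256,
        'lr_a': 1e-4,
        'lr_c': 3e-4,
        'lr_l': 3e-4,
        'memory_capacity': 1000000,
        'finite_horizon': False,
        'tau': 5e-3,
        'ita': 0.5,
        'alpha3': 0.5,
    },
    'disturber_params': {
        'disturbance_magnitude': np.array([0.1, 0.0]),  # non-zero entries = active channels
        'energy_decay_rate': 0.5,
        'steps_per_cycle': 100,
        'train_per_cycle': 80,
        'batch_size': 256,
    },
}
# train_v2(example_variant)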