Exemplo n.º 1
0
        rewards[-1] = rewards[-1] * 10
        dataset.episodes.append(Episode(states, actions, rewards, pb_sas))
    return dataset


if __name__ == '__main__':
    # Script entry point: build an agent, generate two fixed datasets with
    # it, then call agent.update on those datasets once per epoch.
    # NOTE(review): env, poly, Agent and generate_dataset are defined outside
    # this excerpt; comments below describe only what is visible here.

    # Seeding is commented out here, so unless a seed is set elsewhere the
    # runs are not reproducible through RANDOM_SEED.
    # np.random.seed(RANDOM_SEED)

    # The initial observation is reshaped to a single row and passed through
    # poly only to learn the transformed width (shape[1]), which is handed to
    # Agent as its first argument.
    s = env.reset()
    n_states = poly.fit_transform(s.reshape(1, -1))
    agent = Agent(n_states.shape[1],
                  env.action_space.n,
                  delta=0.2,
                  sigma=0.01,
                  is_tabular=False)
    # NOTE(review): the meaning of agent.c is not visible here -- presumably
    # a threshold used inside agent.update; -inf would make any lower-bound
    # comparison against it trivially true. Confirm against Agent.
    agent.c = -np.inf
    # Bookkeeping that is not read within this excerpt -- presumably used
    # further down in the epoch loop.
    mean_return = 0
    did_improve = []
    # Both datasets are generated once, with the agent as it is before any
    # update, and are reused unchanged for every epoch below.
    # NOTE(review): the third argument is presumably the episode count.
    safety_dataset = generate_dataset(env, agent, 1000)
    candidate_dataset = generate_dataset(env, agent, 1000)
    ngen = 1
    for epoch in range(1000):
        print(f'Epoch: {epoch}')
        print('---------------')

        # The return value is treated as a pass/fail flag by the `if` that
        # follows; the positional 1 and write=False are passed through to
        # agent.update as-is (their meaning is defined by Agent).
        did_pass = agent.update(safety_dataset,
                                candidate_dataset,
                                1,
                                write=False)

        if did_pass:
Exemplo n.º 2
0
            actions.append(a)
            rewards.append(r)
            pb_sas.append(action_probs[a])
            s = poly.fit_transform(next_state.reshape(1, -1))

        rewards[-1] = rewards[-1] * 10
        dataset.episodes.append(Episode(states, actions, rewards, pb_sas))
    return dataset


if __name__ == '__main__':
    # Script entry point: build an agent, generate two fixed datasets with
    # it, then call agent.update on those datasets once per epoch and
    # re-evaluate the agent whenever the update reports success.
    # NOTE(review): env, poly, Agent and generate_dataset are defined outside
    # this excerpt; comments below describe only what is visible here.

    # Seeding is commented out here, so unless a seed is set elsewhere the
    # runs are not reproducible through RANDOM_SEED.
    # np.random.seed(RANDOM_SEED)

    # The initial observation is reshaped to a single row and passed through
    # poly only to learn the transformed width (shape[1]), which is handed to
    # Agent as its first argument.
    s = env.reset()
    n_states = poly.fit_transform(s.reshape(1, -1))
    agent = Agent(n_states.shape[1], env.action_space.n, delta=0.1, sigma=0.1, is_tabular=False)
    # NOTE(review): the meaning of agent.c is not visible here -- presumably
    # a threshold used inside agent.update. Confirm against Agent.
    agent.c = 0
    # Bookkeeping that is not read within this excerpt -- presumably compared
    # with next_mean_return further down in the loop.
    mean_return = 0
    did_improve = []
    # Both datasets are generated once, with the agent as it is before any
    # update, and are reused unchanged for every epoch below.
    # NOTE(review): the third argument is presumably the episode count.
    safety_dataset = generate_dataset(env, agent, 1000)
    candidate_dataset = generate_dataset(env, agent, 1000)
    ngen = 1
    for epoch in range(1000):
        print(f'Epoch: {epoch}')
        print('---------------')

        # The return value is used as a pass/fail flag just below; the
        # positional 1 and write=False are passed through to agent.update
        # as-is (their meaning is defined by Agent).
        did_pass = agent.update(safety_dataset, candidate_dataset, 1, write=False)

        if did_pass:
            # On success, generate a fresh dataset with the current agent and
            # average agent.expected_discounted_return over it.
            # NOTE(review): "gt" presumably means ground truth -- confirm.
            eval_dataset = generate_dataset(env, agent, 1000)
            gt_estimates = agent.expected_discounted_return(eval_dataset)
            next_mean_return = np.mean(gt_estimates)
Exemplo n.º 3
0
        else:
            success.append(1)
        rewards[-1] = rewards[-1] * 10
        dataset.episodes.append(Episode(states, actions, rewards, pb_sas))
    print(f'Success rate: {np.mean(success)}')
    return dataset


if __name__ == '__main__':
    # Script entry point: build an agent sized from the environment's
    # observation and action spaces, generate two fixed datasets with it,
    # then call agent.update on those datasets once per epoch.
    # NOTE(review): env, Agent and generate_dataset are defined outside this
    # excerpt; comments below describe only what is visible here.

    # Seeding is commented out here, so unless a seed is set elsewhere the
    # runs are not reproducible through RANDOM_SEED.
    # np.random.seed(RANDOM_SEED)

    # is_tabular is not passed here, so Agent's own default applies.
    agent = Agent(env.observation_space.n,
                  env.action_space.n,
                  delta=0.25,
                  sigma=0.01)
    # NOTE(review): the meaning of agent.c is not visible here -- presumably
    # a threshold used inside agent.update. Confirm against Agent.
    agent.c = -10
    # Bookkeeping that is not read within this excerpt -- presumably used
    # further down in the epoch loop.
    mean_return = 0
    did_improve = []
    # Both datasets are generated once, with the agent as it is before any
    # update, and are reused unchanged for every epoch below.
    # NOTE(review): the third argument is presumably the episode count.
    safety_dataset = generate_dataset(env, agent, 5000)
    candidate_dataset = generate_dataset(env, agent, 5000)
    ngen = 1
    for epoch in range(1000):
        print(f'Epoch: {epoch}')
        print('---------------')

        # The return value is treated as a pass/fail flag by the `if` that
        # follows; the positional 1 and write=False are passed through to
        # agent.update as-is (their meaning is defined by Agent).
        did_pass = agent.update(safety_dataset,
                                candidate_dataset,
                                1,
                                write=False)

        if did_pass: