rewards[-1] = rewards[-1] * 10 dataset.episodes.append(Episode(states, actions, rewards, pb_sas)) return dataset if __name__ == '__main__': # np.random.seed(RANDOM_SEED) s = env.reset() n_states = poly.fit_transform(s.reshape(1, -1)) agent = Agent(n_states.shape[1], env.action_space.n, delta=0.2, sigma=0.01, is_tabular=False) agent.c = -np.inf mean_return = 0 did_improve = [] safety_dataset = generate_dataset(env, agent, 1000) candidate_dataset = generate_dataset(env, agent, 1000) ngen = 1 for epoch in range(1000): print(f'Epoch: {epoch}') print('---------------') did_pass = agent.update(safety_dataset, candidate_dataset, 1, write=False) if did_pass:
actions.append(a) rewards.append(r) pb_sas.append(action_probs[a]) s = poly.fit_transform(next_state.reshape(1, -1)) rewards[-1] = rewards[-1] * 10 dataset.episodes.append(Episode(states, actions, rewards, pb_sas)) return dataset if __name__ == '__main__': # np.random.seed(RANDOM_SEED) s = env.reset() n_states = poly.fit_transform(s.reshape(1, -1)) agent = Agent(n_states.shape[1], env.action_space.n, delta=0.1, sigma=0.1, is_tabular=False) agent.c = 0 mean_return = 0 did_improve = [] safety_dataset = generate_dataset(env, agent, 1000) candidate_dataset = generate_dataset(env, agent, 1000) ngen = 1 for epoch in range(1000): print(f'Epoch: {epoch}') print('---------------') did_pass = agent.update(safety_dataset, candidate_dataset, 1, write=False) if did_pass: eval_dataset = generate_dataset(env, agent, 1000) gt_estimates = agent.expected_discounted_return(eval_dataset) next_mean_return = np.mean(gt_estimates)
else: success.append(1) rewards[-1] = rewards[-1] * 10 dataset.episodes.append(Episode(states, actions, rewards, pb_sas)) print(f'Success rate: {np.mean(success)}') return dataset if __name__ == '__main__': # np.random.seed(RANDOM_SEED) agent = Agent(env.observation_space.n, env.action_space.n, delta=0.25, sigma=0.01) agent.c = -10 mean_return = 0 did_improve = [] safety_dataset = generate_dataset(env, agent, 5000) candidate_dataset = generate_dataset(env, agent, 5000) ngen = 1 for epoch in range(1000): print(f'Epoch: {epoch}') print('---------------') did_pass = agent.update(safety_dataset, candidate_dataset, 1, write=False) if did_pass: