Example #1
import numpy as np


class CartPole:
    """Wraps CartPoleEnv and scores a 5-parameter linear policy by episode reward."""

    def __init__(self, gravity):
        self.dim = 5  # 4 weights + 1 bias for the linear policy
        self.env = CartPoleEnv()
        self.env.gravity = gravity

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def action(self, observation, x):
        x = x * 10 - 5  # rescale parameters from [0, 1] to [-5, 5]
        w = x[:4]
        b = x[4]
        return int(self.sigmoid(np.sum(observation * w) + b) > 0.5)

    def fitness(self, x):
        fitness = 0
        observation = self.env.reset()
        for t in range(200):
            action = self.action(observation, x)
            observation, reward, done, info = self.env.step(action)
            fitness += reward
            if done:
                break
        return -fitness  # negated episode reward, so lower is better

    def __del__(self):
        self.env.close()
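
A minimal usage sketch (not part of the original example; assumes the class above and a working CartPoleEnv): since fitness() returns the negated episode reward and x is rescaled internally from [0, 1] to [-5, 5], the wrapper can be driven by any minimiser over the unit hypercube, e.g. plain random search:

problem = CartPole(gravity=9.8)
best_x, best_f = None, float('inf')
for _ in range(100):
    x = np.random.rand(problem.dim)  # candidate policy parameters in [0, 1]^5
    f = problem.fitness(x)           # negated episode reward (lower is better)
    if f < best_f:
        best_x, best_f = x, f
print('Best episode reward:', -best_f)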
Example #2
    def test_step_return_type(self):
        """Check if a 3-tuple of list, float, and bool is returned"""
        env = CartPoleEnv()
        env.reset()
        obs, reward, done = env.step(action=-1)

        self.assertIsInstance(obs, list)
        self.assertIsInstance(reward, float)
        self.assertIsInstance(done, bool)
Example #3
class TestCartPoleEnv(unittest.TestCase):
    def setUp(self):
        self.env = CartPoleEnv()

    @unittest.skip('Skipping due to cartpole.out')
    def test_reset_return_type(self):
        """Check if a list is returned"""
        self.assertIsInstance(self.env.reset(), list)

    @unittest.skip('Skipping due to cartpole.out')
    def test_step_return_type(self):
        """Check if a 3-tuple of list, float, and bool is returned"""
        env = CartPoleEnv()
        env.reset()
        obs, reward, done = env.step(action=-1)

        self.assertIsInstance(obs, list)
        self.assertIsInstance(reward, float)
        self.assertIsInstance(done, bool)

    @unittest.skip('Skipping due to cartpole.out')
    def test_obs_dim_return_type(self):
        """Check if an integer is returned"""
        self.assertIsInstance(self.env.obs_dim(), int)

    @unittest.skip('Skipping due to cartpole.out')
    def test_reset_return_dims(self):
        """Check if a 4-dimensional list is returned"""
        self.assertEqual(len(self.env.reset()), 4)

    @unittest.skip('Skipping due to cartpole.out')
    def test_obs_dim_return_value(self):
        """Check if 4 is returned"""
        env = CartPoleEnv()
        env.reset()
        self.assertEqual(env.obs_dim(), 4)

    def test_step_wrong_input(self):
        """Check if assertion is raised with wrong input"""
        with self.assertRaises(AssertionError):
            self.env.step(43892.42)

    @unittest.skip('Skipping due to cartpole.out')
    def test_done_signal_per_episode(self):
        """Check if done signal is triggered at the end of the episode"""
        env = CartPoleEnv()
        env.reset()

        for t in range(10):
            _, _, done = env.step(action=-1)
            if t != 9:
                # Must be False before the final step
                self.assertFalse(done)

        # Must be true at the end of the episode
        self.assertTrue(done)
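
To run the suite above as a standalone module, unittest's usual entry point can be appended (a minimal sketch, assuming the test class lives in its own file):

if __name__ == '__main__':
    unittest.main()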
Example #4
import csv
import random
import sys


def main():
    # Build the command-line parser
    parser = build_parser()
    options = parser.parse_args()

    # Set random seed
    random.seed(options.random_seed)

    # Create the cross-entropy method optimiser
    cem = CrossEntropyMethod(N=options.n, p=options.p)
    # Create environment object
    env = CartPoleEnv()
    # Create linear model
    model = LinearModel(dims=env.obs_dim())

    # Initialize parameters
    params = model.params

    # Episode scores
    win_ratio_list = []

    successful_episodes = 0
    for i_episode in range(options.episodes):
        sys.stderr.write('\n###### Episode {} of {} ###### \n'.format(i_episode+1, options.episodes))

        # Sample N parameter vectors
        noisy_params = cem.sample_parameters(params)
        # Evaluate the sampled vectors
        rewards = [noisy_evaluation(model, env, options.step_size, i) for i in noisy_params]
        # Get elite parameters based on reward
        elite_params = cem.get_elite_parameters(noisy_params, rewards)
        # Update parameters
        params = cem.get_parameter_mean(elite_params)
        episode_reward = run_episode(model=update_model(model, params), env=env,
                                     steps=options.step_size, print_step=options.print_step)
        win_ratio = episode_reward / options.step_size
        sys.stderr.write('Episode reward: {} ({:.2%})\n'.format(episode_reward, win_ratio))
        # Save win_ratio
        win_ratio_list.append(win_ratio)

        if episode_reward >= options.step_size:
            successful_episodes += 1

    sys.stderr.write('\nFinal params: {}'.format(model.params))
    sys.stderr.write('\nRun finished. {} out of {} episodes ({:.2%}) have a reward of at least {}\n'.format(successful_episodes, options.episodes, successful_episodes / options.episodes, options.step_size))

    # If output_file is given, write scores to disk
    if options.output_file:
        sys.stderr.write('\nWriting scores to file: {}.csv...\n'.format(options.output_file))
        with open(options.output_file + '.csv', 'w', newline='') as f:
            wr = csv.writer(f)
            wr.writerow(win_ratio_list)
        sys.stderr.write('Done!\n')

    # Terminate the host program
    env.terminate()
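
The noisy_evaluation helper called in main() is not shown here. A plausible sketch, assuming the same update_model and run_episode helpers used above (and that run_episode's print_step argument is optional):

def noisy_evaluation(model, env, steps, params):
    """Evaluate one sampled parameter vector and return its episode reward."""
    return run_episode(model=update_model(model, params), env=env, steps=steps)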
Example #5
    def test_done_signal_per_episode(self):
        """Check if done signal is triggered at the end of the episode"""
        env = CartPoleEnv()
        env.reset()

        for t in range(10):
            _, _, done = env.step(action=-1)
            if t != 9:
                # Must be False before the final step
                self.assertFalse(done)

        # Must be true at the end of the episode
        self.assertTrue(done)
Example #6
import torch


def evaluate_agent(agent, episodes, return_trajectories=False, seed=1):
    """Greedily roll out the agent for the given number of episodes and return
    per-episode returns (and optionally the collected trajectories)."""
    env = CartPoleEnv()
    env.seed(seed)

    returns, trajectories = [], []
    for _ in range(episodes):
        states, actions, rewards = [], [], []
        state, terminal = env.reset(), False
        while not terminal:
            with torch.no_grad():
                policy, _ = agent(state)
                action = policy.logits.argmax(dim=-1)  # Pick action greedily
                state, reward, terminal = env.step(action)

                if return_trajectories:
                    states.append(state)
                    actions.append(action)
                rewards.append(reward)
        returns.append(sum(rewards))
        if return_trajectories:
            # Collect trajectory data (including terminal signal, which may be needed for offline learning)
            terminals = torch.cat(
                [torch.ones(len(rewards) - 1),
                 torch.zeros(1)])
            trajectories.append(
                dict(states=torch.cat(states),
                     actions=torch.cat(actions),
                     rewards=torch.tensor(rewards, dtype=torch.float32),
                     terminals=terminals))

    return (returns, trajectories) if return_trajectories else returns
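
Hypothetical usage (assumes an agent such as the ActorCritic model set up in Example #8):

returns = evaluate_agent(agent, episodes=10)
mean_return = sum(returns) / len(returns)
returns, trajectories = evaluate_agent(agent, episodes=10, return_trajectories=True)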
Example #7
 def __init__(self, gravity):
     self.dim = 5
     self.env = CartPoleEnv()
     self.env.gravity = gravity
Example #8
parser.add_argument('--imitation-epochs',
                    type=int,
                    default=5,
                    metavar='IE',
                    help='Imitation learning epochs')
parser.add_argument('--imitation-replay-size',
                    type=int,
                    default=1,
                    metavar='IRS',
                    help='Imitation learning trajectory replay size')
args = parser.parse_args()
torch.manual_seed(args.seed)
os.makedirs('results', exist_ok=True)

# Set up environment and models
env = CartPoleEnv()
env.seed(args.seed)
agent = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                    args.hidden_size)
agent_optimiser = optim.RMSprop(agent.parameters(), lr=args.learning_rate)
if args.imitation:
    # Set up expert trajectories dataset
    expert_trajectories = torch.load('expert_trajectories.pth')
    expert_trajectories = {
        k: torch.cat([trajectory[k] for trajectory in expert_trajectories],
                     dim=0)
        for k in expert_trajectories[0].keys()
    }  # Flatten expert trajectories
    expert_trajectories = TransitionDataset(expert_trajectories)
    # Set up discriminator
    if args.imitation in ['AIRL', 'GAIL']:
Example #9
 def test_obs_dim_return_value(self):
     """Check if 4 is returned"""
     env = CartPoleEnv()
     env.reset()
     self.assertEqual(env.obs_dim(), 4)
Example #10
 def setUp(self):
     self.env = CartPoleEnv()