예제 #1
0
def train_snake(*, number_of_cells=10, starting_position=(4, 5),
                max_steps_allowed=1000, episodes=30000):
    """Train a DQN agent on the snake environment and save the model.

    For every episode the environment is reset, the agent picks actions
    until the environment reports ``done`` (or the step cap is hit), and
    each transition is stored and followed by one ``agent.train()`` call.
    A per-episode summary is printed and appended to the ``training_data``
    file; after the last episode the model is saved to
    ``trained_snake.model``.

    All parameters are keyword-only and default to the values that were
    previously hard-coded, so ``train_snake()`` behaves as before.

    Args:
        number_of_cells: Value passed to ``Environment(...)`` (the original
            author's note says this is the number of cells on each axis).
        starting_position: Value passed to ``env.reset(...)`` at the start
            of every episode (the original note calls it the head position).
        max_steps_allowed: Hard cap on steps per episode; the episode is
            ended once this many steps have been taken.
        episodes: Number of training episodes to run.
    """
    env = Environment(number_of_cells)
    agent = DQNAgent(state_size=env.state_size,
                     action_size=Actions.action_size,
                     batch_size=32,
                     memory_limit=6000,
                     number_of_channels=5)

    # Epsilon schedule (explore vs. exploit): epsilon is lowered by this
    # amount after each episode while it is still above 0.1, i.e. a drop of
    # 0.9 takes half of the episodes. Guarded so that episodes <= 0 (no
    # training at all) does not raise ZeroDivisionError.
    decay = 0.9 / episodes * 2 if episodes > 0 else 0.0

    with open('training_data', 'w') as f:

        for e in range(episodes):

            state = env.reset(starting_position)

            # The agent turns the raw environment state into the stacked
            # representation it feeds to the network; the stack is cleared
            # at the start of each episode.
            agent.reset_convolutional_layers()
            full_state = agent.get_convolutional_layers(state)
            loss = 0.0
            steps = 0
            done = False
            episode_reward = 0

            while not done:

                action = agent.get_action(full_state)

                # Step onto the next state.
                next_state, reward, done = env.step(action)

                # Original author's note: the full next state is stored in
                # (1, H, W, C) layout -- not verifiable from this function.
                full_next_state = agent.get_convolutional_layers(next_state)

                # Save (S, A, R, S', done) to the replay memory.
                agent.save_transition(full_state, action, reward,
                                      full_next_state, done)
                episode_reward += reward

                # Train from stored experience and accumulate the loss for
                # the per-episode report.
                # NOTE(review): assumes agent.train() always returns a
                # number -- confirm it never returns None.
                loss += agent.train()
                full_state = full_next_state

                # Limit the episode length so a non-terminating episode
                # cannot run forever. The transition saved above keeps the
                # environment's own `done` flag, not this forced one.
                steps += 1
                if steps >= max_steps_allowed:
                    done = True

            # Next episode: the agent slowly reduces exploring.
            if agent.epsilon > 0.1:
                agent.epsilon -= decay

            # NOTE(review): the `d` format codes require episode_reward and
            # env.fruits_eaten to be integers -- confirm rewards are ints.
            print(
                'episode: {:5d} steps: {:3d} epsilon: {:.3f} loss: {:8.4f} reward: {:3d} fruits: {:2d}'
                .format(e, steps, agent.epsilon, loss, episode_reward,
                        env.fruits_eaten))
            f.write('{:5d} {:3d} {:8.4f} {:4d} {:2d}\n'.format(
                e, steps, loss, episode_reward, env.fruits_eaten))

        agent.model.save('trained_snake.model')
예제 #2
0
파일: train.py 프로젝트: zalzala/rl-intro
# Training script excerpt: builds a DQNAgent sized from the environment's
# action and observation spaces, then runs EPISODES episodes of at most
# EPISODE_LENGTH steps each, recording every transition with the agent.
# `environment`, `DQNAgent`, `np`, EPISODES and EPISODE_LENGTH are defined
# outside this excerpt.
n_actions = environment.action_space.n
n_state_features = environment.observation_space.shape[0]

# Initialize DQN agent
agent = DQNAgent(n_state_features, n_actions)

for episode in range(EPISODES):

    # Start a new episode and reshape the observation to a single-row
    # (1, n_state_features) array before handing it to the agent.
    # NOTE(review): assumes reset() returns just the observation (not an
    # (observation, info) pair) -- confirm against the environment API in use.
    state = environment.reset()
    state = np.reshape(state, [1, n_state_features])

    for t in range(EPISODE_LENGTH):

        # Predict next action using NN Value Function Approximation
        action = agent.get_action(state)

        # Interact with the environment and observe new state and reward
        # NOTE(review): unpacks a 4-tuple from step(); `info` is not used
        # anywhere in this excerpt.
        next_state, reward, terminated, info = environment.step(action)

        # Huge negative reward if failed
        # (overrides whatever reward the environment returned for the
        # terminating step)
        if terminated:
            reward = -100

        # Remember agent's experience: state / action / reward / next state
        # The next observation is reshaped the same way as `state` so both
        # stored states have the same (1, n_state_features) shape.
        next_state = np.reshape(next_state, [1, n_state_features])
        agent.remember(state, action, reward, next_state, terminated)

        # Change the current state
        # NOTE(review): no `break` on `terminated` and no training call are
        # visible in this excerpt -- presumably they follow below; confirm.
        state = next_state