Exemplo n.º 1
0
def test(opt, model_dir):
    frame_number = 0
    action_sum = 0
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if torch.cuda.is_available():
        model = torch.load("{}/flappy_bird_2000000".format(model_dir))
    else:
        model = torch.load("{}/flappy_bird".format(opt.saved_path),
                           map_location=lambda storage, loc: storage)
    model.eval()
    game_state = FlappyBird()
    image, reward, terminal, score = game_state.next_frame(0)
    image = pre_processing(
        image[:game_state.screen_width, :int(game_state.base_y)],
        opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    flag = True
    while flag:
        frame_number += 1
        prediction = model(state)[0]
        action = torch.argmax(prediction).item()
        action_sum += action
        next_image, reward, terminal, score = game_state.next_frame(action)
        if terminal or score == 50:
            flag = False
            break
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]

        state = next_state

    return score, frame_number, action_sum
Exemplo n.º 2
0
def test(opt, testname, result):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if torch.cuda.is_available():
        model = torch.load(('{}/' + testname).format(opt.saved_path))
    else:
        model = torch.load("{}/ns_shaft".format(opt.saved_path),
                           map_location=lambda storage, loc: storage)
    model.eval()
    game_state = NS_SHAFT()
    image, reward, terminal = game_state.next_frame(0)
    image = pre_processing(image[:game_state.screen_width, :int(400)],
                           opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    total_score = 0
    while terminal == False:
        prediction = model(state)
        action = (prediction.data.max(1)[1].item() - 1) * 4

        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(400)], opt.image_size,
            opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        total_score += 1
        state = next_state
    result.append(total_score)
    print(total_score)
Exemplo n.º 3
0
def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if torch.cuda.is_available():
        model = torch.load("{}/flappy_bird".format(opt.saved_path))
    else:
        model = torch.load("{}/flappy_bird".format(opt.saved_path),
                           map_location=lambda storage, loc: storage)
    model.eval()
    game_state = FlappyBird()
    image, reward, terminal = game_state.next_frame(0)
    image = pre_processing(
        image[:game_state.screen_width, :int(game_state.base_y)],
        opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    while True:
        prediction = model(state)[0]
        action = torch.argmax(prediction)[0]

        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]

        state = next_state
Exemplo n.º 4
0
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    model_target = DeepQNetwork()
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    game_state = FlappyBird()
    image, reward, terminal, score = game_state.next_frame(0)
    image = pre_processing(
        image[:game_state.screen_width, :int(game_state.base_y)],
        opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        model_target.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    model_target.eval()
    replay_memory = []
    iter = 0
    while iter < opt.num_iters:
        prediction = model(state)[0]
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iter) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            #print("Perform a random action")
            action = randint(0, 1)
        else:
            action = torch.argmax(prediction).item()

        next_image, reward, terminal, score = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(
            *batch)

        state_batch = torch.cat(tuple(state for state in state_batch))
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1]
                      for action in action_batch],
                     dtype=np.float32))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(state
                                           for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(state_batch)
        next_prediction_batch = model_target(next_state_batch)

        y_batch = torch.cat(
            tuple(reward if terminal else reward +
                  opt.gamma * prediction[max_action]
                  for reward, terminal, prediction, max_action in zip(
                      reward_batch, terminal_batch, next_prediction_batch,
                      torch.argmax(model(next_state_batch), axis=1))))

        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)
        optimizer.zero_grad()
        # y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()

        state = next_state

        if iter % opt.target_update_freq == 0:
            model_target.load_state_dict(model.state_dict())

        iter += 1
        if iter % 100 == 0:
            print(
                "Test::Double Q: Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
                .format(iter + 1, opt.num_iters, action, loss, epsilon, reward,
                        torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iter)
        writer.add_scalar('Train/score', score, iter)
        if (iter + 1) % 1000000 == 0:
            torch.save(model,
                       "{}/flappy_bird_{}".format(opt.saved_path, iter + 1))
    torch.save(model, "{}/flappy_bird".format(opt.saved_path))
Exemplo n.º 5
0
def train(opt):
    # Set random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(opt.random_seed)
    else:
        torch.manual_seed(opt.random_seed)
    # Instantiate the model
    if opt.conv_dim is not None and \
       opt.conv_kernel_sizes is not None and \
       opt.conv_strides is not None and \
       opt.fc_dim is not None:
        model = DeepQNetwork(opt.image_size,
                             opt.image_size,
                             conv_dim=opt.conv_dim,
                             conv_kernel_sizes=opt.conv_kernel_sizes,
                             conv_strides=opt.conv_strides,
                             fc_dim=opt.fc_dim)
    else:
        model = DeepQNetwork(opt.image_size, opt.image_size)

    if opt.log_comet_ml:
        # Create a Comet.ml experiment
        experiment = Experiment(api_key=opt.comet_ml_api_key,
                                project_name=opt.comet_ml_project_name,
                                workspace=opt.comet_ml_workspace)
        experiment.log_other("iters_to_save", opt.iters_to_save)
        experiment.log_other("completed", False)
        experiment.log_other("random_seed", opt.random_seed)

        # Report hyperparameters to Comet.ml
        hyper_params = {
            "image_size": opt.image_size,
            "batch_size": opt.batch_size,
            "optimizer": opt.optimizer,
            "learning_rate": opt.lr,
            "gamma": opt.gamma,
            "initial_epsilon": opt.initial_epsilon,
            "final_epsilon": opt.final_epsilon,
            "num_iters": opt.num_iters,
            "replay_memory_size": opt.replay_memory_size,
            "random_seed": opt.random_seed,
            "conv_dim": opt.conv_dim,
            "conv_kernel_sizes": opt.conv_kernel_sizes,
            "conv_strides": opt.conv_strides,
            "fc_dim": opt.fc_dim
        }
        experiment.log_parameters(hyper_params)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-6)  # Optimization algorithm
    criterion = nn.MSELoss()  # Loss function
    game_state = FlappyBird()  # Instantiate the Flappy Compass game
    image, reward, terminal = game_state.next_frame(
        0
    )  # Get the next image, along with its reward and an indication if it's a terminal state

    # Image preprocessing step (scaling, color removal and convertion to a PyTorch tensor)
    image = pre_processing(
        image[:game_state.screen_width, :int(game_state.base_y)],
        opt.image_size, opt.image_size)
    image = torch.from_numpy(image)

    # Move the model and the current image data to the GPU, if available
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()

    # Prepare the state variable, which will host the last 4 frames
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    # Initialize the replay memory, which saves sets of consecutive game states, the reward and terminal state indicator
    # so that the model can learn from them (essentially constitutes the training data, which grows with every new iteration)
    replay_memory = []

    iter = 0  # Iteration counter

    # Main training loop performing the number of iterations specified by num_iters
    while iter < opt.num_iters:
        prediction = model(state)[0]  # Get a prediction from the current state
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iter) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters
        )  # Set the decay of the probability of random actions
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(0, 1)
        else:
            # Use the model's prediction to decide the next action
            action = torch.argmax(prediction).item()

        # Get a new frame and process it
        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)

        # Move the next image data to the GPU, if available
        if torch.cuda.is_available():
            next_image = next_image.cuda()

        next_state = torch.cat(
            (state[0, 1:, :, :], next_image)
        )[None, :, :, :]  # Prepare the next state variable, which will host the last 4 frames
        replay_memory.append(
            [state, action, reward, next_state, terminal]
        )  # Save the current state, action, next state and terminal state indicator in the replay memory
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[
                0]  # Delete the oldest reolay from memory if full capacity has been reached
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size)
                       )  # Retrieve past play sequences from the replay memory
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(
            *batch)

        state_batch = torch.cat(tuple(
            state for state in state_batch))  # States of the current batch
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1]
                      for action in action_batch],
                     dtype=np.float32))  # Actions taken in the current batch
        reward_batch = torch.from_numpy(
            np.array(reward_batch,
                     dtype=np.float32)[:,
                                       None])  # Rewards in the current batch
        next_state_batch = torch.cat(tuple(
            state
            for state in next_state_batch))  # Next states of the current batch

        # Move batch data to the GPU, if available
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        current_prediction_batch = model(
            state_batch
        )  # Predictions of the model for the replays of the current batch
        next_prediction_batch = model(
            next_state_batch
        )  # Next predictions of the model for the replays of the current batch

        # Set ground truth for the rewards for the current batch, considering whether the state is terminal or not
        y_batch = torch.cat(
            tuple(reward if terminal else reward +
                  opt.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(
                      reward_batch, terminal_batch, next_prediction_batch)))

        q_value = torch.sum(
            current_prediction_batch * action_batch, dim=1
        )  # Predicted Q values (i.e. estimated return for each action)
        optimizer.zero_grad(
        )  # Reset the gradients to zero before a new optimization step
        loss = criterion(q_value, y_batch)  # Calculate the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Weights optimization step

        state = next_state  # Move to the next frame
        iter += 1
        print(
            "Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
            .format(iter + 1, opt.num_iters, action, loss, epsilon, reward,
                    torch.max(prediction)))

        if opt.log_comet_ml:
            # Log metrics to Comet.ml
            experiment.log_metric("train_loss", loss, step=iter)
            experiment.log_metric("train_epsilon", epsilon, step=iter)
            experiment.log_metric("train_reward", reward, step=iter)
            experiment.log_metric("train_Q_value",
                                  torch.max(prediction),
                                  step=iter)

        if (iter + 1) % opt.iters_to_save == 0:
            # Get the current day and time to attach to the saved model's name
            current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')

            # Set saved model name
            model_filename = f'{opt.saved_path}/flappy_compass_{current_datetime}_{iter+1}.pth'

            # Save model every iters_to_save iterations
            torch.save(model, model_filename)

            if opt.log_comet_ml and opt.comet_ml_save_model:
                # Upload model to Comet.ml
                experiment.log_asset(file_path=model_filename, overwrite=True)

    # Get the current day and time to attach to the saved model's name
    current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')

    # Set saved model name
    model_filename = f'{opt.saved_path}/flappy_compass_{current_datetime}_{iter+1}.pth'

    # Save the model after reaching the final iteration
    torch.save(model, model_filename)

    if opt.log_comet_ml:
        # Only report that the experiment completed successfully if it finished the training without errors
        experiment.log_other("completed", True)

        if opt.comet_ml_save_model:
            # Upload model to Comet.ml
            experiment.log_asset(file_path=model_filename, overwrite=True)
Exemplo n.º 6
0
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    game_state = NS_SHAFT()
    image, reward, terminal = game_state.next_frame(0)
    imgplot = plt.imshow(image)
    image = pre_processing(image[:game_state.screen_width, :int(400)],
                           opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    replay_memory = []
    iter = 0
    while iter < opt.num_iters:
        #plt.plot([1,2,3,4])

        prediction = model(state)
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iter) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(-1, 1) * 4
        else:
            #print('prediction',prediction)
            action = (prediction.data.max(1)[1].item() - 1) * 4
            #print('a',action)

        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(400)], opt.image_size,
            opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(
            *batch)

        state_batch = torch.cat(tuple(state for state in state_batch))
        action_batch_tmp = []
        for action in action_batch:
            if action == -4:
                action_batch_tmp.append([1, 0, 0])
            elif action == 0:
                action_batch_tmp.append([0, 1, 0])
            else:
                action_batch_tmp.append([0, 0, 1])
        action_batch = torch.from_numpy(
            np.array(action_batch_tmp, dtype=np.float32))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(state
                                           for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(state_batch)
        next_prediction_batch = model(next_state_batch)

        y_batch = torch.cat(
            tuple(reward if terminal else reward +
                  opt.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(
                      reward_batch, terminal_batch, next_prediction_batch)))

        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)
        optimizer.zero_grad()
        # y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()

        state = next_state
        iter += 1

        print(
            "Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
            .format(iter + 1, opt.num_iters, action, loss, epsilon, reward,
                    torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iter)
        if iter + 1 == opt.num_iters:
            torch.save(model, "{}/ns_shaft_{}".format(opt.saved_path,
                                                      iter + 1))
    torch.save(model, "{}/ns_shaft".format(opt.saved_path))