def test(model):
    game_state = GameState()

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    image_data, reward, terminal = game_state.frame_step(action)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)

    while True:
        # get output from the neural network
        output = model(state)[0]

        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action = action.cuda()

        # get action
        action_index = torch.argmax(output)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action_index = action_index.cuda()
        action[action_index] = 1

        # get next state
        image_data_1, reward, terminal = game_state.frame_step(action)
        image_data_1 = resize_and_bgr2gray(image_data_1)
        image_data_1 = image_to_tensor(image_data_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)

        # set state to be state_1
        state = state_1

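# resize_and_bgr2gray() and image_to_tensor() are used throughout this listing but are
# not defined in it. The sketch below shows what they are assumed to do (crop, resize
# to 84x84, grayscale, binarize, and convert to a CHW float32 tensor); the exact crop
# region and threshold are assumptions, not the original code.
import cv2
import numpy as np
import torch


def resize_and_bgr2gray(image):
    # crop away the ground, resize to 84x84 and convert to grayscale
    image = image[0:288, 0:404]
    image_data = cv2.cvtColor(cv2.resize(image, (84, 84)), cv2.COLOR_BGR2GRAY)
    # binarize so the bird and pipes stand out from the background
    image_data[image_data > 0] = 255
    return np.reshape(image_data, (84, 84, 1))


def image_to_tensor(image):
    # HWC uint8 array -> CHW float32 torch tensor
    image_tensor = image.transpose(2, 0, 1).astype(np.float32)
    image_tensor = torch.from_numpy(image_tensor)
    if torch.cuda.is_available():
        image_tensor = image_tensor.cuda()
    return image_tensor
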
def test(model):
    game_state = GameState()

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    img_data, reward, done = game_state.frame_step(action)
    img_data = preprocess(img_data)
    img_data = convert_img_to_tensor(img_data)
    state = torch.cat((img_data, img_data, img_data, img_data)).unsqueeze(0)

    while True:
        # get Q-values from the neural network and pick the greedy action
        output = model(state)[0]
        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        action_index = torch.argmax(output)
        action[action_index] = 1

        # get next state (note: the new frame, not the old one, must be converted)
        next_img_data, reward, done = game_state.frame_step(action)
        next_img_data = preprocess(next_img_data)
        next_img_data = convert_img_to_tensor(next_img_data)
        next_state = torch.cat((state.squeeze(0)[1:, :, :], next_img_data)).unsqueeze(0)

        state = next_state

def test(model):
    game_state = GameState()

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    image_data, reward, terminal = game_state.frame_step(action)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)

    # evaluate for 10 episodes and track reward statistics
    episode_count = 0
    total_reward = 0
    max_reward = 0
    cur_reward = 0
    rewards = []

    while True:
        if episode_count >= 10:
            break

        # get output from the neural network
        output = model(state)[0]

        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action = action.cuda()

        # get action
        action_index = torch.argmax(output)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action_index = action_index.cuda()
        action[action_index] = 1

        # get next state
        image_data_1, reward, terminal = game_state.frame_step(action)
        if terminal:
            episode_count += 1
            if cur_reward > max_reward:
                max_reward = cur_reward
            rewards.append(cur_reward)
            cur_reward = 0
        if reward > 0.1:
            total_reward += reward
            cur_reward += reward
        image_data_1 = resize_and_bgr2gray(image_data_1)
        image_data_1 = image_to_tensor(image_data_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)

        # set state to be state_1
        state = state_1

    # average reward over the 10 evaluation episodes
    print('reward:', total_reward / 10.0)
    print('max reward:', max_reward)
    print('standard deviation:', np.std(rewards, axis=0))

def test(model):
    game_state = GameState()

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    image_data, reward, terminal = game_state.frame_step(action)
    image_origin = resize_and_bgr(image_data)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)

    while True:
        # get Q-values and the convolutional feature maps from the neural network
        output, conv_output = model(state)

        # average the feature maps into a single activation map and normalize it
        heatmap = np.mean(np.mean(conv_output.detach().numpy(), axis=0), axis=0)
        heatmap = np.maximum(heatmap, 0)
        heatmap /= np.max(heatmap)
        heatmap = cv2.resize(heatmap, (image_origin.shape[1], image_origin.shape[0]))

        # rotate/flip so the heatmap lines up with the rendered frame, then overlay it
        heatmap = np.fliplr(rotate(heatmap, 90 * 3))
        image_origin = np.fliplr(rotate(image_origin, 90 * 3))
        plt.imshow(image_origin)
        plt.imshow(heatmap, cmap=plt.cm.jet, alpha=0.5, interpolation='nearest', vmin=0, vmax=1)
        plt.show()

        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action = action.cuda()

        # get action
        action_index = torch.argmax(output)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action_index = action_index.cuda()
        action[action_index] = 1

        # get next state
        image_data_1, reward, terminal = game_state.frame_step(action)
        image_origin = resize_and_bgr(image_data_1)
        image_data_1 = resize_and_bgr2gray(image_data_1)
        image_data_1 = image_to_tensor(image_data_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)

        # set state to be state_1
        state = state_1

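# The heatmap visualization above assumes that the model's forward pass returns both
# the Q-values and an intermediate convolutional activation map, unlike the other
# variants in this listing. The model class itself is not shown; the sketch below is
# one way such a forward() could look (layer names and sizes are assumptions, not the
# original architecture).
import torch.nn as nn
import torch.nn.functional as F


class NeuralNetworkWithFeatures(nn.Module):
    def __init__(self, number_of_actions=2):
        super().__init__()
        self.number_of_actions = number_of_actions
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(3136, 512)
        self.fc2 = nn.Linear(512, number_of_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        conv_output = F.relu(self.conv3(x))  # feature maps reused for the heatmap
        x = F.relu(self.fc1(conv_output.view(conv_output.size(0), -1)))
        q_values = self.fc2(x)
        return q_values, conv_output
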
def train(model, start):
    # define Adam optimizer
    optimizer = optim.Adam(model.parameters(), lr=1e-6)

    # initialize mean squared error loss
    criterion = nn.MSELoss()

    # instantiate game
    game_state = GameState()

    # initialize replay memory
    replay_memory = []

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    image_data, reward, terminal = game_state.frame_step(action)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)

    # initialize epsilon value
    epsilon = model.initial_epsilon
    iteration = 0
    epsilon_decrements = np.linspace(model.initial_epsilon, model.final_epsilon,
                                     model.number_of_iterations)

    # main training loop
    while iteration < model.number_of_iterations:
        # get output from the neural network
        output = model(state)[0]

        # initialize action
        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action = action.cuda()

        # epsilon greedy exploration
        random_action = random.random() <= epsilon
        if random_action:
            print("Performed random action!")
        action_index = [torch.randint(model.number_of_actions, torch.Size([]), dtype=torch.int)
                        if random_action
                        else torch.argmax(output)][0]
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action_index = action_index.cuda()
        action[action_index] = 1

        # get next state and reward
        image_data_1, reward, terminal = game_state.frame_step(action)
        image_data_1 = resize_and_bgr2gray(image_data_1)
        image_data_1 = image_to_tensor(image_data_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)
        action = action.unsqueeze(0)
        reward = torch.from_numpy(np.array([reward], dtype=np.float32)).unsqueeze(0)

        # save transition to replay memory
        replay_memory.append((state, action, reward, state_1, terminal))

        # if replay memory is full, remove the oldest transition
        if len(replay_memory) > model.replay_memory_size:
            replay_memory.pop(0)

        # epsilon annealing
        epsilon = epsilon_decrements[iteration]

        # sample random minibatch
        minibatch = random.sample(replay_memory, min(len(replay_memory), model.minibatch_size))

        # unpack minibatch
        state_batch = torch.cat(tuple(d[0] for d in minibatch))
        action_batch = torch.cat(tuple(d[1] for d in minibatch))
        reward_batch = torch.cat(tuple(d[2] for d in minibatch))
        state_1_batch = torch.cat(tuple(d[3] for d in minibatch))

        if torch.cuda.is_available():  # put on GPU if CUDA is available
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            state_1_batch = state_1_batch.cuda()

        # get output for the next state
        output_1_batch = model(state_1_batch)

        # set y_j to r_j for terminal state, otherwise to r_j + gamma*max(Q)
        y_batch = torch.cat(tuple(reward_batch[i] if minibatch[i][4]
                                  else reward_batch[i] + model.gamma * torch.max(output_1_batch[i])
                                  for i in range(len(minibatch))))

        # extract Q-value
        q_value = torch.sum(model(state_batch) * action_batch, dim=1)

        # PyTorch accumulates gradients by default, so they need to be reset in each pass
        optimizer.zero_grad()

        # returns a new Tensor, detached from the current graph, the result will never require gradient
        y_batch = y_batch.detach()

        # calculate loss
        loss = criterion(q_value, y_batch)

        # do backward pass
        loss.backward()
        optimizer.step()

        # set state to be state_1
        state = state_1
        iteration += 1

        if iteration % 25000 == 0:
            torch.save(model, "pretrained_model/current_model_" + str(iteration) + ".pth")

        print("iteration:", iteration, "elapsed time:", time.time() - start,
              "epsilon:", epsilon, "action:", action_index.cpu().detach().numpy(),
              "reward:", reward.numpy()[0][0],
              "Q max:", np.max(output.cpu().detach().numpy()))

def train():
    env = GameState()

    # num_inputs = env.observation_space.shape[0]
    num_inputs = 3136
    num_actions = 2
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = DRQN(num_inputs, num_actions)
    target_net = DRQN(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    if torch.cuda.is_available():  # put on GPU if CUDA is available
        online_net = online_net.cuda()
        target_net = target_net.cuda()
    online_net.train()
    target_net.train()

    memory = Memory(replay_memory_capacity)
    epsilon = 1.0
    loss = 0
    iteration = 0

    while iteration < 2000000:
        done = False

        # start each episode with the "do nothing" action
        action = torch.zeros([2], dtype=torch.float32)
        action[0] = 1
        image_data, reward, done = env.frame_step(action)
        image_data = resize_and_bgr2gray(image_data)
        image_data = image_to_tensor(image_data)
        state = image_data
        state = torch.Tensor(state)
        if torch.cuda.is_available():
            state = state.cuda()

        hidden = None

        while not done:
            # epsilon-greedy action from the target network, carrying the recurrent hidden state
            action, hidden, action_index = get_action(state, target_net, epsilon, env, hidden)

            image_data, reward, done = env.frame_step(action)
            image_data = resize_and_bgr2gray(image_data)
            image_data = image_to_tensor(image_data)
            next_state = image_data
            next_state = torch.Tensor(next_state)
            if torch.cuda.is_available():
                next_state = next_state.cuda()

            mask = 0 if done else 1
            reward = reward if not done else -1
            memory.push(state, next_state, action_index, reward, mask)

            state = next_state

            if iteration > initial_exploration and len(memory) > batch_size:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = DRQN.train_model(online_net, target_net, optimizer, batch)

                if iteration % update_target == 0:
                    print('iteration: {}, update model'.format(iteration))
                    update_target_model(online_net, target_net)

            iteration += 1
            if iteration % 25000 == 0:
                torch.save(online_net, "pretrained_model/current_model_" + str(iteration) + ".pth")
                print('iteration: {}'.format(iteration))

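# update_target_model() is called above but not defined in this listing. In the usual
# hard-update scheme it simply copies the online network's weights into the target
# network; a minimal sketch under that assumption:
def update_target_model(online_net, target_net):
    # hard update: overwrite the target network's parameters with the online network's
    target_net.load_state_dict(online_net.state_dict())
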
def test():
    cuda_is_available = torch.cuda.is_available()

    env = GameState()

    # num_inputs = env.observation_space.shape[0]
    num_inputs = 3136
    num_actions = 2
    print('state size:', num_inputs)
    print('action size:', num_actions)

    model = torch.load(
        'pretrained_model/current_model_2000000.pth',
        map_location='cpu' if not cuda_is_available else None
    ).eval()

    if torch.cuda.is_available():  # put on GPU if CUDA is available
        model = model.cuda()

    # initial action is do nothing
    action = torch.zeros([2], dtype=torch.float32)
    action[0] = 1
    image_data, reward, done = env.frame_step(action)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = image_data
    state = torch.Tensor(state)
    if torch.cuda.is_available():
        state = state.cuda()

    hidden = None

    # evaluate for 10 episodes and track reward statistics
    episode_count = 0
    total_reward = 0
    max_reward = 0
    cur_reward = 0
    rewards = []

    while True:
        if episode_count >= 10:
            break

        # act greedily (epsilon = 0)
        action, hidden, action_index = get_action(state, model, 0, env, hidden)

        image_data, reward, done = env.frame_step(action)
        image_data = resize_and_bgr2gray(image_data)
        image_data = image_to_tensor(image_data)
        next_state = image_data
        next_state = torch.Tensor(next_state)
        if torch.cuda.is_available():
            next_state = next_state.cuda()

        state = next_state

        if done:
            episode_count += 1
            if cur_reward > max_reward:
                max_reward = cur_reward
            rewards.append(cur_reward)
            cur_reward = 0
        if reward > 0.1:
            total_reward += reward
            cur_reward += reward

    # average reward over the 10 evaluation episodes
    print('reward:', total_reward / 10.0)
    print('max reward:', max_reward)
    print('standard deviation:', np.std(rewards, axis=0))

def train(model):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # instantiate game
    game_state = GameState()
    agentBird = AgentBird(ACTION_NUM, model)

    # initial action is do nothing
    # [1, 0] represents "Do nothing"
    # [0, 1] represents "Fly up"
    action = torch.zeros([ACTION_NUM], dtype=torch.float32)
    action[0] = 1
    state_image, reward, terminal, state_score = game_state.frame_step(action)
    image = preprocess_image(state_image)
    state = torch.cat((image, image, image, image)).unsqueeze(0)

    run = 1
    i = 0
    while True:  # for i in range(NUM_ITERATIONS):
        # Select and perform an action
        action_idx, q_values = agentBird.predict_action(state)
        action = torch.zeros([ACTION_NUM], dtype=torch.float32)
        action[action_idx] = 1

        # get next state and reward
        state_image_1, reward, terminal, state_score = game_state.frame_step(action)
        image_1 = preprocess_image(state_image_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_1)).unsqueeze(0)

        # keep the raw reward for logging and wrap it in a 1-element tensor for the agent
        score = reward
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        action = action.unsqueeze(0)
        action = action.to(device)

        # Store the transition in memory
        agentBird.add_memory(state, action, reward, state_1, terminal)
        agentBird.update_exploration_rate(i)

        # Perform one step of the optimization (on the target network)
        loss = agentBird.experience_replay()

        print("iteration " + str(i) +
              ", exploration: " + str(agentBird.exploration_rate) +
              ", Q max:" + str(np.max(q_values.cpu().detach().numpy())) +
              ", action:" + str(action_idx) +
              ", reward:" + str(score))
        writer.add_scalar('Q_value', np.max(q_values.cpu().detach().numpy()), i)
        writer.add_scalar('loss', float(loss), i)

        # Move to the next state
        state = state_1
        i += 1

        if terminal:
            run += 1
            print("episode " + str(run) + ", Score: " + str(state_score))
            writer.add_scalar('Score', state_score, run)

        if i % 50000 == 0:
            date_today = date.today()
            curr_time = datetime.now()
            formatted_time = curr_time.strftime('%H%M%S')
            save_model = agentBird.return_model()
            torch.save(save_model,
                       "pretrained_model/easy_model_" + str(i) + "_" + str(date_today) +
                       "_" + str(formatted_time) + ".pth")

def train(model, start):
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()

    # game environment and replay memory
    game_state = GameState()
    replay_memory = []

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    img_data, reward, done = game_state.frame_step(action)
    img_data = preprocess(img_data)
    img_data = convert_img_to_tensor(img_data)
    state = torch.cat((img_data, img_data, img_data, img_data)).unsqueeze(0)

    # linearly anneal epsilon from epsilon1 to epsilon2
    epsilon = model.epsilon1
    iteration = 0
    epsilon_decrements = np.linspace(model.epsilon1, model.epsilon2, model.number_of_iterations)

    while iteration < model.number_of_iterations:
        output = model(state)[0]

        action = torch.zeros([model.number_of_actions], dtype=torch.float32)

        # epsilon greedy exploration
        random_action = random.random() <= epsilon
        if random_action:
            print("Performed random action!")
        action_index = [torch.randint(model.number_of_actions, torch.Size([]), dtype=torch.int)
                        if random_action
                        else torch.argmax(output)][0]
        action[action_index] = 1

        # get next state and reward
        next_img_data, reward, done = game_state.frame_step(action)
        next_img_data = preprocess(next_img_data)
        next_img_data = convert_img_to_tensor(next_img_data)
        next_state = torch.cat((state.squeeze(0)[1:, :, :], next_img_data)).unsqueeze(0)
        action = action.unsqueeze(0)
        reward = torch.from_numpy(np.array([reward], dtype=np.float32)).unsqueeze(0)

        # save transition to replay memory, dropping the oldest when full
        replay_memory.append((state, action, reward, next_state, done))
        if len(replay_memory) > model.replay_memory_size:
            replay_memory.pop(0)

        # epsilon annealing
        epsilon = epsilon_decrements[iteration]

        # sample and unpack a random minibatch
        batch = random.sample(replay_memory, min(len(replay_memory), model.minibatch_size))
        state_memory = torch.cat(tuple(d[0] for d in batch))
        action_memory = torch.cat(tuple(d[1] for d in batch))
        reward_memory = torch.cat(tuple(d[2] for d in batch))
        next_state_memory = torch.cat(tuple(d[3] for d in batch))

        # target: r for terminal transitions, otherwise r + gamma * max(Q(s', a'))
        output_memory = model(next_state_memory)
        y_memory = torch.cat(tuple(reward_memory[i] if batch[i][4]
                                   else reward_memory[i] + model.gamma * torch.max(output_memory[i])
                                   for i in range(len(batch))))

        # Q-value of the chosen action
        q_value = torch.sum(model(state_memory) * action_memory, dim=1)

        optimizer.zero_grad()
        y_memory = y_memory.detach()
        loss = criterion(q_value, y_memory)
        loss.backward()
        optimizer.step()

        state = next_state
        iteration += 1

        print("iteration:", iteration, "elapsed time:", time.time() - start,
              "epsilon:", epsilon, "action:", action_index.cpu().detach().numpy(),
              "reward:", reward.numpy()[0][0],
              "Q max:", np.max(output.cpu().detach().numpy()))