Example #1
                # terminal move: store None as the next state
                samples[i].append(None)
            memory.add_sample(samples[i])

        # draw a random minibatch of (state, action, reward, next_state) transitions
        sample_batch = memory.sample_samples(batch_size)
        actual_batch_size = len(sample_batch)
        state_batch = np.zeros((actual_batch_size, 9))
        next_state_batch = np.zeros((actual_batch_size, 9))
        action_batch = [sample[1] for sample in sample_batch]
        
        for i, sample in enumerate(sample_batch):
            state_batch[i] = sample[0]
            # terminal transitions keep an all-zero next_state row
            if sample[3] is not None:
                next_state_batch[i] = sample[3]
            
        # current Q-value estimates for every square of each sampled state
        qsa_batch = model.predict_batch(state_batch, sess)

        for i in range(actual_batch_size):
            # an already-occupied square is an invalid move; push its Q-value down
            for choice in range(9):
                if state_batch[i, choice] != 0:
                    qsa_batch[i, choice] = -2
            if sample_batch[i][3] is None:
                # terminal state: the target is just the observed reward
                qsa_batch[i, action_batch[i]] = sample_batch[i][2]
            else:
                # non-terminal: reward plus discounted best Q-value of the next state
                qsa_batch[i, action_batch[i]] = sample_batch[i][2] + gamma * np.amax(
                    model.predict_one(next_state_batch[i].reshape((1, 9)), sess))
            
        # fit the network to the updated Q-value targets
        model.train_batch(state_batch, qsa_batch, sess)

        # exponentially decay the exploration rate as more games are played
        epsilon = 0.9 * np.exp(-0.001 * game)
    model.save(sess)
    model.plot_losses()
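
Both examples rely on a replay buffer exposing add_sample and sample_samples, which is not shown in the snippets. A minimal sketch of such a Memory class, assuming a capped list with uniform random sampling (the capacity and the sampling strategy are guesses, not the original implementation):

import random

class Memory:
    def __init__(self, max_memory=50000):
        self._max_memory = max_memory
        self._samples = []

    def add_sample(self, sample):
        # sample is a (state, action, reward, next_state) transition;
        # next_state is None for terminal moves
        self._samples.append(sample)
        if len(self._samples) > self._max_memory:
            self._samples.pop(0)

    def sample_samples(self, batch_size):
        # return at most batch_size transitions chosen uniformly at random,
        # which is why the training code re-reads the actual batch size
        n = min(batch_size, len(self._samples))
        return random.sample(self._samples, n)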
Example #2
        sample_batch = memory.sample_samples(batch_size)
        actual_batch_size = len(sample_batch)
        state_batch = np.zeros((actual_batch_size, 9))
        next_state_batch = np.zeros((actual_batch_size, 9))
        action_batch = [sample[1] for sample in sample_batch]

        for i, sample in enumerate(sample_batch):
            state_batch[i] = sample[0]
            if sample[3] is not None:
                next_state_batch[i] = sample[3]

        qsa_batch = model.predict_batch(state_batch, sess)

        for i in range(actual_batch_size):
            for choice in range(9):
                if state_batch[i, choice] != 0:
                    qsa_batch[i, choice] = invalid_move_reward
            if sample_batch[i][3] is None:
                qsa_batch[i, action_batch[i]] = sample_batch[i][2]
            else:
                qsa_batch[i, action_batch[i]] = sample_batch[i][2] + gamma * np.amax(
                    model.predict_one(next_state_batch[i].reshape((1, 9)), sess))

        model.train_batch(state_batch, qsa_batch, sess)

        epsilon = 0.9 * np.exp(-0.001 * game)
    model.save(sess, 'tic_tac_toe_model')
    model.plot_losses()
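
The model object in both examples is likewise assumed to be a TensorFlow 1.x wrapper exposing predict_batch, predict_one, train_batch, save, and plot_losses. The sketch below only mirrors that interface; the network architecture, optimizer, and loss tracking are illustrative choices, not the original code:

import numpy as np
import tensorflow as tf

class Model:
    def __init__(self, num_states=9, num_actions=9, learning_rate=0.001):
        # board encoding in, one Q-value per square out
        self._states = tf.placeholder(tf.float32, [None, num_states])
        self._qsa_targets = tf.placeholder(tf.float32, [None, num_actions])
        hidden = tf.layers.dense(self._states, 50, activation=tf.nn.relu)
        self._logits = tf.layers.dense(hidden, num_actions)
        self._loss = tf.losses.mean_squared_error(self._qsa_targets, self._logits)
        self._optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self._loss)
        self._saver = tf.train.Saver()
        self._losses = []

    def predict_one(self, state, sess):
        # state is expected with shape (1, num_states)
        return sess.run(self._logits, feed_dict={self._states: state})

    def predict_batch(self, states, sess):
        return sess.run(self._logits, feed_dict={self._states: states})

    def train_batch(self, states, qsa_targets, sess):
        _, loss = sess.run([self._optimizer, self._loss],
                           feed_dict={self._states: states,
                                      self._qsa_targets: qsa_targets})
        self._losses.append(loss)

    def save(self, sess, name='model'):
        self._saver.save(sess, './' + name)

    def plot_losses(self):
        import matplotlib.pyplot as plt
        plt.plot(self._losses)
        plt.show()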