def validation_core(i, x, y, model, feature_count):
            """Fit and score ``model`` on fold ``i`` using mutual-information features.

            The fold's training part is ranked with
            ``FeatureSelection.byMutualInformation``; the first ``feature_count``
            words of that ranking are used to featurize both the training and the
            validation part. ``model`` is fitted in place on the training features.

            NOTE(review): ``self`` is not a parameter of this function; it is
            resolved from the enclosing scope, which is not visible here.

            Returns whatever ``self.__countCorrect`` yields for the fold's
            predictions against the fold's validation labels.
            """
            train_x, train_y, held_out_x, held_out_y = self.__splitDataFold(x, y, i)

            # Rank words on the training part of the fold only.
            ranking = FeatureSelection.byMutualInformation(train_x, train_y)
            selected = [term for term, _ in ranking[:feature_count]]

            train_features, held_out_features = FeatureSelection.Featurize(
                train_x, held_out_x, selected)

            model.fit(train_features, train_y)
            predictions = model.predict(held_out_features)
            return self.__countCorrect(predictions, held_out_y)
    def validateByFrequency(self, x, y, model, feature_count=10):
        """Cross-validate ``model`` using the most frequent words as features.

        For each of the ``self.k`` folds, the fold's training part is ranked
        with ``FeatureSelection.byFrequency``, the first ``feature_count``
        words of that ranking are printed and used to featurize the fold, and
        ``model`` is re-fitted in place and scored on the fold's validation
        part.

        Args:
            x: Raw samples; passed to ``self.__splitDataFold`` and measured
                with ``len``.
            y: Labels matching ``x``.
            model: Object exposing ``fit(features, labels)`` and
                ``predict(features)``. The same instance is re-fitted on
                every fold.
            feature_count: Number of top-ranked words to keep per fold.
                Defaults to 10, the value that was previously hard-coded.

        Returns:
            The sum of the per-fold correct counts divided by ``len(x)``.
        """
        totalCorrect = 0

        for i in range(self.k):
            (foldTrainX, foldTrainY, foldValidationX, foldValidationY) = self.__splitDataFold(x, y, i)

            # Select features from the training part only, so the validation
            # part never influences which words are chosen.
            frequencyTable = FeatureSelection.byFrequency(foldTrainX)
            words = [word for word, _ in frequencyTable[:feature_count]]
            print('For fold %d/%d, choose words:' % (i + 1, self.k))
            print(words)
            (xNewTrain, xNewValidation) = FeatureSelection.Featurize(foldTrainX, foldValidationX, words)

            model.fit(xNewTrain, foldTrainY)
            totalCorrect += self.__countCorrect(model.predict(xNewValidation), foldValidationY)

        # Accuracy is taken over all samples in x, not per fold.
        return totalCorrect / len(x)
#############################

print('### Get the Mutual Information Table')

# Rank the raw training samples against their labels. The resulting table is
# sliced and unpacked as (word, value) pairs further down.
mutualInformationTable = FeatureSelection.byMutualInformation(
    xTrainRaw, yTrain)
print('Top 10')
# Iterate a slice rather than indexing 0..9, so a table with fewer than ten
# entries prints what it has instead of raising IndexError.
for entry in mutualInformationTable[:10]:
    print(entry)

#############################

print('### Run Gradient Descent with the Top 10 Words by Frequency')
# frequencyTable and model are defined before this section (not visible here).
words = [word for word, _ in frequencyTable[:10]]
print(words)
(xNewTrain, xNewTest) = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)

model.fit(xNewTrain, yTrain, iterations=50000, step=0.01)
yTestPredicted = model.predict(xNewTest)
testAccuracy = EvaluationsStub.Accuracy(yTest, yTestPredicted)
print("Test Set Accuracy is %f" % (testAccuracy))

#############################

print('### Run Gradient Descent with the Top 10 Words by Mutual Information')
# Same pipeline as above, but the words come from the mutual-information table.
# The same model object is re-fitted with the same iteration/step settings.
words = [word for word, _ in mutualInformationTable[:10]]
print(words)
(xNewTrain, xNewTest) = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)

model.fit(xNewTrain, yTrain, iterations=50000, step=0.01)
yTestPredicted = model.predict(xNewTest)
    print("========== Preprocess the Data ==========")
    (xTrainRawNormalize,
     xTestRawNormalize) = FeatureSelection.preprocess(xTrainRaw, xTestRaw)
    print('========== Merge Features ==========')
    print('Use 5 Hand Craft Words as Features')
    (xTrainHand, xTestHand,
     featuresName) = FeatureSelection.hand_craft_features(
         xTrainRaw, xTestRaw, 2)

    print('Use 70 Mutual Information Words as Features')
    model = RandomForest.RandomForest(num_trees=num_trees,
                                      min_to_split=min_to_split,
                                      use_bagging=use_bagging,
                                      restrict_features=restrict_features)
    mutualInformationTable = FeatureSelection.byMutualInformation(
        xTrainRawNormalize, yTrain)
    words = [word for word, _ in mutualInformationTable[:70]]
    (xTrainMI, xTestMI) = FeatureSelection.Featurize(xTrainRawNormalize,
                                                     xTestRawNormalize, words)

    xTrain = np.hstack([xTrainHand, xTrainMI])
    xTest = np.hstack([xTestHand, xTestMI])

    model.fit(xTrain, yTrain)
    yPredicted = model.predict(xTest)
    testAccuracy = EvaluationsStub.Accuracy(yTest, yPredicted)
    (lower, upper) = EvaluationsStub.Bound(testAccuracy, len(yPredicted))
    print("Test Set Accuracy is %f, with lower bound %f and upper bound %f" %
          (testAccuracy, lower, upper))

    print('========== Debug on raw data =========')
# Example #5
# 0
    # (xTrainRaw, yTrainRaw, xTestRaw, yTestRaw) = Assignment1Support.TrainTestSplit(xRaw, yRaw)
    (xTrainRawOriginal, yTrainRawOriginal, xTestRawOriginal, yTestRawOriginal) = Assignment1Support.TrainTestSplit(xRaw, yRaw)
    (xTrainRaw, yTrainRaw) = AddNoise.MakeProblemHarder(xTrainRawOriginal, yTrainRawOriginal)
    (xTestRaw, yTestRaw) = AddNoise.MakeProblemHarder(xTestRawOriginal, yTestRawOriginal)

    (xTrain, xTest) = Assignment1Support.Featurize(xTrainRaw, xTestRaw)
    yTrain = yTrainRaw
    yTest = yTestRaw

    ### Get the Mutual Information Words as features
    import FeatureSelection

    print('### Get the Mutual Information features')
    mutualInformationTable = FeatureSelection.byMutualInformation(xTrainRaw, yTrain)
    words = [word for word,_ in mutualInformationTable[:295]]
    (xNewTrain, xNewTest) = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)

    print('### Merge the features')
    xTrain = np.hstack([xTrain, xNewTrain])
    xTest = np.hstack([xTest, xNewTest])

    import RandomForest
    ############################

    print("========== Building one Model and output the accuracy ==========")

    model = RandomForest.RandomForest(num_trees = 10, min_to_split = 2, use_bagging = True, restrict_features = 20)
    print("### Training with Random Forest")
    model.fit(xTrain, yTrain)

    print("### Predicting with Random Forest")