def validation_core(i, x, y, model, feature_count):
    """Score a single cross-validation fold with mutual-information features.

    The data is split into the training and validation parts of fold ``i``.
    ``FeatureSelection.byMutualInformation`` is run on the training part
    only, and the words of its first ``feature_count`` entries are used to
    featurize both parts. ``model`` is then fitted on the training features.

    NOTE: ``self`` is not a parameter of this function; it is resolved from
    the surrounding scope, which therefore has to provide it.

    Args:
        i: Index of the fold to evaluate.
        x: Full set of raw samples.
        y: Labels matching ``x``.
        model: Object exposing ``fit(features, labels)`` and
            ``predict(features)``.
        feature_count: How many leading table entries to keep as features.

    Returns:
        Whatever ``self.__countCorrect`` reports for the validation
        predictions against the validation labels.
    """
    trainX, trainY, heldOutX, heldOutY = self.__splitDataFold(x, y, i)

    # Feature ranking sees the training part only, never the held-out part.
    ranking = FeatureSelection.byMutualInformation(trainX, trainY)
    selected = [term for term, _ in ranking[:feature_count]]

    trainFeatures, heldOutFeatures = FeatureSelection.Featurize(
        trainX, heldOutX, selected)

    model.fit(trainFeatures, trainY)
    predictions = model.predict(heldOutFeatures)
    return self.__countCorrect(predictions, heldOutY)
def validateByFrequency(self, x, y, model, feature_count=10):
    """Cross-validate ``model`` using the most frequent words as features.

    For each of the ``self.k`` folds, the frequency table is built from that
    fold's training part only. The words of its first ``feature_count``
    entries are printed and used to featurize both parts of the fold, then
    ``model`` is fitted and its validation predictions are counted.

    Args:
        x: Full set of raw samples.
        y: Labels matching ``x``.
        model: Object exposing ``fit(features, labels)`` and
            ``predict(features)``; it is re-fitted on every fold.
        feature_count: How many leading frequency-table entries to use as
            features. Defaults to 10, the value that used to be hard-coded,
            so existing callers behave exactly as before.

    Returns:
        The summed per-fold correct counts divided by ``len(x)``.
    """
    totalCorrect = 0
    for i in range(self.k):
        (foldTrainX, foldTrainY,
         foldValidationX, foldValidationY) = self.__splitDataFold(x, y, i)

        # Build the table from the training part only, so the validation
        # samples do not influence which words are chosen.
        frequencyTable = FeatureSelection.byFrequency(foldTrainX)
        words = [word for word, _ in frequencyTable[:feature_count]]

        print('For fold %d/%d, choose words:' % (i + 1, self.k))
        print(words)

        (xNewTrain, xNewValidation) = FeatureSelection.Featurize(
            foldTrainX, foldValidationX, words)
        model.fit(xNewTrain, foldTrainY)
        totalCorrect += self.__countCorrect(
            model.predict(xNewValidation), foldValidationY)

    accuracy = totalCorrect / len(x)
    return accuracy
#############################
# Build the mutual-information table from the training data and show its
# leading entries.
print('### Get the Mutual Information Table')
mutualInformationTable = FeatureSelection.byMutualInformation(
    xTrainRaw, yTrain)

print('Top 10')
# Iterate over a slice rather than indexing range(10): a table with fewer
# than ten entries then prints what it has instead of raising IndexError.
for entry in mutualInformationTable[:10]:
    print(entry)

#############################
# Train and evaluate on the test set using the first ten words of the
# frequency table (frequencyTable is defined earlier in the script).
print('### Run Gradient Descent with the Top 10 Words by Frequency')
words = [word for word, _ in frequencyTable[:10]]
print(words)

(xNewTrain, xNewTest) = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)
model.fit(xNewTrain, yTrain, iterations=50000, step=0.01)
yTestPredicted = model.predict(xNewTest)

testAccuracy = EvaluationsStub.Accuracy(yTest, yTestPredicted)
print("Test Set Accuracy is %f" % (testAccuracy))

#############################
# Same procedure, but with the first ten words of the mutual-information
# table.
print('### Run Gradient Descent with the Top 10 Words by Mutual Information')
words = [word for word, _ in mutualInformationTable[:10]]
print(words)

(xNewTrain, xNewTest) = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)
model.fit(xNewTrain, yTrain, iterations=50000, step=0.01)
yTestPredicted = model.predict(xNewTest)
# --- Preprocessing: produce the normalized copies of the raw text. ---
print("========== Preprocess the Data ==========")
xTrainRawNormalize, xTestRawNormalize = FeatureSelection.preprocess(
    xTrainRaw, xTestRaw)

# --- Feature group 1: hand-crafted features from the un-normalized text. ---
print('========== Merge Features ==========')
print('Use 5 Hand Craft Words as Features')
# NOTE(review): the message above says 5 while the call passes 2 — confirm
# what the third argument of hand_craft_features controls.
xTrainHand, xTestHand, featuresName = FeatureSelection.hand_craft_features(
    xTrainRaw, xTestRaw, 2)

# --- Feature group 2: first 70 mutual-information words (normalized text). ---
print('Use 70 Mutual Information Words as Features')
model = RandomForest.RandomForest(
    num_trees=num_trees,
    min_to_split=min_to_split,
    use_bagging=use_bagging,
    restrict_features=restrict_features,
)
mutualInformationTable = FeatureSelection.byMutualInformation(
    xTrainRawNormalize, yTrain)
words = [term for term, _ in mutualInformationTable[:70]]
xTrainMI, xTestMI = FeatureSelection.Featurize(
    xTrainRawNormalize, xTestRawNormalize, words)

# --- Stack both feature groups, hand-crafted columns first. ---
xTrain = np.hstack((xTrainHand, xTrainMI))
xTest = np.hstack((xTestHand, xTestMI))

# --- Fit on the merged features and report accuracy with its bounds. ---
model.fit(xTrain, yTrain)
yPredicted = model.predict(xTest)

testAccuracy = EvaluationsStub.Accuracy(yTest, yPredicted)
lower, upper = EvaluationsStub.Bound(testAccuracy, len(yPredicted))
print("Test Set Accuracy is %f, with lower bound %f and upper bound %f"
      % (testAccuracy, lower, upper))

print('========== Debug on raw data =========')
# Direct split that skips the AddNoise step (kept for reference):
# (xTrainRaw, yTrainRaw, xTestRaw, yTestRaw) = Assignment1Support.TrainTestSplit(xRaw, yRaw)

# Split first, then pass the train and test parts through
# AddNoise.MakeProblemHarder separately.
(xTrainRawOriginal, yTrainRawOriginal,
 xTestRawOriginal, yTestRawOriginal) = Assignment1Support.TrainTestSplit(xRaw, yRaw)
xTrainRaw, yTrainRaw = AddNoise.MakeProblemHarder(
    xTrainRawOriginal, yTrainRawOriginal)
xTestRaw, yTestRaw = AddNoise.MakeProblemHarder(
    xTestRawOriginal, yTestRawOriginal)

# The labels are used as returned; only the samples get featurized.
yTrain, yTest = yTrainRaw, yTestRaw
xTrain, xTest = Assignment1Support.Featurize(xTrainRaw, xTestRaw)

### Get the Mutual Information Words as features
import FeatureSelection

print('### Get the Mutual Information features')
mutualInformationTable = FeatureSelection.byMutualInformation(xTrainRaw, yTrain)
# Keep the words of the first 295 table entries.
words = [term for term, _ in mutualInformationTable[:295]]
xNewTrain, xNewTest = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)

# Append the mutual-information columns after the existing feature columns.
print('### Merge the features')
xTrain = np.hstack((xTrain, xNewTrain))
xTest = np.hstack((xTest, xNewTest))

import RandomForest

############################
print("========== Building one Model and output the accuracy ==========")
model = RandomForest.RandomForest(
    num_trees=10,
    min_to_split=2,
    use_bagging=True,
    restrict_features=20,
)

print("### Training with Random Forest")
model.fit(xTrain, yTrain)

print("### Predicting with Random Forest")