def runWithIGR(featureSize, modelCount):
    X_raw, y = common.loadTrainingDataSet()

    reducer = InformationGainReducer()
    reducer.fit(X_raw, y)
    reducer.resize(featureSize)
    X = reducer.transform(X_raw).toarray()

    modelList = []

    for modelNum in range(modelCount):
        rs = 42 + modelNum
        rus = RandomUnderSampler(random_state=rs)
        X_model, y_model = rus.fit_resample(X, y)

        nbClassifier = NaiveBayesClassifier()
        nbClassifier.fit(X_model, y_model)

        modelList.append(nbClassifier)

    X_test_raw = common.loadTestDataSet()
    X_test = reducer.transform(X_test_raw).toarray()
    combinedModelOutput = common.predictCombinedSimple(X_test, modelList)

    common.writeResultsFile(combinedModelOutput)
    print("Done predicting with multi-model and IGR.")
def runWithBalancingAndIGR(featureSize, alphaValue):
    X_model_full_imbalanced, y_model_imbalanced = common.loadTrainingDataSet()

    balancer = FeatureIndependentOversampler(random_state=42)
    X_model_full_raw, y_model_raw = balancer.fit_transform(
        X_model_full_imbalanced, y_model_imbalanced)

    X_model_full, y_model = shuffle(X_model_full_raw,
                                    y_model_raw,
                                    random_state=42)

    reducer = InformationGainReducer()
    reducer.fit(X_model_full, y_model)

    reducer.resize(featureSize)
    # Use toarray() rather than todense(): todense() returns np.matrix,
    # which newer scikit-learn versions reject.
    X_model = reducer.transform(X_model_full).toarray()

    hiddenLayerSizes = (int(math.sqrt(featureSize)) + 1, )
    mc = MLPClassifier(solver='lbfgs',
                       alpha=alphaValue,
                       hidden_layer_sizes=hiddenLayerSizes)
    mc.fit(X_model, y_model)

    X_test_full = common.loadTestDataSet()
    X_test = reducer.transform(X_test_full).toarray()

    output = mc.predict(X_test)
    common.writeResultsFile(output)

    print("Done estimating with neural network for feature size = " +
          str(featureSize) + " and alpha = " + str(alphaValue))
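
# InformationGainReducer is a project-specific class that is not defined in
# this section. A rough sketch of the fit/resize/transform interface used
# above, assuming it ranks features by information gain (approximated here
# with scikit-learn's mutual information) and keeps the top k after resize():
import numpy as np
from sklearn.feature_selection import mutual_info_classif

class InformationGainReducerSketch(object):
    def fit(self, X, y):
        # Score every feature once so resize() can reuse the ranking.
        scores = mutual_info_classif(X, y)
        self.ranking_ = np.argsort(scores)[::-1]
        return self

    def resize(self, k):
        # Keep the indices of the k best-scoring features.
        self.support_ = self.ranking_[:k]

    def transform(self, X):
        # Column selection; sparse input stays sparse, matching the
        # .toarray() calls at the call sites above.
        return X[:, self.support_]
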
# Example #3
def testCustomUnderfitting():
    #####################
    # Part 1. Balance the dataset
    #####################
    X, y = common.loadTrainingDataSet()
    #xBalanced, yBalanced = data_balancing.balanceDatasetWithRandomOversampling(xRawData, yRawData)

    #tuneFeatureCountWithChiSquared(xBalanced, yBalanced)
    #tuneFeatureCountWithTruncatedSVD(X, y)

    datasets = testMultiModel(X, y, 9)

    for modelNum in range(9):
        dataset = datasets[modelNum]
        X_sub = dataset[0]
        y_sub = dataset[1]

        reducer = feature_reduction.getChiSquared(X_sub, y_sub, 300)

        X_sub_new = feature_reduction.transform(reducer, X_sub)
        y_sub_new = np.array(y_sub, dtype=np.int64)

        modelScore = getAvgF1Score(X_sub_new, y_sub_new)
        print("Model score for model " + str(modelNum) + " = " +
              str(modelScore))
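
# getAvgF1Score is another project helper that is not shown. A plausible
# minimal version, assuming it cross-validates a Bernoulli Naive Bayes model
# and averages the per-fold F1 scores:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB

def getAvgF1ScoreSketch(X, y, folds=5):
    clf = BernoulliNB()
    scores = cross_val_score(estimator=clf, X=X, y=y, scoring='f1', cv=folds)
    return scores.mean()
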
def tuneMultimodelKnnIgr(featureSizes, kValues):
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for featureSize in featureSizes:
        for kValue in kValues:
            scoreMap[(featureSize, kValue)] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0

    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        reducer = InformationGainReducer()
        reducer.fit(X_train, y_train)

        for featureSize in featureSizes:
            reducer.resize(featureSize)
            X_train_reduced = reducer.transform(X_train).toarray()
            X_test_reduced = reducer.transform(X_test).toarray()

            for kValue in kValues:
                modelList = []

                for modelNum in range(11):
                    rus_rs = 555 + (modelNum * featureSize)
                    rus = RandomUnderSampler(random_state=rus_rs)
                    X_model, y_model = rus.fit_resample(
                        X_train_reduced, y_train)

                    clf = KNeighborsClassifier(n_neighbors=kValue,
                                               metric='manhattan')
                    clf.fit(X_model, y_model)

                    modelList.append(clf)
                    print(".", end="")

                output = common.predictCombinedSimple(X_test_reduced,
                                                      modelList)
                combinedModelScore = f1_score(y_test, output)
                scoreMap[(featureSize, kValue)].append(combinedModelScore)

                print()
                print("Done with kValue = " + str(kValue) + " for fold #" +
                      str(foldNumber) + " for feature size = " +
                      str(featureSize) + ". F1 = " + str(combinedModelScore))

            print("Done with fold #" + str(foldNumber) +
                  " for feature size = " + str(featureSize))

        foldNumber += 1

    for featureSize in featureSizes:
        for kValue in kValues:
            meanF1Score = mean(scoreMap[(featureSize, kValue)])
            print("F1 Score for KNN with IGR, K = " + str(kValue) +
                  " and FR size = " + str(featureSize) + " is: " +
                  str(meanF1Score))
def runWithUndersamplingMutualInfo():
    X, y = common.loadTrainingDataSet()

    print("Counter(y) = " + str(Counter(y)))

    rus = RandomUnderSampler(random_state=42)

    X_res, y_res = rus.fit_resample(X, y)

    print("Counter(y_res) = " + str(Counter(y_res)))

    reducer = SelectKBest(mutual_info_classif, k=300)
    X_new = reducer.fit_transform(X_res, y_res).toarray()

    print("Done with feature selection")

    #reducer = feature_reduction.getChiSquared(X_res, y_res, 1331)
    #featureReducer = SelectKBest(chi2, k=j)
    #featureReducer.fit(X, y)

    #X_new = feature_reduction.transform(reducer, X_res)

    nbClf = NaiveBayesClassifier()
    nbClf.fit(X_new, y_res)

    X_test = common.loadTestDataSet()
    X_test_new = reducer.transform(X_test).toarray()
    testPredictions = nbClf.predict(X_test_new)

    print("Test predictions shape = " + str(testPredictions.shape))
    print("Test Estimates = " + str(testPredictions))
    common.writeResultsFile(testPredictions)
    print("Done!")
def runWithOversampling():
    #####################
    # Part 1. Balance the dataset
    #####################
    xRawData, yRawData = common.loadTrainingDataSet()

    xBalanced, yBalanced = data_balancing.balanceDatasetWithRandomOversampling(
        xRawData, yRawData)

    #####################
    # Part 2. Feature Reduction
    #####################
    featureReducer = SelectKBest(chi2, k=10000)
    featureReducer.fit(xBalanced, yBalanced)

    xReduced = featureReducer.transform(xBalanced).toarray()

    nbClassifier = NaiveBayesClassifier()
    nbClassifier.fit(xReduced, yBalanced)

    rawTestData = common.loadTestDataSet()
    reducedTestData = featureReducer.transform(rawTestData).toarray()

    resultsArray = nbClassifier.predict(reducedTestData)

    common.writeResultsFile(resultsArray)
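
# data_balancing.balanceDatasetWithRandomOversampling is not shown in this
# section. A minimal sketch, assuming it wraps imbalanced-learn's
# RandomOverSampler with a fixed seed for reproducibility:
from imblearn.over_sampling import RandomOverSampler

def balanceDatasetWithRandomOversamplingSketch(X, y):
    ros = RandomOverSampler(random_state=42)
    return ros.fit_resample(X, y)
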
# Example #7
def tuneNaiveBayesIgrFeatureSize(featureSizeList, modelCountList):
    X_raw, y = common.loadTrainingDataSet()

    reducer = InformationGainReducer()
    reducer.fit(X_raw, y)

    for featureSize in featureSizeList:
        reducer.resize(featureSize)
        X = reducer.transform(X_raw).toarray()

        #print("Counter(y) = " + str(Counter(y)))
        for modelCount in modelCountList:
            kf = KFold(n_splits=5, random_state=42, shuffle=True)
            splitIndex = 0
            f1ScoreList = []

            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                modelList = []

                for modelNum in range(modelCount):
                    rs = 42 + modelNum
                    rus = RandomUnderSampler(random_state=rs)
                    X_model, y_model = rus.fit_resample(X_train, y_train)

                    nbClassifier = NaiveBayesClassifier()
                    nbClassifier.fit(X_model, y_model)

                    #X_test_2 = reducer.transform(X_test).toarray()
                    #output = nbClassifier.predict(X_test_2)
                    #modelScore = f1_score(y_test, output)

                    #print("Split Index = " + str(splitIndex) + ", Model Num = " + str(modelNum) + ", F1 = " + str(modelScore))

                    modelList.append(nbClassifier)
                    #print(".", end='')
                #print()

                combinedModelOutput = common.predictCombinedSimple(
                    X_test, modelList)
                combinedModelScore = f1_score(y_test, combinedModelOutput)
                f1ScoreList.append(combinedModelScore)
                #print("Combined Model Score for split #" + str(splitIndex) + " = " + str(combinedModelScore))

                splitIndex += 1

            print("F1 Score for FR size = " + str(featureSize) +
                  " and model count = " + str(modelCount) + " is: " +
                  str(mean(f1ScoreList)))
# Example #8
def tuneMultimodelIGR(featureSizes):
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for featureSize in featureSizes:
        scoreMap[featureSize] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0

    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        reducer = InformationGainReducer()
        reducer.fit(X_train, y_train)

        for featureSize in featureSizes:
            reducer.resize(featureSize)
            X_train_reduced = reducer.transform(X_train).toarray()

            modelList = []

            for modelNum in range(11):
                rus_rs = 555 + modelNum
                rus = RandomUnderSampler(random_state=rus_rs)
                X_model, y_model = rus.fit_resample(X_train_reduced, y_train)

                nbClassifier = NaiveBayesClassifier()
                nbClassifier.fit(X_model, y_model)

                modelList.append(nbClassifier)
                print(".", end="")

            X_test_reduced = reducer.transform(X_test).toarray()
            output = common.predictCombinedSimple(X_test_reduced, modelList)
            combinedModelScore = f1_score(y_test, output)
            scoreMap[featureSize].append(combinedModelScore)

            print()
            print("Done with fold #" + str(foldNumber) +
                  " for feature size = " + str(featureSize) + ". F1 = " +
                  str(combinedModelScore))

        foldNumber += 1

    for featureSize in featureSizes:
        meanF1Score = mean(scoreMap[featureSize])
        print("F1 Score for NN with Chi2 and FR size = " + str(featureSize) +
              " is: " + str(meanF1Score))
# Example #9
def tuneMultimodelSvm(featureSizes):
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for featureSize in featureSizes:
        scoreMap[featureSize] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0

    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        for featureSize in featureSizes:
            reducer = TruncatedSVD(n_components=featureSize)
            X_train_reduced = reducer.fit_transform(X_train)

            modelList = []

            for modelNum in range(11):
                rus_rs = 555 + (modelNum * featureSize)
                rus = RandomUnderSampler(random_state=rus_rs)
                X_model, y_model = rus.fit_resample(X_train_reduced, y_train)

                clf = SVC(gamma='scale')
                clf.fit(X_model, y_model)

                modelList.append(clf)
                print(".", end="")

            X_test_reduced = reducer.transform(X_test)
            output = common.predictCombinedSimple(X_test_reduced, modelList)
            combinedModelScore = f1_score(y_test, output)
            scoreMap[featureSize].append(combinedModelScore)

            print()
            print("Done with fold #" + str(foldNumber) +
                  " for feature size = " + str(featureSize) + ". F1 = " +
                  str(combinedModelScore))

        foldNumber += 1

    for featureSize in featureSizes:
        meanF1Score = mean(scoreMap[featureSize])
        print("F1 Score for SVM with Truncated SVD and FR size = " +
              str(featureSize) + " is: " + str(meanF1Score))
def tuneBasicDecisionTree():
    # Some setup
    X_raw, y_raw = common.loadTrainingDataSet()

    #xBalanced, yBalanced = data_balancing.balanceDatasetWithRandomOversampling(xRawData, yRawData)
    #myCounter = Counter(yBalanced)
    #print("Finished loading and sampling. Data dist = " + str(myCounter))

    decisionTreeClassifier = DecisionTreeClassifier()
    cvFolds = 5  #constants.crossValidationFoldCount
    cvScores = cross_val_score(estimator=decisionTreeClassifier,
                               X=X_raw,
                               y=y_raw,
                               scoring='f1',
                               cv=cvFolds)
    print("Individual CV scores = " + str(cvScores))
    avg = sum(cvScores) / cvFolds
    print("Cross validation score for decision tree = " + str(avg))
# Example #11
def tuneNaiveBayesFeatureReduction():
    X, y = common.loadTrainingDataSet()
    rus = RandomUnderSampler(random_state=42)

    X_res, y_res = rus.fit_resample(X, y)

    print("Counter(y_res) = " + str(Counter(y_res)))

    for j in common.getFeatureCountArray():
        reducer = feature_reduction.getChiSquared(X_res, y_res, j)
        #featureReducer = SelectKBest(chi2, k=j)
        #featureReducer.fit(X, y)

        X_new = feature_reduction.transform(reducer, X_res)

        f1 = getAvgF1Score(X_new, y_res)
        print("J = " + str(j) + ",     F1 =     " + str(f1))
# Example #12
def tuneNaiveBayesMultiModel(featureSize, modelCount):
    X, y = common.loadTrainingDataSet()

    #print("Counter(y) = " + str(Counter(y)))

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    splitIndex = 0
    f1ScoreList = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        modelTransformerList = []

        for modelNum in range(modelCount):
            rs = 42 + modelNum
            rus = RandomUnderSampler(random_state=rs)
            X_model_full, y_model = rus.fit_resample(X_train, y_train)

            reducer = SelectKBest(chi2, k=featureSize)
            X_model = reducer.fit_transform(X_model_full, y_model).toarray()

            nbClassifier = NaiveBayesClassifier()
            nbClassifier.fit(X_model, y_model)

            #X_test_2 = reducer.transform(X_test).toarray()
            #output = nbClassifier.predict(X_test_2)
            #modelScore = f1_score(y_test, output)

            #print("Split Index = " + str(splitIndex) + ", Model Num = " + str(modelNum) + ", F1 = " + str(modelScore))

            modelTransformerList.append((nbClassifier, reducer))

        combinedModelOutput = common.predictCombined(X_test,
                                                     modelTransformerList)
        combinedModelScore = f1_score(y_test, combinedModelOutput)
        f1ScoreList.append(combinedModelScore)
        #print("Combined Model Score = " + str(combinedModelScore))

        splitIndex += 1

    print("F1 Score for FR size = " + str(featureSize) + " is: " +
          str(mean(f1ScoreList)))
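
# common.predictCombined differs from predictCombinedSimple in that each
# classifier carries its own fitted reducer. A minimal sketch under that
# assumption (majority voting after per-model transforms, with sparse test
# data as elsewhere in this module):
import numpy as np

def predictCombinedSketch(X_test, modelTransformerList):
    predictions = []
    for clf, reducer in modelTransformerList:
        # Each model sees the test set through the reducer it was trained on.
        X_test_reduced = reducer.transform(X_test).toarray()
        predictions.append(clf.predict(X_test_reduced))
    votes = np.array(predictions).sum(axis=0)
    return (votes > (len(modelTransformerList) / 2)).astype(np.int64)
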
# Example #13
def tuneRandomForestDepth(depths):
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for depth in depths:
        scoreMap[depth] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0

    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        for depth in depths:
            reducer = SelectKBest(chi2, k=127)
            reducer.fit(X_train, y_train)
            X_train_reduced = reducer.transform(X_train).toarray()

            ss_rs = 42 + (depth * foldNumber)
            smoteSampler = SMOTE(random_state=ss_rs)

            X_model, y_model = smoteSampler.fit_resample(
                X_train_reduced, y_train)

            clf = RandomForestClassifier(max_depth=depth)
            clf.fit(X_model, y_model)

            X_test_reduced = reducer.transform(X_test).toarray()
            output = clf.predict(X_test_reduced)
            combinedModelScore = f1_score(y_test, output)
            scoreMap[depth].append(combinedModelScore)

            print()
            print("Done with RF prediction for fold #" + str(foldNumber) +
                  " for depth = " + str(depth) + ". F1 = " +
                  str(combinedModelScore))

        foldNumber += 1

    for depth in depths:
        meanF1Score = mean(scoreMap[depth])
        print("F1 Score for RF with Chi2 and depth = " + str(depth) + " is: " +
              str(meanF1Score))
def tuneDecisionTreeSmote(featureSizes):
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for featureSize in featureSizes:
        scoreMap[featureSize] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0

    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        for featureSize in featureSizes:
            reducer = SelectKBest(chi2, k=featureSize)
            reducer.fit(X_train, y_train)
            X_train_reduced = reducer.transform(X_train).toarray()

            ss_rs = 42 + (featureSize * foldNumber)
            smoteSampler = SMOTE(random_state=ss_rs)

            X_model, y_model = smoteSampler.fit_resample(
                X_train_reduced, y_train)

            dtClassifier = DecisionTreeClassifier(max_depth=10)
            dtClassifier.fit(X_model, y_model)

            X_test_reduced = reducer.transform(X_test).toarray()
            output = dtClassifier.predict(X_test_reduced)
            combinedModelScore = f1_score(y_test, output)
            scoreMap[featureSize].append(combinedModelScore)

            print()
            print("Done with DT prediction for fold #" + str(foldNumber) +
                  " for feature size = " + str(featureSize) + ". F1 = " +
                  str(combinedModelScore))

        foldNumber += 1

    for featureSize in featureSizes:
        meanF1Score = mean(scoreMap[featureSize])
        print("F1 Score for DT with Chi2 and FR size = " + str(featureSize) +
              " is: " + str(meanF1Score))
def runWithMultiModel():
    modelTransformerList = []
    X, y = common.loadTrainingDataSet()

    for modelNum in range(9):
        rs = 42 + modelNum
        rus = RandomUnderSampler(random_state=rs)
        X_model_full, y_model = rus.fit_resample(X, y)

        reducer = SelectKBest(chi2, k=105)
        X_model = reducer.fit_transform(X_model_full, y_model).toarray()

        nbClassifier = NaiveBayesClassifier()
        nbClassifier.fit(X_model, y_model)

        modelTransformerList.append((nbClassifier, reducer))

    X_test = common.loadTestDataSet()
    combinedModelOutput = common.predictCombined(X_test, modelTransformerList)
    common.writeResultsFile(combinedModelOutput)
    print("Done predicting with multi-model.")
def tuneReducedDecisionTreeWithFeatureSizeAndDepth(featureSize, depth):
    X, y = common.loadTrainingDataSet()

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    f1ScoreList = []

    foldNumber = 1

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        rus = RandomUnderSampler(random_state=foldNumber)
        X_model_full, y_model = rus.fit_resample(X_train, y_train)

        reducer = SelectKBest(chi2, k=featureSize)
        X_model1 = reducer.fit_transform(X_model_full, y_model)
        X_model = X_model1.tocsc()
        #reducer = TruncatedSVD(n_components=featureSize, n_iter=7, random_state=42)
        #X_model = reducer.fit_transform(X_train, y_train)

        dtClassifier = DecisionTreeClassifier(
            max_depth=depth,
            class_weight="balanced",
            #min_samples_split=0.01,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.01)
        dtClassifier.fit(X_model, y_model)

        X_model_test = reducer.transform(X_test).tocsr()
        y_pred = dtClassifier.predict(X_model_test)
        #report = classification_report(y_test, y_pred)
        currentF1 = f1_score(y_test, y_pred)
        f1ScoreList.append(currentF1)

        foldNumber += 1

    #print("f1 Score list = " + str(f1ScoreList))
    print("Mean for feature and depth of (" + str(featureSize) + ", " +
          str(depth) + ") = " + str(mean(f1ScoreList)))
def runBuiltInBernoulli():
    trainingDataMatrix, labelMatrix = common.loadTrainingDataSet()

    predictiveFeatures = feature_reduction.computePredictiveness(
        trainingDataMatrix, labelMatrix)

    #print("Performed feature selection. New shape is: " + str(trainingMatrix1.shape))

    bernoulliClf = BernoulliNB(alpha=constants.smoothingConstant,
                               binarize=None,
                               fit_prior=False)
    '''
    for j in range(5, 1001, 5):
        importantFeatures = [element[0] for element in predictiveFeatures[0:j]]
        #print("Important features = " + str(importantFeatures))
        importantFeaturesArray = np.array(importantFeatures)
        reducedDataSet = trainingDataMatrix[:, importantFeaturesArray]
    
        #print("Reduced data set shape = " + str(reducedDataSet.shape))
        cvScores = cross_val_score(estimator=bernoulliClf, X=reducedDataSet, y=labelMatrix, scoring='f1', cv=constants.crossValidationFoldCount)
    
        avg = sum(cvScores) / constants.crossValidationFoldCount
        print("My reducer. Feature Count = " + str(j) + "   Avg Score = " + str(avg))
    '''

    importantFeaturesArray = [
        element[0] for element in predictiveFeatures[0:205]
    ]
    reducedTraining = trainingDataMatrix[:, importantFeaturesArray]

    bernoulliClf.fit(reducedTraining, labelMatrix)

    testDataMatrix = common.loadTestDataSet()
    reducedTesting = testDataMatrix[:, importantFeaturesArray]
    testPredictions = bernoulliClf.predict(reducedTesting)

    print("Test predictions shape = " + str(testPredictions.shape))
    print("Test Estimates = " + str(testPredictions))
    common.writeResultsFile(testPredictions)
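
# feature_reduction.computePredictiveness is not defined in this section. Its
# usage implies a list of (featureIndex, score) pairs sorted best-first; a
# plausible sketch built on chi-squared scores:
from sklearn.feature_selection import chi2

def computePredictivenessSketch(X, y):
    scores, _ = chi2(X, y)
    # Pair each feature index with its score and sort descending by score.
    return sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
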
def tuneReducedDecisionTree():
    X, y = common.loadTrainingDataSet()

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    splitIndex = 0
    f1ScoreList = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        totalF1 = 0.0
        numModels = 9
        for modelNum in range(numModels):
            rs = 42 + modelNum
            rus = RandomUnderSampler(random_state=rs)
            X_model_full, y_model = rus.fit_resample(X_train, y_train)

            truncatedSvd = TruncatedSVD(n_components=331,
                                        n_iter=7,
                                        random_state=42)
            X_model = truncatedSvd.fit_transform(X_model_full, y_model)

            dtClassifier = DecisionTreeClassifier(ccp_alpha=0.015)
            dtClassifier.fit(X_model, y_model)

            X_model_test = truncatedSvd.transform(X_test)
            y_pred = dtClassifier.predict(X_model_test)
            #report = classification_report(y_test, y_pred)
            currentF1 = f1_score(y_test, y_pred)
            print("Printing F1 for model #" + str(modelNum) + " = " +
                  str(currentF1))
            #print(str(report))
            totalF1 += currentF1

        avgF1 = totalF1 / numModels
        print("f1 = " + str(avgF1))
# Example #19
'''
Created on Mar 7, 2020

@author: William
'''

import cs584.project2.common as common
import cs584.project2.data_balancing as data_balancing

from collections import Counter
from sklearn import tree

if __name__ == '__main__':
    # Some setup
    xRawData, yRawData = common.loadTrainingDataSet()
    
    xBalanced, yBalanced = data_balancing.balanceDatasetWithRandomOversampling(xRawData, yRawData)
    myCounter = Counter(yBalanced)
    print("Finished loading and sampling. Data dist = " + str(myCounter))
    
    decisionTreeClassifier = tree.DecisionTreeClassifier()
    
    decisionTreeClassifier.fit(xBalanced, yBalanced)

def runBernoulliWithChiSquared():
    trainingDataMatrix, labelMatrix = common.loadTrainingDataSet()

    #predictiveFeatures = feature_reduction.computePredictiveness(trainingDataMatrix, labelMatrix)

    #print("Performed feature selection. New shape is: " + str(trainingMatrix1.shape))

    bernoulliClf = BernoulliNB(alpha=constants.smoothingConstant,
                               binarize=None,
                               fit_prior=False)
    '''
    maxAvg = 0
    maxK = -1
    
    for kVal in range(1025, 10000, 50):
        trainingMatrix1 = SelectKBest(chi2, k=kVal).fit_transform(trainingDataMatrix, labelMatrix)
        cvScores = cross_val_score(estimator=bernoulliClf, X=trainingMatrix1, y=labelMatrix, scoring='f1', cv=7)
        avg = sum(cvScores) / 7
        if avg > maxAvg:
            maxAvg = avg
            maxK = kVal
        
        print("k = " + str(kVal) + ", avg = " + str(avg))
    
    print("Best value is k = " + str(maxK) + ", " + str(maxAvg))
    '''
    featureReducer = SelectKBest(chi2, k=985)
    featureReducer.fit(trainingDataMatrix, labelMatrix)

    trainingMatrix1 = featureReducer.transform(trainingDataMatrix)

    cvScores = cross_val_score(estimator=bernoulliClf,
                               X=trainingMatrix1,
                               y=labelMatrix,
                               scoring='f1',
                               cv=7)
    avg = sum(cvScores) / 7
    print("k = 985, avg = " + str(avg))

    bernoulliClf.fit(trainingMatrix1, labelMatrix)
    '''
    estimateSet = trainingDataMatrix
    estimatePredictions = bernoulliClf.predict(estimateSet)
    print("estimates = " + str(estimatePredictions))
    
    results = np.zeros((2, 2), dtype=np.int)
    
    for i in range(len(trainDrugRecords)):
        actual = trainDrugRecords[i].label
        guess = int(estimatePredictions[i])
        #print("guess = " + str(guess) + ", actual = " + str(actual))
        results[guess, actual] += 1     
    
    print("results = " + str(results))
    '''

    testDataMatrix = common.loadTestDataSet()
    testMatrix1 = featureReducer.transform(testDataMatrix)
    testPredictions = bernoulliClf.predict(testMatrix1)

    print("Test predictions shape = " + str(testPredictions.shape))
    print("Test Estimates = " + str(testPredictions))
    common.writeResultsFile(testPredictions)