def evalOneFold(foldId):
    """Train and evaluate cost-sensitive models on one cross-validation fold.

    Loads the fold's train/unlabeled/test split, selects a feature set
    (all features when FULL_MODEL is truthy, otherwise an L1-based ordering),
    picks the model with the lowest estimated total cost, and returns the
    overall performance on the test data.

    Relies on module-level configuration: dataName, imputationMethod,
    USE_UNLABELED_DATA, FULL_MODEL, USE_L1, falsePositiveCost, targetRecall,
    classificationModelName, and the realdata / prepareFeatureSets /
    evaluation helper modules.
    """
    definedFeatureCosts = realdata.getFeaturesCosts(dataName)
    trainData, trainLabels, unlabeledData, testData, testLabels = realdata.loadSubset(dataName, None, foldId, imputationMethod)

    if USE_UNLABELED_DATA:
        # unlabeled data is required in this mode — fail fast if the fold has none
        assert(unlabeledData.shape[0] > 0)
    else:
        # replace unlabeled data with an empty (0 x d) array so vstack below still works
        unlabeledData = numpy.zeros((0, trainData.shape[1]))

    allData = numpy.vstack((trainData, unlabeledData))
    # one cost per feature column — sanity-check the cost vector against the data
    assert(definedFeatureCosts.shape[0] == allData.shape[1])

    print("training data size = ", trainData.shape[0])
    print("unlabeled data size = ", unlabeledData.shape[0])
    print("test data size = ", testData.shape[0])
    print("*****************************")
    print("foldId = ", foldId)
    print("*****************************")

    # NOTE(review): elsewhere in this file FULL_MODEL is the string "fullModel",
    # which is always truthy — presumably the driver rebinds it to a boolean per
    # variation before calling this function; verify against the caller.
    if FULL_MODEL:
        # full model: use every feature column
        bestFixedFeatures = numpy.arange(trainData.shape[1])
        # print("bestFixedFeatures = ", bestFixedFeatures)
        # assert(False)
        bestModel, misclassificationCosts, totalCostEstimate = prepareFeatureSets.getPredictionModelsAndCosts(trainData, trainLabels, bestFixedFeatures, definedFeatureCosts, falsePositiveCost, targetRecall, useTargetRecall = True, falseNegativeCost = None, classificationModelName = classificationModelName)
    else:
        if USE_L1:
            # order candidate feature sets via L1-regularized logistic regression
            allFeatureSetsInOrder, _ = prepareFeatureSets.getAllFeatureSetsInOrderWithL1LogReg(trainData, trainLabels, unlabeledData, None, definedFeatureCosts)
        else:
            # greedy feature-set ordering is not implemented in this code path
            print("NOT YET SUPPORTED !!")
            assert(False)
            # allFeatureSetsInOrder, allEstimatedTotalCosts = prepareFeatureSets.getAllFeatureSetsInOrderWithGreedyMethod(trainData, trainLabels, unlabeledData, misclassificationCosts, definedFeatureCosts)

        print("GET ALL PREDICTION MODEL AND DETERMINE FALSE NEGATIVE COSTS: ")
        # fit one model per candidate feature set and estimate its total cost
        allPredictionModels, allMisclassificationCosts, allEstimatedTotalCosts = prepareFeatureSets.getAllPredictionModelsAndCosts(trainData, trainLabels, allFeatureSetsInOrder, definedFeatureCosts, falsePositiveCost, targetRecall, useTargetRecall = True, falseNegativeCost = None, classificationModelName = classificationModelName)

        # pick the candidate with the lowest estimated total cost
        bestModelId = numpy.argmin(allEstimatedTotalCosts)
        bestModel = allPredictionModels[bestModelId]
        misclassificationCosts = allMisclassificationCosts[bestModelId]
        bestFixedFeatures = allFeatureSetsInOrder[bestModelId]

    return evaluation.getOverallPerformance_fixedCovariateSet(bestModel, testData, testLabels, definedFeatureCosts, misclassificationCosts, bestFixedFeatures, targetRecall)
# only used to compare with the results from previous work on Diabetes data assert (dataName == "pima_5foldCV") ALL_FALSE_POSITIVE_COSTS = [400, 800] elif COST_TYPE == "asymmetricCost": ALL_FALSE_POSITIVE_COSTS = constants.allFalsePositiveCosts else: assert (False) DYNAMIC = "dynamic" STATIC = "static" FULL_MODEL = "fullModel" allVariations = [FULL_MODEL, DYNAMIC, STATIC] startTimeTotal = time.time() definedFeatureCosts = realdata.getFeaturesCosts(dataName) trainedModelsFilenameNonLinearL1 = dataName + "_" + classificationModelName + "_nonLinearL1" with open( constants.MODEL_FOLDERNAME + trainedModelsFilenameNonLinearL1 + "_models", "rb") as f: allPredictionModelsNonLinearL1_allFolds = pickle.load(f) with open( constants.MODEL_FOLDERNAME + trainedModelsFilenameNonLinearL1 + "_probs", "rb") as f: allTrainingTrueProbsAllModelsNonLinearL1_allFolds = pickle.load(f) with open( constants.MODEL_FOLDERNAME + trainedModelsFilenameNonLinearL1 + "_features", "rb") as f: allFeatureArraysInOrderNonLinearL1_allFolds = pickle.load(f)