Example #1
    def _get_pos_neg_outcomes(self, proc_data, m):
        """
        Saves the probabilities of an attribute being a value, given outcome 0 or 1.  The 
        format of how the data is saved is explained below above the dict instantiations
        """

        print("Starting probability estimates")
        # Keys are attribute indices (as strings); each value is a list of probabilities indexed
        # by attribute value, giving P(attr = value | positive class)
        data_pred_pos_outcome = {}
        # Same structure, but conditioned on the negative class: P(attr = value | negative class)
        data_pred_neg_outcome = {}
        class_idx = utils._get_class_idx(proc_data)
        for attr_idx in range(len(proc_data.iloc[0, :])):
            p_data_given_y0 = self._p_data_given_y(proc_data, attr_idx,
                                                   class_idx, 0, m)
            p_data_given_y1 = self._p_data_given_y(proc_data, attr_idx,
                                                   class_idx, 1, m)
            # If either return is None, the attribute is the class label or an index column, which we skip
            if p_data_given_y0 is not None and p_data_given_y1 is not None:
                # Store list of probabilities corresponding to attr val under dict key of attr idx
                data_pred_pos_outcome[str(attr_idx)] = p_data_given_y1
                data_pred_neg_outcome[str(attr_idx)] = p_data_given_y0
        return data_pred_pos_outcome, data_pred_neg_outcome
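
The dictionaries above assume that _p_data_given_y returns, for each attribute, a list of m-estimate smoothed conditional probabilities indexed by attribute value. That helper is not shown here, so the following is only a minimal sketch of what such an estimate could look like, using pandas-style indexing on proc_data and a uniform prior p = 1 / (number of values):

def p_data_given_y_sketch(proc_data, attr_idx, class_idx, y, m):
    """Hypothetical helper: P(attr == v | class == y) for every value v, m-estimate smoothed."""
    subset = proc_data[proc_data.iloc[:, class_idx] == y]   # rows whose class label equals y
    n_values = int(proc_data.iloc[:, attr_idx].max()) + 1   # assumes values are integers 0..max
    p = 1.0 / n_values                                       # uniform prior used by the m-estimate
    n_y = len(subset)
    counts = subset.iloc[:, attr_idx].value_counts()
    return [(counts.get(v, 0) + m * p) / (n_y + m) for v in range(n_values)]
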
Example #2
 def __init__(self, data, validationType, bins, mEstimate):
     self.validationType = validationType
     self.bins = bins
     self.mEstimate = mEstimate
     proc_data = utils._convert_exampleset_to_dataframe(data)
     # Discretize all continuous attributes into bins
     self._convert_data(proc_data, bins)
     # Get P(Xi = xi | Y = y) for all Xi, xi, and y.  Specific data structure
     # storage details are discussed in helper methods
     data_pred_pos_outcome, data_pred_neg_outcome = self._get_pos_neg_outcomes(
         proc_data, mEstimate)
     # Store probabilities of attributes
     self.pos_outcomes = data_pred_pos_outcome
     self.neg_outcomes = data_pred_neg_outcome
     # Store class probability
     class_idx = utils._get_class_idx(proc_data)
     class_data = proc_data.iloc[:, class_idx]
     self.p_pos_class = self._get_n_class(class_data, 1) / len(class_data)
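
The stored attribute probabilities and class prior are exactly what a naive Bayes prediction combines. The class's actual predict method is not shown here; a minimal sketch of how one (already discretized) example could be scored under the naive Bayes assumption, working in log space to avoid underflow (the m-estimate keeps every stored probability strictly positive for m > 0), might look like this:

import math

def naive_bayes_score_sketch(example, pos_outcomes, neg_outcomes, p_pos_class):
    """Hypothetical scoring of one discretized example: returns P(Y = 1 | x)."""
    log_pos = math.log(p_pos_class)
    log_neg = math.log(1.0 - p_pos_class)
    for attr_idx, probs in pos_outcomes.items():            # keys are stringified attribute indices
        value = int(example[int(attr_idx)])
        log_pos += math.log(probs[value])
        log_neg += math.log(neg_outcomes[attr_idx][value])
    shift = max(log_pos, log_neg)                            # guard against underflow when exponentiating
    pos, neg = math.exp(log_pos - shift), math.exp(log_neg - shift)
    return pos / (pos + neg)
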
Example #3
 def __init__(self, data, validationType, bins, mEstimate,
              training_weights):
     self.validationType = validationType
     self.bins = bins
     self.mEstimate = mEstimate
     proc_data = data  # already processed in ensemble version
     # Discretize all continuous attributes into bins
     self._convert_data(proc_data, bins)
     # Get P(Xi = xi | Y = y) for all Xi, xi, and y.  Specific data structure
     # storage details are discussed in helper methods
     data_pred_pos_outcome, data_pred_neg_outcome = self._get_pos_neg_outcomes(
         proc_data, mEstimate, training_weights)
     # Store probabilities of attributes
     self.pos_outcomes = data_pred_pos_outcome
     self.neg_outcomes = data_pred_neg_outcome
     # Store class probability
     class_idx = utils._get_class_idx(proc_data)
     class_data = proc_data.iloc[:, class_idx]
     self.p_pos_class = self._get_n_class(
         class_data, 1, training_weights) / sum(training_weights)
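
The only difference from the unweighted constructor is that example counts become sums of training weights (both in _get_pos_neg_outcomes and in the class prior above). The weighted _get_n_class is not shown, so this is only an assumption about its behavior:

def get_n_class_weighted_sketch(class_data, label, training_weights):
    """Hypothetical weighted count: total weight of the examples whose class equals label."""
    # With this, p_pos_class = weighted positive mass / sum(training_weights), as above.
    return sum(w for value, w in zip(class_data, training_weights) if value == label)
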
Example #4
def logreg(schema, exampleSet, validationType, constant, k=5):
    if validationType == 0:
        # 5-Fold Stratified CROSS VALIDATION
        folds = stratified_split_data(schema, exampleSet, k)
        print("-------", k, "- Fold Stratified Cross Validation --------")

        total_acc = []
        total_prec = []
        total_recal = []
        total_original_results = []
        total_predictions = []
        for i in range(k):
            # Create the build (training) set from every fold except fold i
            buildSet = mldata.ExampleSet(schema)
            for j in range(k):
                if i != j:
                    for example in (folds[j]):
                        buildSet.append(example)
            print("Fold Iteration:", i)
            test = utils._convert_exampleset_to_dataframe(folds[i])
            class_idx = utils._get_class_idx(test)
            #classifier = NaiveBayes(buildSet, validationType, bins, Mestimate)
            classifier = LogisticRegression(buildSet, constant)
            predictions = classifier.predict(test)

            print("Calculating output of this fold.")
            original_results = []
            for l in range(len(test)):
                original_results.append(test.iloc[l, class_idx])
            TruePos = 0
            TrueNeg = 0
            FalsePos = 0
            FalseNeg = 0
            for m in range(len(predictions)):
                if predictions[m][1] == 1 and original_results[m] == 1:
                    TruePos += 1
                elif predictions[m][1] == 0 and original_results[m] == 0:
                    TrueNeg += 1
                elif predictions[m][1] == 1 and original_results[m] == 0:
                    FalsePos += 1
                elif predictions[m][1] == 0 and original_results[m] == 1:
                    FalseNeg += 1
                else:
                    print("YOU MESSED UP:", i)
            assert len(predictions) == (
                TrueNeg + TruePos + FalseNeg + FalsePos
            ), "...OH NO, Sum of results doesn't equal num of results..."

            total_acc.append((TrueNeg + TruePos) /
                             (TrueNeg + TruePos + FalseNeg + FalsePos))
            print("Error for fold: " +
                  str(1 - (TrueNeg + TruePos) /
                      (TrueNeg + TruePos + FalseNeg + FalsePos)))
            if TruePos + FalsePos > 0:
                total_prec.append((TruePos) / (TruePos + FalsePos))
            elif TruePos + FalsePos + FalseNeg == 0:
                total_prec.append(1)
            else:
                total_prec.append(0)
            if TruePos + FalseNeg > 0:
                total_recal.append((TruePos) / (TruePos + FalseNeg))
            elif TruePos + FalsePos + FalseNeg == 0:
                total_recal.append(1)
            else:
                total_recal.append(0)
            if i == 0:
                total_predictions = predictions
                total_original_results = original_results
            else:
                total_predictions = np.concatenate(
                    (total_predictions, predictions), axis=0)
                total_original_results = np.concatenate(
                    (total_original_results, original_results), axis=0)

        # After all folds are done, sweep decision thresholds to build the ROC curve
        TPR = []
        FPR = []
        increment = 0.1
        threshold = 1.0
        while threshold >= 0:
            TP = 0
            FP = 0
            TN = 0
            FN = 0
            for i in range(0, len(total_predictions)):
                if total_predictions[i][
                        0] >= threshold and total_original_results[i] == 1:
                    TP += 1
                elif total_predictions[i][
                        0] >= threshold and total_original_results[i] == 0:
                    FP += 1
                elif total_predictions[i][
                        0] < threshold and total_original_results[i] == 1:
                    FN += 1
                elif total_predictions[i][
                        0] < threshold and total_original_results[i] == 0:
                    TN += 1
                else:
                    print("YOU MESSED UP:", i, total_predictions[i],
                          total_original_results[i])
            assert len(total_predictions) == (
                TN + TP + FN + FP), "...OH NO, pred doesn't equal original..."

            TPR.append(TP / (TP + FN))
            FPR.append(FP / (FP + TN))
            threshold -= increment
        print("TPR: ", TPR)
        print("FPR: ", FPR)

        AUR = 0.0

        for trap in range(0, len(TPR) - 1):
            xDis = (FPR[trap + 1] - FPR[trap])
            yDis = (TPR[trap] + TPR[trap + 1]) / 2
            AUR += xDis * yDis

        if AUR < 0.5:
            print("1 - AUR used")
            AUR = 1.0 - AUR

        avg_acc = np.average(total_acc)
        avg_pre = np.average(total_prec)
        avg_rec = np.average(total_recal)

        std_acc = np.std(total_acc)
        std_pre = np.std(total_prec)
        std_rec = np.std(total_recal)

        print("===== Folds Complete =====")
        print("Average Accuracy   :", round(avg_acc, 3), round(std_acc, 3))
        print("Average Precision  :", round(avg_pre, 3), round(std_pre, 3))
        print("Average Recall     :", round(avg_rec, 3), round(std_rec, 3))
        print("Area Under ROC     :", round(AUR, 3))

    elif validationType == 1:
        print(
            "------- NO Cross Validation: Running on Full Example Set --------"
        )
        #NO CROSS VALIDATION
        total_acc = 0.0
        total_prec = 0.0
        total_recal = 0.0
        test = utils._convert_exampleset_to_dataframe(exampleSet)
        class_idx = utils._get_class_idx(test)
        #classifier = NaiveBayes(exampleSet, validationType, bins, Mestimate)
        classifier = LogisticRegression(exampleSet, constant)
        predictions = classifier.predict(test)

        print("Calculating output")
        original_results = []
        for l in range(len(test)):
            original_results.append(test.iloc[l, class_idx])
        TruePos = 0
        TrueNeg = 0
        FalsePos = 0
        FalseNeg = 0
        for m in range(len(predictions)):
            if predictions[m][1] == 1 and original_results[m] == 1:
                TruePos += 1
            elif predictions[m][1] == 0 and original_results[m] == 0:
                TrueNeg += 1
            elif predictions[m][1] == 1 and original_results[m] == 0:
                FalsePos += 1
            elif predictions[m][1] == 0 and original_results[m] == 1:
                FalseNeg += 1
            else:
                print("YOU MESSED UP:", i)
        assert len(predictions) == (
            TrueNeg + TruePos + FalseNeg + FalsePos
        ), "...OH NO, Sum of results doesn't equal num of results..."

        total_acc = (TrueNeg + TruePos) / (TrueNeg + TruePos + FalseNeg +
                                           FalsePos)
        total_prec = (TruePos) / (TruePos + FalsePos)
        total_recal = (TruePos) / (TruePos + FalseNeg)

        # Sweep decision thresholds to build the ROC curve
        TPR = []
        FPR = []
        increment = 0.1
        threshold = 1.0
        while threshold >= 0:
            TP = 0
            FP = 0
            TN = 0
            FN = 0
            for i in range(0, len(predictions)):
                if predictions[i][0] >= threshold and original_results[i] == 1:
                    TP += 1
                elif predictions[i][0] >= threshold and original_results[
                        i] == 0:
                    FP += 1
                elif predictions[i][0] < threshold and original_results[i] == 1:
                    FN += 1
                elif predictions[i][0] < threshold and original_results[i] == 0:
                    TN += 1
                else:
                    print("YOU MESSED UP:", i, predictions[i],
                          original_results[i])
            assert len(predictions) == (
                TN + TP + FN + FP), "...OH NO, pred doesn't equal original..."

            TPR.append(TP / (TP + FN))
            FPR.append(FP / (FP + TN))
            threshold -= increment

        print("TPR: ", TPR)
        print("FPR: ", FPR)

        AUR = 0.0

        for trap in range(0, len(TPR) - 1):
            xDis = (FPR[trap + 1] - FPR[trap])
            yDis = (TPR[trap] + TPR[trap + 1]) / 2
            AUR += xDis * yDis

        if AUR < 0.5:
            print("1 - AUR used")
            AUR = 1.0 - AUR

        print("===== Run Complete =====")
        print("Average Accuracy   :", round(total_acc, 3))
        print("Average Precision  :", round(total_prec, 3))
        print("Average Recall     :", round(total_recal, 3))
        print("Area Under ROC     :", round(AUR, 3))

    else:
        print("Incorrect validation type argument given.")