Example #1
def stratified_split_data(schema, exampleSet, numFolds):
    posData = mldata.ExampleSet(schema)
    negData = mldata.ExampleSet(schema)
    np.random.seed(PRNG)
    folds = [mldata.ExampleSet(schema) for i in range(numFolds)]

    #split input by label
    for example in exampleSet:
        if example[len(example) - 1] == 1:
            posData.append(example)
        else:
            negData.append(example)

    while len(posData) > 0:
        for i in range(numFolds):
            if len(posData) == 0:
                break
            x = np.random.randint(0,
                                  len(posData))  #get random index on the input
            folds[i].append(posData[x])  #add the element to the fold
            del posData[x]  #remove from the input set

    while len(negData) > 0:
        for i in range(numFolds):
            if len(negData) == 0:
                break
            x = np.random.randint(0,
                                  len(negData))  #get random index on the input
            folds[i].append(negData[x])  #add the element to the fold
            del negData[x]  #remove from the input set

    return folds
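
The same round-robin stratification can be sketched without mldata. A minimal standalone version on plain Python lists (toy tuples whose last element is the label; all names here are illustrative only):

import random

def round_robin_stratified(examples, num_folds):
    #Group examples by class label, then deal each group
    #round-robin into the folds, mirroring the loops above.
    folds = [[] for _ in range(num_folds)]
    by_label = {}
    for ex in examples:
        by_label.setdefault(ex[-1], []).append(ex)
    for group in by_label.values():
        random.shuffle(group)  #random draw order, like np.random.randint above
        for i, ex in enumerate(group):
            folds[i % num_folds].append(ex)
    return folds

data = [(i, i % 3 == 0) for i in range(20)]  #7 positives, 13 negatives
for fold in round_robin_stratified(data, 5):
    print(sum(ex[-1] for ex in fold), "positive(s) of", len(fold))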
Example #2
def fold_cv(full_dataset,
            num_folds):  #Divide full_dataset into num_folds stratified folds
    #Separate the full_dataset into two sets in terms of the label
    true_set = mldata.ExampleSet(ex for ex in full_dataset if ex[-1] == True)
    false_set = mldata.ExampleSet(ex for ex in full_dataset if ex[-1] == False)
    shuffle(true_set)
    shuffle(false_set)

    #Calculate the length of each set
    true_len = len(true_set)
    true_len_part = true_len / num_folds
    false_len = len(false_set)
    false_len_part = false_len / num_folds

    datasets = []

    for i in range(num_folds):
        dataset = mldata.ExampleSet()
        for j in range(int(i * true_len_part), int((i + 1) * true_len_part)):
            dataset.append(true_set[j])
        for j in range(int(i * false_len_part), int((i + 1) * false_len_part)):
            dataset.append(false_set[j])
        datasets.append(dataset)

    return datasets
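
One subtlety above: the int(i * part) slice boundaries still cover every index exactly once even when the per-fold size is fractional. A quick standalone check (toy sizes):

def fold_bounds(n, num_folds):
    #Reproduce the slicing arithmetic used in fold_cv above.
    part = n / num_folds
    return [list(range(int(i * part), int((i + 1) * part)))
            for i in range(num_folds)]

bounds = fold_bounds(13, 5)  #13 examples into 5 folds
print(bounds)
print(sorted(i for fold in bounds for i in fold) == list(range(13)))  #True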
Example #3
def get_train_test_split(folds: Sequence, test_fold_ind: int) -> Tuple:
	"""Creates the training and test sets from dataset folds.
	Args:
		folds: A sequence of sequences, each being a fold of the data.
		test_fold_ind: Index of the test set fold.

	Returns:
		A training set and test set.
	"""
	train_folds = [folds[i] for i in range(len(folds)) if i != test_fold_ind]
	train_set = mldata.ExampleSet(functools.reduce(operator.add, train_folds))
	test_set = mldata.ExampleSet(folds[test_fold_ind])
	return train_set, test_set
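
The functools.reduce(operator.add, ...) call simply concatenates the non-test folds. The idiom in isolation, on plain lists (toy data, no mldata needed):

import functools
import operator

folds = [[1, 2], [3, 4], [5, 6]]
test_fold_ind = 1
train_folds = [folds[i] for i in range(len(folds)) if i != test_fold_ind]
print(functools.reduce(operator.add, train_folds))  #[1, 2, 5, 6]
print(folds[test_fold_ind])  #[3, 4]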
Example #4
def boost_data(data, weights):
	ints = correctratios(weights)
	replicated_data = []
	for i, count in enumerate(ints):
		replicated_data.extend([data[i]] * count)
	eset = mldata.ExampleSet(replicated_data)
	return eset
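
correctratios is defined elsewhere in this project and is not shown here. A plausible, purely hypothetical stand-in converts boosting weights to integer replication counts by scaling against the smallest positive weight:

def correct_ratios(weights):
    #Hypothetical sketch, not the project's actual correctratios():
    #scale by the smallest positive weight and round, so replication
    #counts roughly preserve the weight ratios.
    smallest = min(w for w in weights if w > 0)
    return [int(round(w / smallest)) for w in weights]

print(correct_ratios([0.1, 0.2, 0.1, 0.4]))  #[1, 2, 1, 4]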
Example #5
def build_trees(datasets):  #Build a tree per fold, using the remaining folds as training data
    #Initialize lists to save outputs
    trees = []
    sizes = []
    max_depths = []
    first_features = []
    accs = []

    #Build each tree and output results
    for i in range(5):
        train_data = mldata.ExampleSet()
        for j in range(1, 5):
            for index in range(len(datasets[(i + j) % 5])):
                train_data.append(datasets[(i + j) % 5][index])
        val_data = datasets[i]
        shuffle(train_data)
        shuffle(val_data)
        tree = build_tree.build_DecisionTree(MAX_DEPTH, EPS, train_data,
                                             ENABLE_GAIN)
        size = tree.get_tree_size()
        max_depth = tree.get_tree_depth()
        trees.append(tree)
        sizes.append(size)
        max_depths.append(max_depth)
        first_feature_index = tree.get_root().get_attriIndex()
        first_feature = train_data.schema.features[first_feature_index].name
        first_features.append(first_feature)
        acc = tree.classify_dataset(val_data)
        accs.append(acc)
        print(
            'Tree %d:\n\nAccuracy: %.4f\n\nSize: %d\n\nMaximum Depth: %d\n\nFirst Feature: %s'
            % (i + 1, acc, size, max_depth, first_feature))

    return trees, sizes, first_features, accs, max_depths
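
The (i + j) % 5 indexing gathers the four folds other than the validation fold i. The rotation in isolation:

for i in range(5):
    train_folds = sorted((i + j) % 5 for j in range(1, 5))
    print("validate on fold", i, "- train on folds", train_folds)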
Example #6
def cross_logreg(original_data):
    datasets = fold_5_cv(original_data)
    accuracies = []
    precisions = []
    recalls = []
    for i in range(5):
        train_data = mldata.ExampleSet()
        for j in range(1, 5):
            for index in range(len(datasets[(i + j) % 5])):
                train_data.append(datasets[(i + j) % 5][index])
        val_data = datasets[i]
        shuffle(train_data)
        shuffle(val_data)
        lg = Logistic_Regression(lambdaa=LAMBDA,
                                 training_data=train_data,
                                 iteration=ITER,
                                 learning_rate=LR)
        predictions, true_label = lg.classify_data(val_data)
        accuracy, precision, recall = get_results(predictions, true_label)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        print(
            "Classifier %d:\nAccuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\n" %
            (i + 1, accuracy, precision, recall))
    return accuracies, precisions, recalls
Example #7
def standardize(data: mldata.ExampleSet) -> mldata.ExampleSet:
    """Standardizes (center and scales) all continuous features.
	Args:
		data: Collection of examples to pre-process.

	Returns:
		An ExampleSet with standardized continuous features.
	"""
    continuous_exs = mlutil.get_feature_examples(data=data,
                                                 feature_types={CONTINUOUS},
                                                 as_dict=True,
                                                 index_as_key=True)
    if len(continuous_exs) == 0:
        return data
    standardized = {i: stats.zscore(exs) for i, exs in continuous_exs.items()}
    examples = []
    for e, ex_val in enumerate(data):
        example = mldata.Example(data.schema)
        # f is the feature number; e is the example index
        example.features = [
            standardized[f][e] if f in standardized else ex_val[f]
            for f in range(len(data.schema))
        ]
        examples.append(example)
    example_set = mldata.ExampleSet(data.schema)
    example_set.extend(examples)
    return example_set
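
stats.zscore here is scipy.stats.zscore, which by default subtracts the mean and divides by the population standard deviation (ddof=0). A hand-rolled equivalent on a plain list of values:

import statistics

def zscore(values):
    #Center on the mean, scale by the population standard deviation,
    #matching scipy.stats.zscore's default ddof=0.
    mean = statistics.fmean(values)
    std = statistics.pstdev(values)
    return [(v - mean) / std for v in values]

print(zscore([1.0, 2.0, 3.0, 4.0]))  #symmetric about 0, unit variance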
Example #8
 def predict(self, data: mldata.ExampleSet) -> Tuple:
     if self.model is None:
         predictions = tuple()
     else:
         predictions = tuple(
             self._predict_example(mldata.ExampleSet([example]), self.model)
             for example in data)
     return predictions
Example #9
def dtree(exampleSet, validationType, depth, splitCriterion, k=5):
    e = ns.EntropySelector(exampleSet)

    if validationType == 0:
        # 5-Fold Stratified CROSS VALIDATION
        folds = stratified_split_data(exampleSet, k)
        print("-------", k, "- Fold Stratified Cross Validation --------")

        total_acc = 0

        for i in range(k):
            #Create the buildSet
            buildSet = mldata.ExampleSet()
            for j in range(k):
                if i != j:
                    for example in folds[j]:
                        buildSet.append(example)
            #Build tree and output for each fold
            #print(buildSet)
            tree = dt.build_tree(buildSet, e, depth, splitCriterion)
            acc = accuracy(tree, folds[i])
            print("Fold Iteration:", i)
            print("Accuracy     :", acc)
            print("Size         :", tree.size)
            print("Maximum Depth:", tree.depth)
            print("First Feature:", tree.headnode.name)
            total_acc += acc

        print("Average Accuracy:", total_acc / k)

    elif validationType == 1:
        print(
            "------- NO Cross Validation: Running on Full Example Set --------"
        )
        #NO CROSS VALIDATION
        tree = dt.build_tree(exampleSet, e, depth, splitCriterion)
        print("Accuracy     :", accuracy(tree, exampleSet))
        print("Size         :", tree.size)
        print("Maximum Depth:", tree.depth)
        print("First Feature:", tree.headnode.name)
    else:
        print("Incorrect validation type argument given.")
Example #10
def naive_bayes_cv(datasets, min_and_max):
    accuracies = []
    precisions = []
    recalls = []
    for i in range(5):
        train_data = mldata.ExampleSet()
        for j in range(1, 5):
            for index in range(len(datasets[(i + j) % 5])):
                train_data.append(datasets[(i + j) % 5][index])
        val_data = datasets[i]
        shuffle(train_data)
        shuffle(val_data)
        label_ratio, save_all_prob, save_all_threshold = Naive_Bayes.showme_dataset(
            train_data, NUM_BINS, M, min_and_max)
        accuracy, precision, recall = compute_test_results(
            label_ratio, save_all_prob, val_data)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        print(
            "Classifier %d:\nAccuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\n" %
            (i + 1, accuracy, precision, recall))

    return accuracies, precisions, recalls
Example #11
def remove_near_zero_variance(data: mldata.ExampleSet,
                              cut_off: float = 0.1) -> mldata.ExampleSet:
    """Removes all discrete features that have "low" variance.
	Args:
		data: Collection of examples to pre-process.
		cut_off: Features with a variance below this value will be removed
			from the data.

	Returns:
		Filtered data that does not include near-zero variance features.
	"""
    def discrete_var(values: Sequence) -> float:
        encoding = {k: v for v, k in enumerate(set(values))}
        return statistics.variance(encoding[v] for v in values)

    discrete_exs = mlutil.get_feature_examples(data=data,
                                               feature_types={BINARY, NOMINAL},
                                               as_dict=True,
                                               index_as_key=True)
    if len(discrete_exs) == 0:
        return data
    near_zeros = {
        i
        for i, exs in discrete_exs.items() if discrete_var(exs) <= cut_off
    }
    enumerated_subset_schema = [(i, f) for i, f in enumerate(data.schema)
                                if i not in near_zeros]
    subset_schema = [feature for _, feature in enumerated_subset_schema]
    examples = []
    for ex in data:
        example = mldata.Example(subset_schema)
        example.features = [ex[i] for i, _ in enumerated_subset_schema]
        examples.append(example)
    subset = mldata.ExampleSet()
    subset.extend(examples)
    return subset
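
The discrete_var helper label-encodes each distinct nominal value as an integer and takes the sample variance of the codes, so a near-constant feature scores near zero. The helper in isolation, on toy values:

import statistics

def discrete_var(values):
    encoding = {k: v for v, k in enumerate(set(values))}
    return statistics.variance(encoding[v] for v in values)

print(discrete_var(["a"] * 99 + ["b"]))  #0.01, below a 0.1 cut-off
print(discrete_var(["a", "b"] * 50))     #~0.2525, kept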
Example #12
def logreg(schema, exampleSet, validationType, constant, k=5):
    if validationType == 0:
        # 5-Fold Stratified CROSS VALIDATION
        folds = stratified_split_data(schema, exampleSet, k)
        print("-------", k, "- Fold Stratified Cross Validation --------")

        total_acc = []
        total_prec = []
        total_recal = []
        total_original_results = []
        total_predictions = []
        for i in range(k):
            #Create the buildSet
            buildSet = mldata.ExampleSet(schema)
            for j in range(k):
                if i != j:
                    for example in (folds[j]):
                        buildSet.append(example)
            print("Fold Iteration:", i)
            test = utils._convert_exampleset_to_dataframe(folds[i])
            class_idx = utils._get_class_idx(test)
            #classifier = NaiveBayes(buildSet, validationType, bins, Mestimate)
            classifier = LogisticRegression(buildSet, constant)
            predictions = classifier.predict(test)

            print("Calculating output of this fold.")
            original_results = []
            for l in range(len(test)):
                original_results.append(test.iloc[l, class_idx])
            TruePos = 0
            TrueNeg = 0
            FalsePos = 0
            FalseNeg = 0
            for m in range(len(predictions)):
                if predictions[m][1] == 1 and original_results[m] == 1:
                    TruePos += 1
                elif predictions[m][1] == 0 and original_results[m] == 0:
                    TrueNeg += 1
                elif predictions[m][1] == 1 and original_results[m] == 0:
                    FalsePos += 1
                elif predictions[m][1] == 0 and original_results[m] == 1:
                    FalseNeg += 1
                else:
                    print("YOU MESSED UP:", i)
            assert len(predictions) == (
                TrueNeg + TruePos + FalseNeg + FalsePos
            ), "...OH NO, Sum of results doesn't equal num of results..."

            total_acc.append((TrueNeg + TruePos) /
                             (TrueNeg + TruePos + FalseNeg + FalsePos))
            print("Error for fold: " +
                  str(1 - (TrueNeg + TruePos) /
                      (TrueNeg + TruePos + FalseNeg + FalsePos)))
            if TruePos + FalsePos > 0:
                total_prec.append((TruePos) / (TruePos + FalsePos))
            elif TruePos + FalsePos + FalseNeg == 0:
                total_prec.append(1)
            else:
                total_prec.append(0)
            if TruePos + FalseNeg > 0:
                total_recal.append((TruePos) / (TruePos + FalseNeg))
            elif TruePos + FalsePos + FalseNeg == 0:
                total_recal.append(1)
            else:
                total_recal.append(0)
            if i == 0:
                total_predictions = predictions
                total_original_results = original_results
            else:
                total_predictions = np.concatenate(
                    (total_predictions, predictions), axis=0)
                total_original_results = np.concatenate(
                    (total_original_results, original_results), axis=0)

        #after folds are done
        TPR = []
        FPR = []
        increment = 0.1
        threshold = 1.0
        while threshold >= 0:
            TP = 0
            FP = 0
            TN = 0
            FN = 0
            for i in range(len(total_predictions)):
                score = total_predictions[i][0]
                actual = total_original_results[i]
                if score >= threshold and actual == 1:
                    TP += 1
                elif score >= threshold and actual == 0:
                    FP += 1
                elif score < threshold and actual == 1:
                    FN += 1
                elif score < threshold and actual == 0:
                    TN += 1
                else:
                    print("YOU MESSED UP:", i, total_predictions[i],
                          total_original_results[i])
            assert len(total_predictions) == (
                TN + TP + FN + FP), "...OH NO, pred doesn't equal original..."

            TPR.append(TP / (TP + FN))
            FPR.append(FP / (FP + TN))
            threshold -= increment
        print("TPR: ", TPR)
        print("FPR: ", FPR)

        AUR = 0.0

        for trap in range(0, len(TPR) - 1):
            xDis = (FPR[trap + 1] - FPR[trap])
            yDis = (TPR[trap] + TPR[trap + 1]) / 2
            AUR += xDis * yDis

        if AUR < 0.5:
            print("1 - AUR used")
            AUR = 1.0 - AUR

        avg_acc = np.average(total_acc)
        avg_pre = np.average(total_prec)
        avg_rec = np.average(total_recal)

        std_acc = np.std(total_acc)
        std_pre = np.std(total_prec)
        std_rec = np.std(total_recal)

        print("===== Folds Complete =====")
        print("Average Accuracy   :", round(avg_acc, 3), round(std_acc, 3))
        print("Average Precision  :", round(avg_pre, 3), round(std_pre, 3))
        print("Average Recall     :", round(avg_rec, 3), round(std_rec, 3))
        print("Area Under ROC     :", round(AUR, 3))

    elif validationType == 1:
        print(
            "------- NO Cross Validation: Running on Full Example Set --------"
        )
        #NO CROSS VALIDATION
        total_acc = 0.0
        total_prec = 0.0
        total_recal = 0.0
        test = utils._convert_exampleset_to_dataframe(exampleSet)
        class_idx = utils._get_class_idx(test)
        #classifier = NaiveBayes(exampleSet, validationType, bins, Mestimate)
        classifier = LogisticRegression(exampleSet, constant)
        predictions = classifier.predict(test)

        print("Calculating output")
        original_results = []
        for l in range(len(test)):
            original_results.append(test.iloc[l, class_idx])
        TruePos = 0
        TrueNeg = 0
        FalsePos = 0
        FalseNeg = 0
        for m in range(len(predictions)):
            if predictions[m][1] == 1 and original_results[m] == 1:
                TruePos += 1
            elif predictions[m][1] == 0 and original_results[m] == 0:
                TrueNeg += 1
            elif predictions[m][1] == 1 and original_results[m] == 0:
                FalsePos += 1
            elif predictions[m][1] == 0 and original_results[m] == 1:
                FalseNeg += 1
            else:
                print("YOU MESSED UP:", i)
        assert len(predictions) == (
            TrueNeg + TruePos + FalseNeg + FalsePos
        ), "...OH NO, Sum of results doesn't equal num of results..."

        total_acc = (TrueNeg + TruePos) / (TrueNeg + TruePos + FalseNeg +
                                           FalsePos)
        total_prec = (TruePos) / (TruePos + FalsePos)
        total_recal = (TruePos) / (TruePos + FalseNeg)

        #after folds are done
        TPR = []
        FPR = []
        increment = 0.1
        threshold = 1.0
        while threshold >= 0:
            TP = 0
            FP = 0
            TN = 0
            FN = 0
            for i in range(len(predictions)):
                score = predictions[i][0]
                actual = original_results[i]
                if score >= threshold and actual == 1:
                    TP += 1
                elif score >= threshold and actual == 0:
                    FP += 1
                elif score < threshold and actual == 1:
                    FN += 1
                elif score < threshold and actual == 0:
                    TN += 1
                else:
                    print("YOU MESSED UP:", i, predictions[i],
                          original_results[i])
            assert len(predictions) == (
                TN + TP + FN + FP), "...OH NO, pred doesn't equal original..."

            TPR.append(TP / (TP + FN))
            FPR.append(FP / (FP + TN))
            threshold -= increment

        print("TPR: ", TPR)
        print("FPR: ", FPR)

        AUR = 0.0

        for trap in range(0, len(TPR) - 1):
            xDis = (FPR[trap + 1] - FPR[trap])
            yDis = (TPR[trap] + TPR[trap + 1]) / 2
            AUR += xDis * yDis

        if AUR < 0.5:
            print("1 - AUR used")
            AUR = 1.0 - AUR

        print("===== Run Complete =====")
        print("Average Accuracy   :", round(total_acc, 3))
        print("Average Precision  :", round(total_prec, 3))
        print("Average Recall     :", round(total_recal, 3))
        print("Area Under ROC     :", round(AUR, 3))

    else:
        print("Incorrect validation type argument given.")
Example #13
 def _partition_data(data: mldata.ExampleSet, feature: mldata.Feature,
                     test: Callable[[Any], bool]) -> Tuple:
     idx = mlutil.get_feature_index(data, feature)
     left_data = mldata.ExampleSet([e for e in data if test(e[idx])])
     right_data = mldata.ExampleSet([e for e in data if not test(e[idx])])
     return left_data, right_data
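
The same predicate split works on any sequence. A minimal sketch on a plain list (hypothetical test function):

def partition(items, test):
    #Split items into those passing the test and the rest,
    #mirroring the left/right split above.
    left = [x for x in items if test(x)]
    right = [x for x in items if not test(x)]
    return left, right

left, right = partition(list(range(10)), lambda v: v < 5)
print(left, right)  #[0, 1, 2, 3, 4] [5, 6, 7, 8, 9]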
Example #14
File: bag.py Project: tmaidment/EECS440
 def _sample_dtree(self, dataset):
     newDataset = mldata.ExampleSet()
     for i in range(len(dataset)):
         samplingIdx = random.randint(0, len(dataset) - 1)
         newDataset.append(dataset[samplingIdx])
     return newDataset
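
This is a standard bootstrap sample: len(dataset) draws with replacement, as used for bagging. random.choices expresses the same loop in one call:

import random

def bootstrap_sample(dataset):
    #Equivalent to the randint loop above: uniform draws with replacement.
    return random.choices(dataset, k=len(dataset))

print(bootstrap_sample(list(range(10))))  #duplicates are expected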
Example #15
def cv(datasets):
    accuracies = []
    precisions = []
    recalls = []
    for i in range(5):
        train_data = mldata.ExampleSet()
        for j in range(1, 5):
            for index in range(len(datasets[(i + j) % 5])):
                train_data.append(datasets[(i + j) % 5][index])
        val_data = datasets[i]
        shuffle(train_data)
        shuffle(val_data)
        if (P != 0):
            for data in train_data:
                if (random.random() <= P):
                    data[-1] = not data[-1]  #inject label noise: flip the class label
        if (ALGORITHM == 1):
            weight = 1 / len(train_data) * np.ones(len(train_data))
            weight = weight.reshape(-1, 1)
            alpha_list, label_list = build_tree_boosting.boosting(
                MAX_DEPTH, EPS, train_data, val_data, ENABLE_GAIN, ITER,
                weight)
            f_list = compute_f_list(alpha_list, label_list)
            accuracy, precision, recall = compute_test_results(
                val_data, f_list)
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            ROC_area = compute_ROC_area()
            print(
                "Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nArea under ROC: %.3f\n"
                % (accuracy, precision, recall, ROC_area))
        elif (ALGORITHM == 2):
            alpha_list, label_list = naive_gayes.naive_bayes(
                train_data, val_data, ITER, NUM_BINS, M)
            f_list = compute_f_list(alpha_list, label_list)
            accuracy, precision, recall = compute_test_results(
                val_data, f_list)
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            print(
                "Classifier %d:\nAccuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\n"
                % (i + 1, accuracy, precision, recall))
        elif (ALGORITHM == 3):
            lg = logreg.Logistic_Regression(lambdaa=LAMBDA,
                                            training_data=train_data,
                                            iteration=100,
                                            learning_rate=LR,
                                            boosting=True)
            lg, alpha_list, label_list = update_lg(lg, val_data)
            f_list = compute_f_list(alpha_list, label_list)
            accuracy, precision, recall = compute_test_results(
                val_data, f_list)
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            print(
                "Classifier %d:\nAccuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\n"
                % (i + 1, accuracy, precision, recall))

    return accuracies, precisions, recalls
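
The P block near the top of cv injects class-label noise by flipping each training label with probability P. The flip in isolation, on a plain list of boolean labels:

import random

def flip_labels(labels, p):
    #Flip each boolean label with probability p (class-label noise).
    return [(not y) if random.random() <= p else y for y in labels]

random.seed(0)
print(flip_labels([True] * 10, 0.3))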