Example #1
def main(dataset,
         data_path,
         use_cv,
         max_depth,
         use_info_gain: int,
         weights=None):
    # only relevant when running the experiment
    partition_count = [3, 5, 7, 10] if use_info_gain < 0 else [1]
    data = mldata.parse_c45(dataset, data_path)
    #data = mldata.ExampleSet([e for i,e in enumerate(data) if i < 1000])
    if use_info_gain >= 1:
        split_criteria = metrics.info_gain
    elif use_info_gain == 0:
        split_criteria = metrics.gain_ratio
    else:
        split_criteria = metrics.stochastic_information_gain
    for z in partition_count:  # run the experiment
        if len(partition_count) > 1:
            print(f'\nrunning experiment with {z} partitions')
        learner = algorithm.ID3(max_depth=max_depth,
                                split_function=split_criteria,
                                partitions=z,
                                boost_weights=weights)
        if weights is not None:
            return learner, data
        else:
            run(use_cv, data, learner)
Example #2
    def __init__(self,
                 max_depth: int,
                 path: str,
                 criterion: str,
                 cv: bool = False):
        super().__init__()
        self.max_depth = max_depth
        self.criterion = criterion
        self.cv = cv

        # load data
        temp = path.split("/")
        file_base = temp[-1]

        # print(file_base, path)
        data = parse_c45(file_base, path)
        self.A = []
        self.X = []
        self.classes = []

        for index, column in enumerate(data.schema):
            if index == 0:
                continue
            if column.type == "CLASS":
                class_idx = index
            else:
                self.A.append((column.name, column.type))
        for sample in data:
            self.X.append(sample[1:class_idx] + sample[class_idx + 1:])
            self.classes.append(sample[class_idx])
        self.D = (self.X, self.classes)
        self.attr2idx = dict()
        for index, attr in enumerate(self.A):
            self.attr2idx[attr[0]] = index
        self.root = None
Example #3
def main():
    """
    run the decision tree with param given by user
    ----------
    """
    file_path, use_full_sample, max_depth, use_gain_ratio = sys.argv[1:5]
    # parse args
    use_full_sample, max_depth, use_gain_ratio = (int(use_full_sample),
                                                  int(max_depth),
                                                  int(use_gain_ratio))
    # parse dataset
    raw_parsed = mldata.parse_c45(file_path.split(os.sep)[-1], file_path)
    examples = np.array(raw_parsed, dtype=object)
    samples = examples[:, 1:-1]
    targets = examples[:, -1]
    # grow an effectively unbounded tree (guarantees a full tree) if max_depth is given as 0
    if max_depth == 0:
        max_depth = int(1e9)
    # run on full sample
    if use_full_sample:
        dt = ID3(max_depth, use_gain_ratio)
        dt.fit(samples, targets)
    else:
        dt = ID3(max_depth, use_gain_ratio)
        print("Accuracy: ", str(k_fold_cv(dt, examples, K)))
    print("Size: ", str(dt.size))
    print("Maximum Depth: ", str(dt.max_depth))
    print("First Feature: ",
          str(raw_parsed.examples[0].schema.features[dt.attr_idx + 1].name))
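Since main() reads its four options straight from sys.argv, a hypothetical invocation might look like the following (the script name and dataset path are assumptions, not from the original project):

python dtree.py ../data/voting 1 2 0  # file_path, use_full_sample, max_depth, use_gain_ratio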
Example #4
File: bag.py  Project: sashrobo0918/CWRU
def get_dataset(file_path):
    """
    parse the dataset stored in the input file path
    ----------
    file_path : String
        the path to the dataset
    """
    raw_parsed = mldata.parse_c45(file_path.split(os.sep)[-1], file_path)
    return np.array(raw_parsed, dtype=object)
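A minimal usage sketch for get_dataset (the '../voting' path and the id/label column layout follow the other examples on this page, not this project's documentation):

examples = get_dataset('../voting')
features = examples[:, 1:-1]  # drop the leading example id and the trailing class label
labels = examples[:, -1]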
Example #5
def read_data(path, n_bin=3):
    prob_name = path.split('/')[-1]
    datafile = path + '/' + prob_name + '.data'
    # data = np.loadtxt(datafile, delimiter=',', dtype=str)
    data = parse_c45(prob_name, path)
    data = np.asarray(data.to_float())
    # print(data)
    X = data[:, 1:-1]
    X = process(X, prob_name, n_bin)
    y = data[:, -1].astype(int)
    return X, y
Example #6
def main(problem_name, max_depth=0):
    example_set = md.parse_c45(problem_name, '../data')
    random.seed(12345)
    random.shuffle(example_set)
    training_set = example_set[:4 * len(example_set) // 5]
    validation_set = example_set[4 * len(example_set) // 5:]
    feature_indices = [i for i in range(1, len(example_set.schema.features[1:-1]))]
    dtree = DecisionTree(training_set, example_set.schema, feature_indices, max_depth=max_depth)
    accuracy = dtree.get_accuracy(validation_set)
    print("Accuracy: {}".format(accuracy))
    tree_size, tree_depth = dtree.get_size_and_depth()
    print("Size: {}".format(tree_size))
    print("Maximum Depth: {}".format(tree_depth))
Example #7
def get_svm_inputs():
    parser = argparse.ArgumentParser(description="SVM Classifier")
    parser.add_argument('data_file_name')
    parser.add_argument('c', type=float)
    args = parser.parse_args()

    if args.data_file_name.endswith(".mat"):
        data_dict = scipy.io.loadmat(DATA_DIRECTORY + args.data_file_name)
        data_set_key = args.data_file_name.replace('.mat', '')
        data_set = (data_dict[data_set_key]).astype(float)
    else:
        example_set = parse_c45(args.data_file_name, DATA_DIRECTORY)
        data_set = np.array(example_set.to_float())
    return normalize(data_set), args.c
Example #8
File: bag.py  Project: tmaidment/EECS440
def bag(datapath, validationType, algo, iterations):
    # TODO: Do cross-val
    path = datapath
    if (os.path.isdir(path)):
        file_base = next(el for el in reversed(path.split('/')) if el)
        exampleSet = mldata.parse_c45(file_base, path)
        schema = exampleSet.schema
    bag = Bag(exampleSet, 1, algo, 10)
    predictions = bag.predict(bag.data)
    good = 0
    if algo == 'dtree':
        for i in range(len(predictions)):
            label = np.asarray(bag.data.to_float())[:, -1]
            if predictions[i, 1] == label[i]:
                good += 1
    else:
        for i in range(len(predictions)):
            if predictions[i, 1] == bag.data.iloc[i, len(bag.data.iloc[0, :]) - 1]:
                good += 1
    print(good / len(predictions))
Example #9
def main():
    #Error value processing
    if (ENABLE_VAL != 0 and ENABLE_VAL != 1):
        raise ValueError("ENABLE_VAL should be 0 or 1")
    if (ENABLE_GAIN != 0 and ENABLE_GAIN != 1):
        raise ValueError("ENABLE_GAIN should be 0 or 1")
    if (MAX_DEPTH < 0):
        raise ValueError("MAX_DEPTH should be nonnegative")
    elif (type(MAX_DEPTH) != int):
        raise TypeError("MAX_DEPTH should be an integer")

    #Read data
    path_name = DATA_PATH.rpartition('/')
    path = path_name[0]
    name = path_name[2]
    full_dataset = mldata.parse_c45(name, path)

    #Build tree and output all results
    if (ENABLE_VAL == 1):
        tree = build_tree.build_DecisionTree(MAX_DEPTH, EPS, full_dataset,
                                             ENABLE_GAIN)
        size = tree.get_tree_size()
        max_depth = tree.get_tree_depth()
        first_feature_index = tree.get_root().get_attriIndex()
        first_feature = full_dataset.schema.features[first_feature_index].name
        acc = tree.classify_dataset(full_dataset)
        print(
            'Accuracy: %.4f\n\nSize: %d\n\nMaximum Depth: %d\n\nFirst Feature: %s'
            % (acc, size, max_depth, first_feature))
    elif (ENABLE_VAL == 0):
        datasets = fold_5_cv(full_dataset)
        trees, sizes, first_features, accs, max_depths = build_trees(datasets)
        acc_sum = 0
        for i in range(5):
            acc_sum += accs[i]
        acc = acc_sum / 5
        print('\nAverage Accuracy: %.4f' % acc)
Example #10
            if (np.dot(self.w, val[index, 1:-1]) + self.b) > 0:
                pred[index][1] = 1
            else:
                pred[index][1] = 0
        #print(pred)
        return pred.astype(int)

    def ensemblePrediction(self, data):
        prediction = self.predict(data)
        return prediction[:, 1]


if __name__ == '__main__':
    path = '../voting'
    data = utils._convert_exampleset_to_dataframe(
        mldata.parse_c45(path.split('/')[-1], path))
    logreg = LogisticRegression(data, constant=0, weights=None)
    print('Final Weights', logreg.w)
    '''
    parser = argparse.ArgumentParser(description='Logistic Regression Implementation')
    parser.add_argument('options', nargs=3, help="The options as specified by the prompt.")
    args = parser.parse_args()

    path = str(args.options[0])
    if(os.path.isdir(path)):
        file_base = next(el for el in reversed(path.split('/')) if el)
        exampleSet = mldata.parse_c45(file_base, path)
        schema = exampleSet.schema
        print("Loading dataset:", file_base)
    else:
        raise FileNotFoundError('Dataset input not found!')
Example #11
    '''
    path = '../spam'
    x = mldata.parse_c45(path.split('/')[-1], path)
    dtree(x, validationType=0, depth=5, splitCriterion=1)
    '''

    parser = argparse.ArgumentParser(
        description='ID3 Decision Tree Implementation')
    parser.add_argument('options',
                        nargs=4,
                        help="The options as specified by the prompt.")
    args = parser.parse_args()

    path = str(args.options[0])
    if (os.path.isdir(path)):
        exampleSet = mldata.parse_c45(
            next(el for el in reversed(path.split('/')) if el), path)
        print("Loading dataset:",
              next(el for el in reversed(path.split('/')) if el))
    else:
        raise FileNotFoundError('Dataset input not found!')

    xval = int(args.options[1])
    if (xval == 0):
        print("Cross Validation enabled")
    elif (xval == 1):
        print("Cross Validation disabled")
    else:
        raise ValueError('Unable to determine cross validation flag.')

    maxdepth = int(args.options[2])
    if (maxdepth > 0):
Example #12
	def parse(filepath):
		return parse_c45(filepath)
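Most examples on this page call parse_c45 with two arguments, the dataset's base name and the directory holding its C4.5 files, so a fuller version of the wrapper above might look like this sketch (not from the original project):

def parse(filepath):
    # e.g. filepath = '../voting' becomes parse_c45('voting', '../voting')
    file_base = filepath.rstrip('/').split('/')[-1]
    return parse_c45(file_base, filepath)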
Example #13
                return [[classifier], [1.0]] # perfect classifier, or complete crap

            #correct = np.equal(pred[:,1], truth) # rounded (0, 1) predictions

            classifier_weights.append((1/2) * np.log((1-error)/error))

            truth_scale = (truth * 2) - 1
            pred_scale = (pred * 2) - 1

            #update the weights
            next_weight = data_weights[-1] * np.exp(classifier_weights[-1] * np.multiply(truth_scale, pred_scale))
            next_weight /= np.sum(next_weight)
            data_weights.append(next_weight)
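            # Comment added for clarity (not from the original author): alpha is
            # 0.5 * ln((1 - error) / error) and each example weight is scaled by
            # exp(alpha * y * h(x)) with y, h(x) mapped to {-1, +1}, then
            # renormalized; textbook AdaBoost uses exp(-alpha * y * h(x)).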

            #print(len(classifiers), len(classifier_weights), len(data_weights[:-1]))

        #weights = data_weights[:-1]
        return classifiers, classifier_weights #remove the last weight

    def squared_error(self, weight, pred, truth):
        #error = 0
        error = np.sum(np.multiply(weight, np.power(np.subtract(pred, truth), 2)), axis=0) # need to figure out axis
        #for idx, entry in enumerate(pred):
        #    error += (pred[idx] - truth[idx])^2
        return error
        
if __name__ == '__main__':
    path = '../voting'
    data = utils._convert_exampleset_to_dataframe(mldata.parse_c45(path.split('/')[-1], path))
    booster = boosting(path, data, 'logreg', 2)
    out = booster.predict(data)
Example #14
    for i in range(len(attrs)):
        for j in range(len(attrs)):
            if (i != j and attrs[i][0] == attrs[j][0]):
                split1 = attrs[i][1]
                split2 = attrs[j][1]
                attrs.remove(attrs[j])
                attrs[i][1] = [
                    next(elem for elem in split1 if elem is not None),
                    next(elem for elem in split2 if elem is not None)
                ]
                attrs[i][1].sort()
                return attrs
    return attrs


if __name__ == '__main__':
    # This code is for testing purposes.
    x = mldata.parse_c45('voting', '../voting')
    #e = ns.EntropySelector(x)
    #dtree = build_tree(x, e, 0)
    #print(x[0].to_float())
    #print(dtree.eval(x[0]).attr_float)
    #print(x[3])
    #print(dtree.eval(x[3]).attr_float)
    #print(_combine_terms([[1,4.5],[3,(None,1234)], [6, "AY"], [3,(54,None)], [4, 4.0]]))
    #print(e.get_split_attr({2:4.0}, 0))
    #attr_idx = -1
    #attr_idx, attr_float = e.get_split_attr({2:4.0}, 0)
    #print(attr_idx, attr_float)
    #node = _init_node(x, None, attr_idx, attr_float, attr_idx)
Example #15
        pos_mu, neg_mu = summary['pos_mean'], summary['neg_mean']
        pos_sig2, neg_sig2 = summary['pos_variance'], summary['neg_variance']
        prob_pos = 1 / (2 * pi * pos_sig2)**0.5 * exp(-0.5 * (feature_value - pos_mu)**2 / pos_sig2)
        prob_neg = 1 / (2 * pi * neg_sig2)**0.5 * exp(-0.5 * (feature_value - neg_mu)**2 / neg_sig2)
        return prob_pos, prob_neg

    def get_smoothing_estimate(self, number_of_values):
        """
        Returns a Laplace smoothing estimate if m_estimate is negative
        :param number_of_values:
        :return:
        """
        if self.m_estimate < 0:
            return number_of_values
        else:
            return self.m_estimate

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="A Naive-Bayes Classifier Implementation.")
    parser.add_argument('data_file_name')
    parser.add_argument('m_estimate', type=float)
    args = parser.parse_args()

    example_set = parse_c45(args.data_file_name, DATA_DIRECTORY)
    data_set = np.array(example_set.to_float())
    for feature in example_set.schema[1:-1]:
        if feature.type == 'NOMINAL':
            feature.values = tuple([feature.to_float(value) for value in feature.values])
    normalize(data_set, example_set.schema)
    results = NaiveBayes.solve(data_set, example_set.schema[1:-1], args.m_estimate)
    print_performance(results)
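For context, the m-estimate that get_smoothing_estimate feeds is conventionally computed as below; this is a sketch of the standard formula with a uniform prior, not code from this project:

def m_estimate_probability(count, total, m, number_of_values):
    # P(value | class) = (count + m * prior) / (total + m), prior = 1 / number_of_values;
    # choosing m = number_of_values recovers Laplace (add-one) smoothing.
    prior = 1.0 / number_of_values
    return (count + m * prior) / (total + m)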
Example #16
def main():
    #Error value processing
    if (ENABLE_VAL != 0 and ENABLE_VAL != 1):
        raise ValueError("ENABLE_VAL should be 0 or 1")
    if (NUM_BINS < 2):
        raise ValueError("NUM_BINS should be greater that 2")
    elif (type(NUM_BINS) != int):
        raise TypeError("NUM_BINS should be an integer")

    #Read data
    path_name = DATA_PATH.rpartition('/')
    path = path_name[0]
    name = path_name[2]
    full_dataset = mldata.parse_c45(name, path)

    # Calculate the min and max values of each attribute to decide the boundaries of the k bins
    min_and_max = []
    np_full_dataset = np.array(full_dataset)
    attr_length = len(full_dataset.schema) - 2
    min_and_max = np.zeros((attr_length, 2))
    for i in range(1, attr_length + 1):
        if (full_dataset.schema[i].type == "CONTINUOUS"):
            row = np_full_dataset[:, i].astype(float)
            max = np.amax(row)
            min = np.amin(row)
            min_and_max[i - 1][0] = min
            min_and_max[i - 1][1] = max
    min_and_max = np.transpose(min_and_max)
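    # Note (added for clarity; the exact binning scheme is an assumption): with
    # equal-width bins, continuous attribute j would be cut at the edges
    # np.linspace(min_and_max[0][j], min_and_max[1][j], NUM_BINS + 1).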

    #Build models
    if (ENABLE_VAL == 1):
        label_ratio, save_all_prob, save_all_threshold = Naive_Bayes.showme_dataset(
            full_dataset, NUM_BINS, M, min_and_max)
        accuracy, precision, recall = compute_test_results(
            label_ratio, save_all_prob, full_dataset)
        ROC_area = compute_ROC_area()
        print(
            "Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nArea under ROC: %.3f\n"
            % (accuracy, precision, recall, ROC_area))
    elif (ENABLE_VAL == 0):
        datasets = fold_5_cv(full_dataset)
        accuracies, precisions, recalls = naive_bayes_cv(datasets, min_and_max)
        avg_accuracy = 0
        avg_precision = 0
        avg_recall = 0
        std_accuracy = 0
        std_precision = 0
        std_recall = 0
        for i in range(5):
            avg_accuracy += accuracies[i]
            avg_precision += precisions[i]
            avg_recall += recalls[i]
        avg_accuracy = avg_accuracy / 5
        avg_precision = avg_precision / 5
        avg_recall = avg_recall / 5
        for i in range(5):
            std_accuracy += (accuracies[i] - avg_accuracy)**2
            std_precision += (precisions[i] - avg_precision)**2
            std_recall += (recalls[i] - avg_recall)**2
        std_accuracy = (std_accuracy / 5)**0.5
        std_precision = (std_precision / 5)**0.5
        std_recall = (std_recall / 5)**0.5
        ROC_area = compute_ROC_area()

        print(
            "Accuracy: %.3f %.3f\nPrecision: %.3f %.3f\nRecall: %.3f %.3f\nArea under ROC: %.3f\n"
            % (avg_accuracy, std_accuracy, avg_precision, std_precision,
               avg_recall, std_recall, ROC_area))
Example #17
def calcAve(ar):
    total = 0
    for i in range(ar.shape[0]):
        total = total + ar[i]
    return total / ar.shape[0]


# In[ ]:

path = input('Enter the path to the data:')
cv = int(input('Cross Validation? 0 for cv, 1 for full sample'))
numbin = int(input('Enter the number of bins for any continuous feature:'))
mvalue = int(input('Enter the value of m for the m-estimate:'))
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
data = np.array(parse_c45(path).to_float())
acc = np.array([])
prec = np.array([])
recall = np.array([])
roc = np.array([])
if cv == 1:
    Bayes = naiveBayes(mvalue, numbin, data)
    pred = Bayes[0]
    confi = Bayes[1]
    acc = np.append(acc, calcAccPreRec(pred, data)[0])
    prec = np.append(prec, calcAccPreRec(pred, data)[1])
    recall = np.append(recall, calcAccPreRec(pred, data)[2])
    roc = np.append(roc, rocArea(confi, data[:, -1]))
else:
    cvdata = stratCrossValid(data)
    for i in range(5):
Example #18
def read_data(path):
    pathArray = path.split('\\')
    fileName = pathArray[len(pathArray) - 1]
    return mldata.parse_c45(fileName, path)
Example #19
def boost(path, option, solver_type, num_iters): 
    path = path.replace("\\", "/")
    file_base = path.split('/')[-1]
    rootdir=path

    epsilon_thread = 0.00000001

    data = mldata.parse_c45(file_base, rootdir)
    n_bin = 1
    cross_validation = False
    if option == 0:
        n_bin = 5
        cross_validation = True

    data = np.asarray(data.to_float())
    X_data = data[:, 1:-1]
    X_data = preprocess.process(X_data, file_base, n_bin)
    y_data = data[:, -1].astype(int)
    # print(len(X_data))
    # partition the data into multiple folds
    folds = util.n_fold(len(data), n_bin)

    # nbayes:
    posi_num = [{} for i in range(len(X_data[0]))]
    nega_num = [{} for i in range(len(X_data[0]))]
            
    for i, d in enumerate(posi_num):
        for attr in np.unique(X_data[:, i]):
            posi_num[i][attr] = 0
                    
    for i, d in enumerate(nega_num):
        for attr in np.unique(X_data[:, i]):
            nega_num[i][attr] = 0

    AUC_y = []
    pred_AUC_y = []
    acc = []
    prec = []
    rec = []
    # training and evaluating
                
    for i in range(n_bin):
        if solver_type == "dtree":
            tree = ID3DecisionTree(1, path, "gain", cross_validation)
            x_train, y_train, x_test, y_test = tree.create_for_train(n_bin, i)
            train_size = len(x_train)
            wboost = np.ones((train_size, 1)).astype(float)/train_size
            alphas = []
            epsilons = []
            forest = []
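            # Descriptive comment (not in the original source): each boosting
            # round below fits an ID3 tree (max-depth argument 1) on the current
            # example weights wboost, records its vote alpha, and stops early
            # when the weighted error epsilon is 0, falls below epsilon_thread,
            # or reaches 0.5.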
            for iter_ in range(num_iters): 
                tree = ID3DecisionTree(1, path, "gain", cross_validation)
                x_train, y_train, x_test, y_test = tree.create_for_train(n_bin, i)
                D_train = (x_train, y_train)
                wboost, epsilon, alpha = tree.boosttrain(D_train, wboost, epsilon_thread)
                forest.append(tree)
                epsilons.append(epsilon)
                if epsilon == 0:
                    alphas = [0] * len(alphas)
                    alphas.append(1)
                    break
                elif epsilon <= epsilon_thread or epsilon >= 0.5:
                    alphas.append(alpha)
                    break
                else:
                    alphas.append(alpha)
                # y_pred = tree.test(x_test)
            result = []
            for j in range(len(forest)):
                y_predB = forest[j].test(x_test)
                y_pred = np.array(y_predB)
                y_pred[y_pred==False] = 0
                y_pred[y_pred==True] = 1
                result.append(y_pred)
            alphas = np.array(alphas)
            alphas = alphas/np.sum(alphas)
            y_pred = alphas.dot(np.array(result))
            y_pred[y_pred<0.5] = 0
            y_pred[y_pred>=0.5] = 1
            y_test = np.array(y_test)
            y_test[y_test<0.5] = 0
            y_test[y_test>=0.5] = 1
            AUC_y.extend(y_test)
            pred_AUC_y.extend(y_pred)
            _acc, _prec, _rec = util.cal_APR(y_pred, y_test)
            if cross_validation:
                util.report_cross(_acc, _prec, _rec)
            acc.append(_acc)
            prec.append(_prec)
            rec.append(_rec)
        elif solver_type == "nbayes":
            m_estimate = 0.1
            x_train, y_train, x_test, y_test = create_for_train(X_data, y_data, folds, n_bin, i)
            train_size = len(x_train)
            wboost = np.ones((train_size, 1)).astype(float)/train_size
            alphas = []
            epsilons = []
            pre_ps = []
            posi_ps = []
            nega_ps = []
            for iter_ in range(num_iters): 
                pre_p, posi_p, nega_p, epsilon, alpha, wboost = nbayes.boosttrain_bayes(x_train, y_train, m_estimate, posi_num, nega_num, wboost, epsilon_thread)
                epsilons.append(epsilon)
                pre_ps.append(pre_p)
                posi_ps.append(posi_p)
                nega_ps.append(nega_p)
                if epsilon == 0:
                    alphas = [0] * len(alphas)
                    alphas.append(1)
                    break
                elif epsilon <= epsilon_thread or epsilon >= 0.5:
                    alphas.append(alpha)
                    break
                else:
                    alphas.append(alpha)
            result = []
            for i in range(len(pre_ps)):
                y_predB = nbayes.pred(x_test, pre_ps[i], posi_ps[i], nega_ps[i])
                y_pred = []
                for p in y_predB:
                    if p[0] > p[1]:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                y_pred = np.array(y_pred)
                result.append(y_pred)
            alphas = np.array(alphas)
            alphas = alphas/np.sum(alphas)
            y_pred = alphas.dot(np.array(result))
            y_pred[y_pred<0.5] = 0
            y_pred[y_pred>=0.5] = 1
            AUC_y.extend(y_test)
            pred_AUC_y.extend(y_pred)
            _acc, _prec, _rec = util.cal_APR(y_pred, y_test)
            if cross_validation:
                util.report_cross(_acc, _prec, _rec)
            acc.append(_acc)
            prec.append(_prec)
            rec.append(_rec)
        elif solver_type == "logreg":
            # train
            x_train, y_train, x_test, y_test = create_for_train(X_data, y_data, folds, n_bin, i)
            train_size = len(x_train)
            wboost = np.ones((train_size, 1)).astype(float)/train_size
            alphas = []
            epsilons = []
            weights = []
            for iter_ in range(num_iters): 
                weight, epsilon, alpha, wboost = logreg.boostLR(x_train, y_train, wboost, epsilon_thread, max_iters=500, lbd=0.1)
                epsilons.append(epsilon)
                weights.append(weight)
                if epsilon == 0:
                    alphas = [0] * len(alphas)
                    alphas.append(1)
                    break
                elif epsilon <= epsilon_thread or epsilon >= 0.5:
                    alphas.append(alpha)
                    break
                else:
                    alphas.append(alpha)
            result = []
            for i in range(len(weights)):
                result.append(logreg.pred(x_test, weights[i]))
            alphas = np.array(alphas)
            alphas = alphas/np.sum(alphas)
            y_pred = alphas.dot(np.array(result))
            y_pred[y_pred<0.5] = 0
            y_pred[y_pred>=0.5] = 1
            AUC_y.extend(y_test)
            pred_AUC_y.extend(y_pred)
            _acc, _prec, _rec = logreg.cal_LR_APR(y_pred, y_test)
            if cross_validation:
                util.report_cross(_acc, _prec, _rec)
            acc.append(_acc)
            prec.append(_prec)
            rec.append(_rec)
        else:
            return 
    roc_score = logreg.cal_AUC(AUC_y, pred_AUC_y)
    util.report(acc, prec, rec, roc_score)
Example #20
    def _is_binary(self, type):
        pattern = re.compile("BINARY*")
        return bool(pattern.match(type))

    def _is_class(self, type):
        pattern = re.compile("CLASS*")
        return bool(pattern.match(type))


class TypeError(Exception):
    pass



class FunctionNotSupported(Exception):
    pass


if __name__ == '__main__':
    # TESTING --- Delete for finished product
    import sys
    sys.path.append("..")
    import mldata
    x = mldata.parse_c45("spam", "../spam")
    e = EntropySelector(x)
    print(e.get_split_attr({6: 0.0}, 0))
Example #21
def main():
    #Error value processing
    if (ENABLE_VAL != 0 and ENABLE_VAL != 1):
        raise ValueError("ENABLE_VAL must be 0 or 1")
    if (ALGORITHM != 1 and ALGORITHM != 2 and ALGORITHM != 3):
        raise ValueError("ALGORITHM must be 1 or 2 or 3")
    if (ITER <= 0):
        raise ValueError("ITER must be positive")
    elif (type(ITER) != int):
        raise ValueError("ITER must be an integer")

    #Read data
    path_name = DATA_PATH.rpartition('/')
    path = path_name[0]
    name = path_name[2]
    full_dataset = mldata.parse_c45(name, path)

    #Build models
    if (ENABLE_VAL == 1):
        if (ALGORITHM == 1):
            weight = 1 / len(full_dataset) * np.ones(len(full_dataset))
            weight = weight.reshape(-1, 1)
            alpha_list, label_list = build_tree_boosting.boosting(
                MAX_DEPTH, EPS, full_dataset, full_dataset, ENABLE_GAIN, ITER,
                weight)
            f_list = compute_f_list(alpha_list, label_list)
            accuracy, precision, recall = compute_test_results(
                full_dataset, f_list)
            ROC_area = compute_ROC_area()
            print(
                "Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nArea under ROC: %.3f\n"
                % (accuracy, precision, recall, ROC_area))
        elif (ALGORITHM == 2):
            alpha_list, label_list = naive_gayes.naive_bayes(
                full_dataset, full_dataset, ITER, NUM_BINS, M)
            f_list = compute_f_list(alpha_list, label_list)
            accuracy, precision, recall = compute_test_results(
                full_dataset, f_list)
            ROC_area = compute_ROC_area()
            print(
                "Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nArea under ROC: %.3f\n"
                % (accuracy, precision, recall, ROC_area))
        elif (ALGORITHM == 3):
            lg = logreg.Logistic_Regression(lambdaa=LAMBDA,
                                            training_data=full_dataset,
                                            iteration=1,
                                            learning_rate=LR,
                                            boosting=True)
            lg, alpha_list, label_list = update_lg(lg, full_dataset)
            f_list = compute_f_list(alpha_list, label_list)
            accuracy, precision, recall = compute_test_results(
                full_dataset, f_list)
            ROC_area = compute_ROC_area()
            print(
                "Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nArea under ROC: %.3f\n"
                % (accuracy, precision, recall, ROC_area))
    elif (ENABLE_VAL == 0):
        datasets = fold_5_cv(full_dataset)
        accuracies, precisions, recalls = cv(datasets)
        avg_accuracy = 0
        avg_precision = 0
        avg_recall = 0
        std_accuracy = 0
        std_precision = 0
        std_recall = 0
        for i in range(5):
            avg_accuracy += accuracies[i]
            avg_precision += precisions[i]
            avg_recall += recalls[i]
        avg_accuracy = avg_accuracy / 5
        avg_precision = avg_precision / 5
        avg_recall = avg_recall / 5
        for i in range(5):
            std_accuracy += (accuracies[i] - avg_accuracy)**2
            std_precision += (precisions[i] - avg_precision)**2
            std_recall += (recalls[i] - avg_recall)**2
        std_accuracy = (std_accuracy / 5)**0.5
        std_precision = (std_precision / 5)**0.5
        std_recall = (std_recall / 5)**0.5
        ROC_area = compute_ROC_area()

        print(
            "Accuracy: %.3f %.3f\nPrecision: %.3f %.3f\nRecall: %.3f %.3f\nArea under ROC: %.3f\n"
            % (avg_accuracy, std_accuracy, avg_precision, std_precision,
               avg_recall, std_recall, ROC_area))
Example #22
def load_data():
    path_name = DATA_PATH.rpartition('/')
    path = path_name[0]
    name = path_name[2]
    full_dataset = mldata.parse_c45(name, path)
    return ExampleSet(full_dataset)
Example #23
# In[ ]:


def calcAve(ar):
    total = 0
    for i in range(ar.shape[0]):
        total = total + ar[i]
    return total / ar.shape[0]


# In[ ]:

path = input('Enter the path to the data:')
cv = int(input('Cross Validation? 0 for cv, 1 for full sample'))
lamda = int(input('Enter the value of lambda:'))

print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
dataType = []
data = parse_c45(path)
for i in data.schema:
    dataType.append(i.type)
data = np.array(data.to_float())

for k in range(len(dataType)):
    if (dataType[k] == 'NOMINAL'):
        data[:, k] += 1.0

logReg_Cross(data, lamda, cv)
Example #24
from mldata import parse_c45
from math import log2

print(' ')
dataset = parse_c45("example")
print(dataset.schema[4].type)
print(dataset.schema[4].values)
print(dataset[1])
print(set(dataset[1]))

# def entropy(p):
#     summary = 0
#     # H(X) = -sum(px * log2(px))
#     for px in p:
#         px = px / sum(p)
#         if px != 0:
#             summary += px * log2(px)

#     return (summary * -1)

# def informationGain(data, x=None):
#     # IG = H(y) - H(y|x)
#     summary = 0
#     for i in x:
#         summary += sum(i)/sum(d)*entropy(i)

#     ig = entropy(data) - summary

#     return ig
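A working version of the sketch in the comments above might look like the following (a minimal illustration that takes lists of class counts; it is not part of the original file):

from math import log2

def entropy(counts):
    # H(X) = -sum(p * log2(p)) over the class proportions
    total = sum(counts)
    return -sum((c / total) * log2(c / total) for c in counts if c > 0)

def information_gain(parent_counts, child_counts_list):
    # IG = H(y) - H(y|x) = H(parent) - sum(|child| / |parent| * H(child))
    total = sum(parent_counts)
    remainder = sum(sum(child) / total * entropy(child)
                    for child in child_counts_list)
    return entropy(parent_counts) - remainder

# e.g. information_gain([9, 5], [[6, 2], [3, 3]]) is roughly 0.048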
Example #25
        return prob_pos, prob_neg

    def get_smoothing_estimate(self, number_of_values):
        """
        Returns a Laplace smoothing estimate if m_estimate is negative
        :param number_of_values:
        :return:
        """
        if self.m_estimate < 0:
            return number_of_values
        else:
            return self.m_estimate


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="A Naive-Bayes Classifier Implementation.")
    parser.add_argument('data_file_name')
    parser.add_argument('m_estimate', type=float)
    args = parser.parse_args()

    example_set = parse_c45(args.data_file_name, DATA_DIRECTORY)
    data_set = np.array(example_set.to_float())
    for feature in example_set.schema[1:-1]:
        if feature.type == 'NOMINAL':
            feature.values = tuple(
                [feature.to_float(value) for value in feature.values])
    normalize(data_set, example_set.schema)
    results = NaiveBayes.solve(data_set, example_set.schema[1:-1],
                               args.m_estimate)
    print_performance(results)
Example #26
import os
import sys
import numpy as np
import pandas as pd
from mldata import parse_c45

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

# CONFIGURE HYPERPARAMETERS
np.random.seed(12345)
tf.random.set_seed(12345)

# Get command line argument for data location
argument_list = sys.argv[1:]
path = str(argument_list[0])
filename = os.path.basename(path)
filedir = path.replace(filename, '')
data = parse_c45(filename, filedir)

# Define epsilon value and type of noise
epsilon = float(argument_list[1])
noise_type = argument_list[2]

# Convert c45 data to DataFrame and create folds
unprocessed_df = data_to_dataframe(data)
attr_dict = create_attr_dict(data.schema)
df_whole, _ = process_data(unprocessed_df, attr_dict)
folds = create_folds(df_whole)

# Create a DataFrame to store important metrics
metrics_df = pd.DataFrame(columns=['fold', 'accuracy', 'precision', 'recall'])
metrics = []
Example #27
#!/usr/bin/env python
# coding: utf-8

# In[2]:

import os
import math
import operator
import numpy as np
import random
from mldata import parse_c45

data = np.array(parse_c45('voting').to_float())
data2 = np.array(parse_c45('spam').to_float())
data3 = np.array(parse_c45('volcanoes').to_float())

# In[1]:


def stratCrossValid(
    data
):  # stratified 5-fold-validation for both discrete and continuous cases
    subset0 = []
    subset1 = []
    fold1 = []
    fold2 = []
    fold3 = []
    fold4 = []
    fold5 = []
    for i in range(0, len(data)):
        if 1.0 == data[i, -1]:
Example #28
    data = mldata.parse_c45(path.split('/')[-1], path)
    logreg = logreg(data, validationType=0, constant=0)
    print('Final Weights', logreg.w)
    '''

    parser = argparse.ArgumentParser(
        description='Logistic Regression Implementation')
    parser.add_argument('options',
                        nargs=3,
                        help="The options as specified by the prompt.")
    args = parser.parse_args()

    path = str(args.options[0])
    if (os.path.isdir(path)):
        file_base = next(el for el in reversed(path.split('/')) if el)
        exampleSet = mldata.parse_c45(file_base, path)
        schema = exampleSet.schema
        print("Loading dataset:", file_base)
    else:
        raise FileNotFoundError('Dataset input not found!')

    xval = int(args.options[1])
    if (xval == 0):
        print("Cross Validation enabled")
    elif (xval == 1):
        print("Cross Validation disabled")
    else:
        raise ValueError('Unable to determine cross validation flag.')

    constant = int(args.options[2])
    if (constant >= 0):