def main(dataset, data_path, use_cv, max_depth, use_info_gain: int, weights=None):
    # Partition counts are only relevant when running the experiment.
    partition_count = [3, 5, 7, 10] if use_info_gain < 0 else [1]
    data = mldata.parse_c45(dataset, data_path)
    # data = mldata.ExampleSet([e for i, e in enumerate(data) if i < 1000])
    if use_info_gain >= 1:
        split_criteria = metrics.info_gain
    elif use_info_gain == 0:
        split_criteria = metrics.gain_ratio
    else:
        split_criteria = metrics.stochastic_information_gain
    for z in partition_count:
        # Run the experiment.
        if len(partition_count) > 1:
            print(f'\nrunning experiment with {z} partitions')
        learner = algorithm.ID3(
            max_depth=max_depth,
            split_function=split_criteria,
            partitions=z,
            boost_weights=weights)
        if weights is not None:
            return learner, data
        else:
            run(use_cv, data, learner)
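# A hedged usage example (not from the original file): invoking main above on a
# C4.5 dataset directory with information gain as the split criterion. The
# dataset name and path are illustrative.
if __name__ == '__main__':
    main('voting', '../voting', use_cv=True, max_depth=0, use_info_gain=1)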
def __init__(self, max_depth: int, path: str, criterion: str, cv: bool = False):
    super().__init__()
    self.max_depth = max_depth
    self.criterion = criterion
    self.cv = cv
    # Load data: the last path component is the C4.5 file base name.
    file_base = path.split("/")[-1]
    data = parse_c45(file_base, path)
    self.A = []
    self.X = []
    self.classes = []
    for index, column in enumerate(data.schema):
        if index == 0:  # skip the example ID column
            continue
        if column.type == "CLASS":
            class_idx = index
        else:
            self.A.append((column.name, column.type))
    # Split each example into its features and its class label.
    for sample in data:
        self.X.append(sample[1:class_idx] + sample[class_idx + 1:])
        self.classes.append(sample[class_idx])
    self.D = (self.X, self.classes)
    self.attr2idx = {attr[0]: index for index, attr in enumerate(self.A)}
    self.root = None
def main(): """ run the decision tree with param given by user ---------- """ file_path, use_full_sample, max_depth, use_gain_ratio = sys.argv[1:5] # parse args [use_full_sample, max_depth, use_gain_ratio ] = [int(use_full_sample), int(max_depth), int(use_gain_ratio)] # parse dataset raw_parsed = mldata.parse_c45(file_path.split(os.sep)[-1], file_path) examples = np.array(raw_parsed, dtype=object) samples = examples[:, 1:-1] targets = examples[:, -1] # grow a huge tree (gurantees to cover a full tree) if input specifies 0 in max_depth if max_depth == 0: max_depth = int(1e9) # run on full sample if use_full_sample: dt = ID3(max_depth, use_gain_ratio) dt.fit(samples, targets) else: dt = ID3(max_depth, use_gain_ratio) print("Accuracy: ", str(k_fold_cv(dt, examples, K))) print("Size: ", str(dt.size)) print("Maximum Depth: ", str(dt.max_depth)) print("First Feature: ", str(raw_parsed.examples[0].schema.features[dt.attr_idx + 1].name))
def get_dataset(file_path):
    """Parse the dataset stored at the given file path.

    Parameters
    ----------
    file_path : str
        The path to the dataset.
    """
    raw_parsed = mldata.parse_c45(file_path.split(os.sep)[-1], file_path)
    return np.array(raw_parsed, dtype=object)
def read_data(path, n_bin=3):
    prob_name = path.split('/')[-1]
    # Old plain-text loading path, superseded by parse_c45:
    # datafile = path + '/' + prob_name + '.data'
    # data = np.loadtxt(datafile, delimiter=',', dtype=str)
    data = parse_c45(prob_name, path)
    data = np.asarray(data.to_float())
    X = data[:, 1:-1]
    X = process(X, prob_name, n_bin)
    y = data[:, -1].astype(int)
    return X, y
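# `process` is defined elsewhere. A minimal sketch of an equal-width
# discretizer, assuming every feature column should be mapped to an integer bin
# index; the real preprocessing may treat nominal features differently per
# problem name.
import numpy as np

def process(X, prob_name, n_bin):
    """Equal-width binning of each column of X into n_bin bins."""
    X_binned = np.empty(X.shape)
    for j in range(X.shape[1]):
        col = X[:, j].astype(float)
        edges = np.linspace(col.min(), col.max(), n_bin + 1)
        # Digitizing against the interior edges yields indices 0 .. n_bin - 1.
        X_binned[:, j] = np.digitize(col, edges[1:-1])
    return X_binned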
def main(problem_name, max_depth=0):
    example_set = md.parse_c45(problem_name, '../data')
    random.seed(12345)
    random.shuffle(example_set)
    # 80/20 train/validation split (integer division keeps indices valid).
    split = 4 * len(example_set) // 5
    training_set = example_set[:split]
    validation_set = example_set[split:]
    feature_indices = [i for i in range(1, len(example_set.schema.features[1:-1]))]
    dtree = DecisionTree(training_set, example_set.schema, feature_indices, max_depth=max_depth)
    accuracy = dtree.get_accuracy(validation_set)
    print("Accuracy: {}".format(accuracy))
    tree_size, tree_depth = dtree.get_size_and_depth()
    print("Size: {}".format(tree_size))
    print("Maximum Depth: {}".format(tree_depth))
def get_svm_inputs():
    parser = argparse.ArgumentParser(description="SVM Classifier")
    parser.add_argument('data_file_name')
    parser.add_argument('c', type=float)
    args = parser.parse_args()
    if args.data_file_name.endswith(".mat"):
        data_dict = scipy.io.loadmat(DATA_DIRECTORY + args.data_file_name)
        data_set_key = args.data_file_name.replace('.mat', '')
        data_set = (data_dict[data_set_key]).astype(float)
    else:
        example_set = parse_c45(args.data_file_name, DATA_DIRECTORY)
        data_set = np.array(example_set.to_float())
    return normalize(data_set), args.c
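# `normalize` is project code that isn't shown here. A minimal sketch, assuming
# features should be standardized to zero mean and unit variance while the ID
# (first) and label (last) columns are left untouched:
import numpy as np

def normalize(data_set):
    """Standardize the feature columns of a (n_examples, n_columns) array."""
    features = data_set[:, 1:-1].astype(float)
    std = features.std(axis=0)
    std[std == 0] = 1.0  # leave constant columns alone instead of dividing by 0
    data_set[:, 1:-1] = (features - features.mean(axis=0)) / std
    return data_set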
def bag(datapath, validationType, algo, iterations):
    # TODO: do cross-validation.
    path = datapath
    if os.path.isdir(path):
        file_base = next(el for el in reversed(path.split('/')) if el)
        exampleSet = mldata.parse_c45(file_base, path)
        schema = exampleSet.schema
    bag = Bag(exampleSet, 1, algo, 10)
    predictions = bag.predict(bag.data)
    good = 0
    if algo == 'dtree':
        label = np.asarray(bag.data.to_float())[:, -1]
        for i in range(len(predictions)):
            if predictions[i, 1] == label[i]:
                good += 1
    else:
        for i in range(len(predictions)):
            if predictions[i, 1] == bag.data.iloc[i, len(bag.data.iloc[0, :]) - 1]:
                good += 1
    print(good / len(predictions))
def main():
    # Validate configuration values.
    if ENABLE_VAL != 0 and ENABLE_VAL != 1:
        raise ValueError("ENABLE_VAL should be 0 or 1")
    if ENABLE_GAIN != 0 and ENABLE_GAIN != 1:
        raise ValueError("ENABLE_GAIN should be 0 or 1")
    if MAX_DEPTH < 0:
        raise ValueError("MAX_DEPTH should be nonnegative")
    elif type(MAX_DEPTH) != int:
        raise TypeError("MAX_DEPTH should be an integer")
    # Read data.
    path_name = DATA_PATH.rpartition('/')
    path = path_name[0]
    name = path_name[2]
    full_dataset = mldata.parse_c45(name, path)
    # Build the tree and output all results.
    if ENABLE_VAL == 1:
        tree = build_tree.build_DecisionTree(MAX_DEPTH, EPS, full_dataset, ENABLE_GAIN)
        size = tree.get_tree_size()
        max_depth = tree.get_tree_depth()
        first_feature_index = tree.get_root().get_attriIndex()
        first_feature = full_dataset.schema.features[first_feature_index].name
        acc = tree.classify_dataset(full_dataset)
        print('Accuracy: %.4f\n\nSize: %d\n\nMaximum Depth: %d\n\nFirst Feature: %s'
              % (acc, size, max_depth, first_feature))
    elif ENABLE_VAL == 0:
        datasets = fold_5_cv(full_dataset)
        trees, sizes, first_features, accs, max_depths = build_trees(datasets)
        acc = sum(accs) / 5
        print('\nAverage Accuracy: %.4f' % acc)
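# `fold_5_cv` is defined elsewhere. A minimal sketch of a stratified 5-fold
# split, assuming an example's label is its last element; the real helper may
# shuffle first or return (train, test) pairs rather than raw folds.
def fold_5_cv(dataset):
    """Deal positive and negative examples round-robin into 5 folds."""
    folds = [[] for _ in range(5)]
    positives = [ex for ex in dataset if ex[-1]]
    negatives = [ex for ex in dataset if not ex[-1]]
    for group in (positives, negatives):
        for i, ex in enumerate(group):
            folds[i % 5].append(ex)
    return folds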
            if (np.dot(self.w, val[index, 1:-1]) + self.b) > 0:
                pred[index][1] = 1
            else:
                pred[index][1] = 0
        # print(pred)
        return pred.astype(int)

    def ensemblePrediction(self, data):
        prediction = self.predict(data)
        return prediction[:, 1]


if __name__ == '__main__':
    path = '../voting'
    data = utils._convert_exampleset_to_dataframe(
        mldata.parse_c45(path.split('/')[-1], path))
    logreg = LogisticRegression(data, constant=0, weights=None)
    print('Final Weights', logreg.w)
    '''
    parser = argparse.ArgumentParser(description='Logistic Regression Implementation')
    parser.add_argument('options', nargs=3, help="The options as specified by the prompt.")
    args = parser.parse_args()
    path = str(args.options[0])
    if os.path.isdir(path):
        file_base = next(el for el in reversed(path.split('/')) if el)
        exampleSet = mldata.parse_c45(file_base, path)
        schema = exampleSet.schema
        print("Loading dataset:", file_base)
    else:
        assert 'Dataset input not found!'
'''
path = '../spam'
x = mldata.parse_c45(path.split('/')[-1], path)
dtree(x, validationType=0, depth=5, splitCriterion=1)
'''
parser = argparse.ArgumentParser(
    description='ID3 Decision Tree Implementation')
parser.add_argument('options', nargs=4,
                    help="The options as specified by the prompt.")
args = parser.parse_args()
path = str(args.options[0])
if os.path.isdir(path):
    file_base = next(el for el in reversed(path.split('/')) if el)
    exampleSet = mldata.parse_c45(file_base, path)
    print("Loading dataset:", file_base)
else:
    # `assert 'Dataset input not found!'` is always true; fail loudly instead.
    raise FileNotFoundError('Dataset input not found!')
xval = int(args.options[1])
if xval == 0:
    print("Cross Validation enabled")
elif xval == 1:
    print("Cross Validation disabled")
else:
    raise ValueError('Unable to determine cross validation flag.')
maxdepth = int(args.options[2])
if maxdepth > 0:
def parse(filepath):
    # The original `return exset = parse_c45(filepath)` was a syntax error;
    # return the parsed example set directly.
    return parse_c45(filepath)
            return [[classifier], [1.0]]  # perfect classifier, or complete crap
        # correct = np.equal(pred[:, 1], truth)  # rounded (0, 1) predictions
        classifier_weights.append((1 / 2) * np.log((1 - error) / error))
        # Map {0, 1} labels and predictions to {-1, +1} for the weight update.
        truth_scale = (truth * 2) - 1
        pred_scale = (pred * 2) - 1
        # Update the example weights: standard AdaBoost multiplies by
        # exp(-alpha * y * h(x)) (the original positive exponent upweighted
        # correctly classified examples), then renormalizes.
        next_weight = data_weights[-1] * np.exp(
            -classifier_weights[-1] * np.multiply(truth_scale, pred_scale))
        next_weight /= np.sum(next_weight)
        data_weights.append(next_weight)
        return classifiers, classifier_weights  # the final weight vector is unused

    def squared_error(self, weight, pred, truth):
        """Weighted squared error between predictions and truth."""
        return np.sum(np.multiply(weight, np.power(np.subtract(pred, truth), 2)), axis=0)


if __name__ == '__main__':
    path = '../voting'
    data = utils._convert_exampleset_to_dataframe(
        mldata.parse_c45(path.split('/')[-1], path))
    booster = boosting(path, data, 'logreg', 2)
    out = booster.predict(data)
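# A toy check (not from the original code) of the AdaBoost update above: with
# error = 0.25, alpha = 0.5 * ln(3), a misclassified example's weight grows by
# e^alpha while a correct one shrinks by e^-alpha, and renormalizing keeps the
# weights a distribution.
import numpy as np

truth = np.array([1, 1, 0, 0])     # {0, 1} labels
pred = np.array([1, 1, 0, 1])      # last example is misclassified
w = np.full(4, 0.25)
error = np.sum(w[pred != truth])   # weighted error = 0.25
alpha = 0.5 * np.log((1 - error) / error)
y, h = truth * 2 - 1, pred * 2 - 1  # map to {-1, +1}
w = w * np.exp(-alpha * y * h)
w /= w.sum()
print(w)  # the misclassified example now carries half the total weight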
    for i in range(len(attrs)):
        for j in range(len(attrs)):
            if i != j and attrs[i][0] == attrs[j][0]:
                # Merge the two half-open splits on the same attribute into a
                # single sorted pair of boundaries.
                split1 = attrs[i][1]
                split2 = attrs[j][1]
                attrs.remove(attrs[j])
                attrs[i][1] = [
                    next(elem for elem in split1 if elem is not None),
                    next(elem for elem in split2 if elem is not None)
                ]
                attrs[i][1].sort()
                return attrs
    return attrs


if __name__ == '__main__':
    # This code is for testing purposes.
    x = mldata.parse_c45('voting', '../voting')
    # e = ns.EntropySelector(x)
    # dtree = build_tree(x, e, 0)
    # print(x[0].to_float())
    # print(dtree.eval(x[0]).attr_float)
    # print(x[3])
    # print(dtree.eval(x[3]).attr_float)
    # print(_combine_terms([[1, 4.5], [3, (None, 1234)], [6, "AY"], [3, (54, None)], [4, 4.0]]))
    # print(e.get_split_attr({2: 4.0}, 0))
    # attr_idx, attr_float = e.get_split_attr({2: 4.0}, 0)
    # print(attr_idx, attr_float)
    # node = _init_node(x, None, attr_idx, attr_float, attr_idx)
        pos_mu, neg_mu = summary['pos_mean'], summary['neg_mean']
        pos_sig2, neg_sig2 = summary['pos_variance'], summary['neg_variance']
        # Gaussian density of the feature value under each class.
        prob_pos = 1 / (2 * pi * pos_sig2)**0.5 * exp(-0.5 * (feature_value - pos_mu)**2 / pos_sig2)
        prob_neg = 1 / (2 * pi * neg_sig2)**0.5 * exp(-0.5 * (feature_value - neg_mu)**2 / neg_sig2)
        return prob_pos, prob_neg

    def get_smoothing_estimate(self, number_of_values):
        """
        Returns a Laplace smoothing estimate if m_estimate is negative
        :param number_of_values:
        :return:
        """
        if self.m_estimate < 0:
            return number_of_values
        return self.m_estimate


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="A Naive-Bayes Classifier Implementation.")
    parser.add_argument('data_file_name')
    parser.add_argument('m_estimate', type=float)
    args = parser.parse_args()
    example_set = parse_c45(args.data_file_name, DATA_DIRECTORY)
    data_set = np.array(example_set.to_float())
    for feature in example_set.schema[1:-1]:
        if feature.type == 'NOMINAL':
            feature.values = tuple(feature.to_float(value) for value in feature.values)
    normalize(data_set, example_set.schema)
    results = NaiveBayes.solve(data_set, example_set.schema[1:-1], args.m_estimate)
    print_performance(results)
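# A product of many densities like prob_pos/prob_neg above can underflow to
# zero. A common remedy, sketched here as an assumption rather than as what
# this class does, is to sum log-densities instead:
from math import log, pi

def log_gaussian(x, mu, sig2):
    """log N(x; mu, sig2); safe to sum over many features."""
    return -0.5 * (log(2 * pi * sig2) + (x - mu) ** 2 / sig2)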
def main():
    # Validate configuration values.
    if ENABLE_VAL != 0 and ENABLE_VAL != 1:
        raise ValueError("ENABLE_VAL should be 0 or 1")
    if NUM_BINS < 2:
        raise ValueError("NUM_BINS should be at least 2")
    elif type(NUM_BINS) != int:
        raise TypeError("NUM_BINS should be an integer")
    # Read data.
    path_name = DATA_PATH.rpartition('/')
    path = path_name[0]
    name = path_name[2]
    full_dataset = mldata.parse_c45(name, path)
    # Compute the min and max of each continuous attribute to set the k-bin boundaries.
    np_full_dataset = np.array(full_dataset)
    attr_length = len(full_dataset.schema) - 2
    min_and_max = np.zeros((attr_length, 2))
    for i in range(1, attr_length + 1):
        if full_dataset.schema[i].type == "CONTINUOUS":
            row = np_full_dataset[:, i].astype(float)
            min_and_max[i - 1][0] = np.amin(row)
            min_and_max[i - 1][1] = np.amax(row)
    min_and_max = np.transpose(min_and_max)
    # Build models.
    if ENABLE_VAL == 1:
        label_ratio, save_all_prob, save_all_threshold = Naive_Bayes.showme_dataset(
            full_dataset, NUM_BINS, M, min_and_max)
        accuracy, precision, recall = compute_test_results(
            label_ratio, save_all_prob, full_dataset)
        ROC_area = compute_ROC_area()
        print("Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nArea under ROC: %.3f\n"
              % (accuracy, precision, recall, ROC_area))
    elif ENABLE_VAL == 0:
        datasets = fold_5_cv(full_dataset)
        accuracies, precisions, recalls = naive_bayes_cv(datasets, min_and_max)
        avg_accuracy = sum(accuracies) / 5
        avg_precision = sum(precisions) / 5
        avg_recall = sum(recalls) / 5
        std_accuracy = (sum((a - avg_accuracy)**2 for a in accuracies) / 5)**0.5
        std_precision = (sum((p - avg_precision)**2 for p in precisions) / 5)**0.5
        std_recall = (sum((r - avg_recall)**2 for r in recalls) / 5)**0.5
        ROC_area = compute_ROC_area()
        print("Accuracy: %.3f %.3f\nPrecision: %.3f %.3f\nRecall: %.3f %.3f\nArea under ROC: %.3f\n"
              % (avg_accuracy, std_accuracy, avg_precision, std_precision,
                 avg_recall, std_recall, ROC_area))
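# A minimal sketch (an assumption, not the project's Naive_Bayes code) of
# turning the per-attribute (min, max) pairs above into equal-width bin
# boundaries:
import numpy as np

def bin_boundaries(min_and_max, num_bins):
    """Per attribute, the num_bins - 1 interior cut points."""
    lo, hi = min_and_max  # min_and_max is transposed to shape (2, n_attrs)
    return [np.linspace(l, h, num_bins + 1)[1:-1] for l, h in zip(lo, hi)]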
def calcAve(ar):
    total = 0
    for i in range(ar.shape[0]):
        total = total + ar[i]
    return total / ar.shape[0]


# In[ ]:

path = input('Enter the path to the data:')
cv = int(input('Cross Validation? 0 for cv, 1 for full sample'))
numbin = int(input('Enter the number of bins for any continuous feature:'))
mvalue = int(input('Enter the value of m for the m-estimate:'))
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
data = np.array(parse_c45(path).to_float())
acc = np.array([])
prec = np.array([])
recall = np.array([])
roc = np.array([])
if cv == 1:
    Bayes = naiveBayes(mvalue, numbin, data)
    pred = Bayes[0]
    confi = Bayes[1]
    # Compute accuracy/precision/recall once instead of three times.
    results = calcAccPreRec(pred, data)
    acc = np.append(acc, results[0])
    prec = np.append(prec, results[1])
    recall = np.append(recall, results[2])
    roc = np.append(roc, rocArea(confi, data[:, -1]))
else:
    cvdata = stratCrossValid(data)
    for i in range(5):
def read_data(path):
    pathArray = path.split('\\')
    fileName = pathArray[-1]
    return mldata.parse_c45(fileName, path)
def boost(path, option, solver_type, num_iters):
    path = path.replace("\\", "/")
    file_base = path.split('/')[-1]
    rootdir = path
    epsilon_thread = 0.00000001  # threshold below which boosting stops
    data = mldata.parse_c45(file_base, rootdir)
    n_bin = 1
    cross_validation = False
    if option == 0:
        n_bin = 5
        cross_validation = True
    data = np.asarray(data.to_float())
    X_data = data[:, 1:-1]
    X_data = preprocess.process(X_data, file_base, n_bin)
    y_data = data[:, -1].astype(int)
    # Partition the data into n_bin folds.
    folds = util.n_fold(len(data), n_bin)
    # Naive Bayes: per-feature value-count tables for each class.
    posi_num = [{} for _ in range(len(X_data[0]))]
    nega_num = [{} for _ in range(len(X_data[0]))]
    for i in range(len(posi_num)):
        for attr in np.unique(X_data[:, i]):
            posi_num[i][attr] = 0
    for i in range(len(nega_num)):
        for attr in np.unique(X_data[:, i]):
            nega_num[i][attr] = 0
    AUC_y = []
    pred_AUC_y = []
    acc = []
    prec = []
    rec = []
    # Training and evaluating, one pass per fold. (The original reused `i` for
    # the fold index and for inner loops; distinct names avoid the shadowing.)
    for fold in range(n_bin):
        if solver_type == "dtree":
            tree = ID3DecisionTree(1, path, "gain", cross_validation)
            x_train, y_train, x_test, y_test = tree.create_for_train(n_bin, fold)
            train_size = len(x_train)
            wboost = np.ones((train_size, 1)).astype(float) / train_size
            alphas = []
            epsilons = []
            forest = []
            for iter_ in range(num_iters):
                tree = ID3DecisionTree(1, path, "gain", cross_validation)
                x_train, y_train, x_test, y_test = tree.create_for_train(n_bin, fold)
                D_train = (x_train, y_train)
                wboost, epsilon, alpha = tree.boosttrain(D_train, wboost, epsilon_thread)
                forest.append(tree)
                epsilons.append(epsilon)
                if epsilon == 0:
                    # A perfect learner gets the entire vote.
                    alphas = [0] * len(alphas)
                    alphas.append(1)
                    break
                elif epsilon <= epsilon_thread or epsilon >= 0.5:
                    alphas.append(alpha)
                    break
                else:
                    alphas.append(alpha)
            result = []
            for t in range(len(forest)):
                y_predB = forest[t].test(x_test)
                y_pred = np.array(y_predB)
                y_pred[y_pred == False] = 0
                y_pred[y_pred == True] = 1
                result.append(y_pred)
            # Weighted vote of the ensemble.
            alphas = np.array(alphas)
            alphas = alphas / np.sum(alphas)
            y_pred = alphas.dot(np.array(result))
            y_pred[y_pred < 0.5] = 0
            y_pred[y_pred >= 0.5] = 1
            y_test = np.array(y_test)
            y_test[y_test < 0.5] = 0
            y_test[y_test >= 0.5] = 1
            AUC_y.extend(y_test)
            pred_AUC_y.extend(y_pred)
            _acc, _prec, _rec = util.cal_APR(y_pred, y_test)
            if cross_validation:
                util.report_cross(_acc, _prec, _rec)
            acc.append(_acc)
            prec.append(_prec)
            rec.append(_rec)
        elif solver_type == "nbayes":
            m_estimate = 0.1
            x_train, y_train, x_test, y_test = create_for_train(
                X_data, y_data, folds, n_bin, fold)
            train_size = len(x_train)
            wboost = np.ones((train_size, 1)).astype(float) / train_size
            alphas = []
            epsilons = []
            pre_ps = []
            posi_ps = []
            nega_ps = []
            for iter_ in range(num_iters):
                pre_p, posi_p, nega_p, epsilon, alpha, wboost = nbayes.boosttrain_bayes(
                    x_train, y_train, m_estimate, posi_num, nega_num, wboost, epsilon_thread)
                epsilons.append(epsilon)
                pre_ps.append(pre_p)
                posi_ps.append(posi_p)
                nega_ps.append(nega_p)
                if epsilon == 0:
                    alphas = [0] * len(alphas)
                    alphas.append(1)
                    break
                elif epsilon <= epsilon_thread or epsilon >= 0.5:
                    alphas.append(alpha)
                    break
                else:
                    alphas.append(alpha)
            result = []
            for t in range(len(pre_ps)):
                y_predB = nbayes.pred(x_test, pre_ps[t], posi_ps[t], nega_ps[t])
                y_pred = []
                for p in y_predB:
                    y_pred.append(0 if p[0] > p[1] else 1)
                result.append(np.array(y_pred))
            alphas = np.array(alphas)
            alphas = alphas / np.sum(alphas)
            y_pred = alphas.dot(np.array(result))
            y_pred[y_pred < 0.5] = 0
            y_pred[y_pred >= 0.5] = 1
            AUC_y.extend(y_test)
            pred_AUC_y.extend(y_pred)
            _acc, _prec, _rec = util.cal_APR(y_pred, y_test)
            if cross_validation:
                util.report_cross(_acc, _prec, _rec)
            acc.append(_acc)
            prec.append(_prec)
            rec.append(_rec)
        elif solver_type == "logreg":
            # Train.
            x_train, y_train, x_test, y_test = create_for_train(
                X_data, y_data, folds, n_bin, fold)
            train_size = len(x_train)
            wboost = np.ones((train_size, 1)).astype(float) / train_size
            alphas = []
            epsilons = []
            weights = []
            for iter_ in range(num_iters):
                weight, epsilon, alpha, wboost = logreg.boostLR(
                    x_train, y_train, wboost, epsilon_thread, max_iters=500, lbd=0.1)
                epsilons.append(epsilon)
                weights.append(weight)
                if epsilon == 0:
                    alphas = [0] * len(alphas)
                    alphas.append(1)
                    break
                elif epsilon <= epsilon_thread or epsilon >= 0.5:
                    alphas.append(alpha)
                    break
                else:
                    alphas.append(alpha)
            result = []
            for t in range(len(weights)):
                result.append(logreg.pred(x_test, weights[t]))
            alphas = np.array(alphas)
            alphas = alphas / np.sum(alphas)
            y_pred = alphas.dot(np.array(result))
            y_pred[y_pred < 0.5] = 0
            y_pred[y_pred >= 0.5] = 1
            AUC_y.extend(y_test)
            pred_AUC_y.extend(y_pred)
            _acc, _prec, _rec = logreg.cal_LR_APR(y_pred, y_test)
            if cross_validation:
                util.report_cross(_acc, _prec, _rec)
            acc.append(_acc)
            prec.append(_prec)
            rec.append(_rec)
        else:
            return
    roc_score = logreg.cal_AUC(AUC_y, pred_AUC_y)
    util.report(acc, prec, rec, roc_score)
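# logreg.cal_AUC above is project code that isn't shown. A minimal sketch of a
# rank-based ROC AUC, assuming binary labels and real-valued scores; this is
# the normalized Mann-Whitney U statistic.
import numpy as np

def cal_AUC(y_true, y_score):
    """Probability that a random positive example outranks a random negative."""
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score, dtype=float)
    pos = y_score[y_true == 1]
    neg = y_score[y_true == 0]
    if len(pos) == 0 or len(neg) == 0:
        return 0.5  # AUC is undefined with one class; fall back to chance
    wins = (pos[:, None] > neg[None, :]).sum()
    ties = (pos[:, None] == neg[None, :]).sum()
    return (wins + 0.5 * ties) / (len(pos) * len(neg))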
    def _is_binary(self, type):
        # The original re.compile("BINARY*") matched "BINAR" plus any number of
        # trailing "Y"s; test the literal prefix instead.
        return type.startswith("BINARY")

    def _is_class(self, type):
        return type.startswith("CLASS")


class TypeError(Exception):  # NOTE: shadows the built-in TypeError
    pass


class FunctionNotSupported(Exception):
    pass


if __name__ == '__main__':
    # TESTING --- delete for finished product.
    import sys
    sys.path.append("..")
    import mldata
    x = mldata.parse_c45("spam", "../spam")
    e = EntropySelector(x)
    print(e.get_split_attr({6: 0.0}, 0))
def main():
    # Validate configuration values.
    if ENABLE_VAL != 0 and ENABLE_VAL != 1:
        raise ValueError("ENABLE_VAL must be 0 or 1")
    if ALGORITHM != 1 and ALGORITHM != 2 and ALGORITHM != 3:
        raise ValueError("ALGORITHM must be 1, 2, or 3")
    if ITER <= 0:
        raise ValueError("ITER must be positive")
    elif type(ITER) != int:
        raise ValueError("ITER must be an integer")
    # Read data.
    path_name = DATA_PATH.rpartition('/')
    path = path_name[0]
    name = path_name[2]
    full_dataset = mldata.parse_c45(name, path)
    # Build models. All three algorithms share the same evaluation, so it is
    # factored out below the branch.
    if ENABLE_VAL == 1:
        if ALGORITHM == 1:
            # Boosted decision trees: start from uniform example weights.
            weight = 1 / len(full_dataset) * np.ones(len(full_dataset))
            weight = weight.reshape(-1, 1)
            alpha_list, label_list = build_tree_boosting.boosting(
                MAX_DEPTH, EPS, full_dataset, full_dataset, ENABLE_GAIN, ITER, weight)
        elif ALGORITHM == 2:
            alpha_list, label_list = naive_gayes.naive_bayes(
                full_dataset, full_dataset, ITER, NUM_BINS, M)
        elif ALGORITHM == 3:
            lg = logreg.Logistic_Regression(
                lambdaa=LAMBDA, training_data=full_dataset, iteration=1,
                learning_rate=LR, boosting=True)
            lg, alpha_list, label_list = update_lg(lg, full_dataset)
        f_list = compute_f_list(alpha_list, label_list)
        accuracy, precision, recall = compute_test_results(full_dataset, f_list)
        ROC_area = compute_ROC_area()
        print("Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nArea under ROC: %.3f\n"
              % (accuracy, precision, recall, ROC_area))
    elif ENABLE_VAL == 0:
        datasets = fold_5_cv(full_dataset)
        accuracies, precisions, recalls = cv(datasets)
        avg_accuracy = sum(accuracies) / 5
        avg_precision = sum(precisions) / 5
        avg_recall = sum(recalls) / 5
        std_accuracy = (sum((a - avg_accuracy)**2 for a in accuracies) / 5)**0.5
        std_precision = (sum((p - avg_precision)**2 for p in precisions) / 5)**0.5
        std_recall = (sum((r - avg_recall)**2 for r in recalls) / 5)**0.5
        ROC_area = compute_ROC_area()
        print("Accuracy: %.3f %.3f\nPrecision: %.3f %.3f\nRecall: %.3f %.3f\nArea under ROC: %.3f\n"
              % (avg_accuracy, std_accuracy, avg_precision, std_precision,
                 avg_recall, std_recall, ROC_area))
def load_data():
    path_name = DATA_PATH.rpartition('/')
    path = path_name[0]
    name = path_name[2]
    full_dataset = mldata.parse_c45(name, path)
    return ExampleSet(full_dataset)
# In[ ]:

def calcAve(ar):
    total = 0
    for i in range(ar.shape[0]):
        total = total + ar[i]
    return total / ar.shape[0]


# In[ ]:

path = input('Enter the path to the data:')
cv = int(input('Cross Validation? 0 for cv, 1 for full sample'))
lamda = int(input('Enter the value of lambda:'))
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
dataType = []
data = parse_c45(path)
for i in data.schema:
    dataType.append(i.type)
data = np.array(data.to_float())
for k in range(len(dataType)):
    if dataType[k] == 'NOMINAL':
        # Shift nominal encodings up by one for every example. The original
        # iterated range(data.shape[1]) (columns) while indexing rows.
        for j in range(data.shape[0]):
            data[j, k] += 1.0
logReg_Cross(data, lamda, cv)
from mldata import parse_c45
from math import log2

print(' ')
dataset = parse_c45("example")
print(dataset.schema[4].type)
print(dataset.schema[4].values)
print(dataset[1])
print(set(dataset[1]))

# def entropy(p):
#     # H(X) = -sum(px * log2(px))
#     summary = 0
#     for count in p:
#         px = count / sum(p)
#         if px != 0:
#             summary += px * log2(px)
#     return summary * -1
#
# def information_gain(data, x=None):
#     # IG = H(y) - H(y|x)
#     summary = 0
#     for i in x:
#         summary += sum(i) / sum(data) * entropy(i)
#     return entropy(data) - summary
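# A runnable version of the sketches above, assuming class counts are given as
# a list of integers and a split is a list of such count lists per branch:
from math import log2

def entropy(counts):
    """H(X) = -sum(p * log2(p)) over the normalized counts."""
    total = sum(counts)
    return -sum((c / total) * log2(c / total) for c in counts if c)

def information_gain(parent_counts, branch_counts):
    """IG = H(y) - H(y|x): parent entropy minus weighted branch entropy."""
    total = sum(parent_counts)
    remainder = sum(sum(b) / total * entropy(b) for b in branch_counts)
    return entropy(parent_counts) - remainder

# Example: a perfect binary split of 2 positives and 2 negatives has gain 1.0.
print(information_gain([2, 2], [[2, 0], [0, 2]]))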
import os  # needed below for os.environ and os.path; missing from the original
import sys
import numpy as np
import pandas as pd

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TensorFlow C++ logging
import tensorflow as tf

# CONFIGURE HYPERPARAMETERS
np.random.seed(12345)
tf.random.set_seed(12345)

# Get the command line argument for the data location.
argument_list = sys.argv[1:]
path = str(argument_list[0])
filename = os.path.basename(path)
filedir = path.replace(filename, '')
data = parse_c45(filename, filedir)

# Epsilon value and type of noise.
epsilon = float(argument_list[1])
noise_type = argument_list[2]

# Convert C4.5 data to a DataFrame and create folds.
unprocessed_df = data_to_dataframe(data)
attr_dict = create_attr_dict(data.schema)
df_whole, _ = process_data(unprocessed_df, attr_dict)
folds = create_folds(df_whole)

# Create a DataFrame to store important metrics.
metrics_df = pd.DataFrame(columns=['fold', 'accuracy', 'precision', 'recall'])
metrics = []
#!/usr/bin/env python
# coding: utf-8

# In[2]:

import os
import math
import operator
import numpy as np
import random
from mldata import parse_c45

data = np.array(parse_c45('voting').to_float())
data2 = np.array(parse_c45('spam').to_float())
data3 = np.array(parse_c45('volcanoes').to_float())


# In[1]:

def stratCrossValid(data):
    """Stratified 5-fold validation for both discrete and continuous cases."""
    subset0 = []
    subset1 = []
    fold1 = []
    fold2 = []
    fold3 = []
    fold4 = []
    fold5 = []
    for i in range(0, len(data)):
        if 1.0 == data[i, -1]:
data = mldata.parse_c45(path.split('/')[-1], path)
# Bind the fitted model to a new name; the original `logreg = logreg(...)`
# rebound and shadowed the logreg callable itself.
model = logreg(data, validationType=0, constant=0)
print('Final Weights', model.w)
'''
parser = argparse.ArgumentParser(
    description='Logistic Regression Implementation')
parser.add_argument('options', nargs=3,
                    help="The options as specified by the prompt.")
args = parser.parse_args()
path = str(args.options[0])
if os.path.isdir(path):
    file_base = next(el for el in reversed(path.split('/')) if el)
    exampleSet = mldata.parse_c45(file_base, path)
    schema = exampleSet.schema
    print("Loading dataset:", file_base)
else:
    assert 'Dataset input not found!'
xval = int(args.options[1])
if xval == 0:
    print("Cross Validation enabled")
elif xval == 1:
    print("Cross Validation disabled")
else:
    assert 'Unable to determine cross validation flag.'
constant = int(args.options[2])
if constant >= 0: