def main(**options):
    """Train and evaluate a classifier on every fold, then print statistics.

    Options consumed here (all remaining options are forwarded to
    ``get_classifier``, together with the dataset ``schema``):

    * ``dataset_directory`` -- forwarded to ``get_dataset``; defaults to '.'.
    * ``dataset`` -- required; forwarded to ``get_dataset``.
    * ``k`` -- required; forwarded to ``get_folds``.
    * ``fs_algorithm`` -- optional key into ``FS_ALGORITHMS``.
    * ``fs_features`` -- passed as ``n`` to the selected feature selector.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``, or
            ``fs_algorithm`` is given without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting.
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier (feature selection is part of the timed section)
        selector = None
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y)
        # Elapsed time is "now minus start"; the reverse order yields a
        # negative duration.
        train_time = time.time() - train_start

        # Apply the same feature selection to the held-out data.
        if selector is not None:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        score_shape = np.shape(scores)
        if len(score_shape) > 1 and score_shape[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print(' Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print(' Precision: %.03f %.03f'
          % stats_manager.get_statistic('precision', pooled=False))
    print(' Recall: %.03f %.03f'
          % stats_manager.get_statistic('recall', pooled=False))
    print('Area under ROC: %.03f'
          % stats_manager.get_statistic('auc', pooled=True))
def main(**options):
    """Train and evaluate a classifier on every fold, then print statistics.

    In this variant the dataset ``schema`` is handed to ``classifier.fit`` and
    ``classifier.predict`` directly rather than through the classifier
    options.

    Options consumed here (all remaining options are forwarded to
    ``get_classifier``):

    * ``dataset_directory`` -- forwarded to ``get_dataset``; defaults to '.'.
    * ``dataset`` -- required; forwarded to ``get_dataset``.
    * ``k`` -- required; forwarded to ``get_folds``.
    * ``fs_algorithm`` -- optional key into ``FS_ALGORITHMS``.
    * ``fs_features`` -- passed as ``n`` to the selected feature selector.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``, or
            ``fs_algorithm`` is given without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting.
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier (feature selection is part of the timed section)
        selector = None
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y, schema)
        # Elapsed time is "now minus start"; the reverse order yields a
        # negative duration.
        train_time = time.time() - train_start
        print(train_time)

        # Apply the same feature selection to the held-out data.
        if selector is not None:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X, schema)
        scores = classifier.predict_proba(test_X)
        score_shape = np.shape(scores)
        if len(score_shape) > 1 and score_shape[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print(' Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print(' Precision: %.03f %.03f'
          % stats_manager.get_statistic('precision', pooled=False))
    print(' Recall: %.03f %.03f'
          % stats_manager.get_statistic('recall', pooled=False))
    print('Area under ROC: %.03f'
          % stats_manager.get_statistic('auc', pooled=True))
def main(**options):
    """Evaluate every fold in a worker pool, then print statistics.

    Each fold is paired with the classifier options and handed to
    ``train_and_evaluate`` through ``pool.map``; the returned
    ``(test_y, predictions, scores, train_time)`` tuples are accumulated in a
    ``StatisticsManager``.

    Options consumed here (the remaining options, plus the dataset
    ``schema``, are what the workers receive):

    * ``dataset_directory`` -- forwarded to ``get_dataset``; defaults to '.'.
    * ``dataset`` -- required; forwarded to ``get_dataset``.
    * ``k`` -- required; forwarded to ``get_folds`` and used as pool size.
    * ``fs_algorithm`` / ``fs_features`` -- validated and removed.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``, or
            ``fs_algorithm`` is given without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting.
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    # NOTE(review): the feature-selection options are validated and removed
    # from ``options`` but are not forwarded to ``train_and_evaluate`` from
    # here -- confirm that feature selection is meant to be inactive.
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    pool = mp.Pool(k)  # one process per fold
    try:
        results = pool.map(train_and_evaluate,
                           [(fold, options) for fold in folds])
    finally:
        # Release the worker processes even if a fold raises.
        pool.close()
        pool.join()

    for test_y, predictions, scores, train_time in results:
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    accuracy, std_dev = stats_manager.get_statistic('accuracy', pooled=False)
    print(' Accuracy: %.03f %.03f' % (accuracy, std_dev))
    precision, std_dev = stats_manager.get_statistic('precision', pooled=False)
    print(' Precision: %.03f %.03f' % (precision, std_dev))
    recall, std_dev = stats_manager.get_statistic('recall', pooled=False)
    print(' Recall: %.03f %.03f' % (recall, std_dev))
    area = stats_manager.get_statistic('auc', pooled=True)
    print('Area under ROC: %.03f' % area)
def main(**options):
    """Train and evaluate a classifier on every fold and print its accuracy.

    This variant passes ``classifier.fit`` a list of per-feature
    ``schema.is_nominal(i)`` flags, predicts one test row at a time, and
    prints ``classifier.size()`` and ``classifier.depth()``.

    Options consumed here (all remaining options are forwarded to
    ``get_classifier``):

    * ``dataset_directory`` -- forwarded to ``get_dataset``; defaults to '.'.
    * ``dataset`` -- required; forwarded to ``get_dataset``.
    * ``k`` -- required; forwarded to ``get_folds``.
    * ``fs_algorithm`` -- optional key into ``FS_ALGORITHMS``.
    * ``fs_features`` -- passed as ``n`` to the selected feature selector.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``, or
            ``fs_algorithm`` is given without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting.
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    # One nominal/non-nominal flag per feature, in schema order.
    attr_set = [schema.is_nominal(i)
                for i in range(len(schema.feature_names))]
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier (feature selection is part of the timed section)
        selector = None
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y, attr_set)
        # Elapsed time is "now minus start"; the reverse order yields a
        # negative duration.
        train_time = time.time() - train_start
        print('ff')  # progress marker emitted after each fit

        # Apply the same feature selection to the held-out data.
        if selector is not None:
            test_X = selector.transform(test_X)
        # This classifier's predict() is called with a single example.
        predictions = [classifier.predict(row) for row in test_X]
        scores = classifier.predict_proba(test_X)
        score_shape = np.shape(scores)
        if len(score_shape) > 1 and score_shape[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

        # NOTE(review): the collapsed source does not show whether these two
        # prints ran once per fold or once after the loop; treated as per
        # fold here -- confirm.
        print(classifier.size())
        print(classifier.depth())

    print(' Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    # NOTE(review): the original snippet ends with a dangling ''' (the opener
    # of a disabled block whose contents are not visible here); it is dropped
    # so this function parses on its own -- confirm nothing relied on it.
def main(**options):
    """Train and evaluate a classifier on every fold and print a summary.

    Besides accuracy, this variant reports the largest ``classifier.size``
    and ``classifier.depth`` attribute values seen across the folds. The
    dataset ``schema`` is attached to the classifier and also passed to
    ``classifier.fit``.

    Options consumed here (all remaining options are forwarded to
    ``get_classifier``):

    * ``dataset_directory`` -- forwarded to ``get_dataset``; defaults to '.'.
    * ``dataset`` -- required; forwarded to ``get_dataset``.
    * ``k`` -- required; forwarded to ``get_folds``.
    * ``fs_algorithm`` -- optional key into ``FS_ALGORITHMS``.
    * ``fs_features`` -- passed as ``n`` to the selected feature selector.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``, or
            ``fs_algorithm`` is given without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting.
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    # Largest size/depth over the k folds, printed at the end; they stay at
    # -1 when there are no folds.
    max_size = -1
    max_depth = -1

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)
        classifier.schema = schema

        # Train classifier (feature selection is part of the timed section)
        selector = None
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        # fit() takes the schema as a third argument here.
        classifier.fit(train_X, train_y, schema)
        # Elapsed time is "now minus start"; the reverse order yields a
        # negative duration.
        train_time = time.time() - train_start

        max_size = max(max_size, classifier.size)
        max_depth = max(max_depth, classifier.depth)

        # Apply the same feature selection to the held-out data.
        if selector is not None:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        score_shape = np.shape(scores)
        if len(score_shape) > 1 and score_shape[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The printouts specified by the assignments
    print('\tAccuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print("\tMaximum Size: %d" % max_size)
    print("\tMaximum Depth: %d" % max_depth)
def main(**options):
    """Train and evaluate a classifier on every fold and print a summary.

    The summary holds the accuracy statistic plus the maximum
    ``classifier.size`` and ``classifier.depth`` attribute values observed
    over all folds. The dataset ``schema`` is stored on the classifier and
    given to ``classifier.fit`` as well.

    Options consumed here (all remaining options are forwarded to
    ``get_classifier``):

    * ``dataset_directory`` -- forwarded to ``get_dataset``; defaults to '.'.
    * ``dataset`` -- required; forwarded to ``get_dataset``.
    * ``k`` -- required; forwarded to ``get_folds``.
    * ``fs_algorithm`` -- optional key into ``FS_ALGORITHMS``.
    * ``fs_features`` -- passed as ``n`` to the selected feature selector.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``, or
            ``fs_algorithm`` is given without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting.
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    # Running maxima across the folds; both remain -1 if no fold is run.
    max_size = -1
    max_depth = -1

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)
        classifier.schema = schema

        # Train classifier (feature selection is part of the timed section)
        selector = None
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        # fit() takes the schema as a third argument here.
        classifier.fit(train_X, train_y, schema)
        # Elapsed time is "now minus start"; the reverse order yields a
        # negative duration.
        train_time = time.time() - train_start

        max_size = max(max_size, classifier.size)
        max_depth = max(max_depth, classifier.depth)

        # Apply the same feature selection to the held-out data.
        if selector is not None:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        score_shape = np.shape(scores)
        if len(score_shape) > 1 and score_shape[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The printouts specified by the assignments
    print('\tAccuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print("\tMaximum Size: %d" % max_size)
    print("\tMaximum Depth: %d" % max_depth)
def main(**options):
    """Train and evaluate a classifier on every fold and print its accuracy.

    ``classifier.fit`` receives a list holding ``schema.is_nominal(i)`` for
    each feature, test rows are predicted one at a time, and
    ``classifier.size()`` / ``classifier.depth()`` are printed.

    Options consumed here (all remaining options are forwarded to
    ``get_classifier``):

    * ``dataset_directory`` -- forwarded to ``get_dataset``; defaults to '.'.
    * ``dataset`` -- required; forwarded to ``get_dataset``.
    * ``k`` -- required; forwarded to ``get_folds``.
    * ``fs_algorithm`` -- optional key into ``FS_ALGORITHMS``.
    * ``fs_features`` -- passed as ``n`` to the selected feature selector.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``, or
            ``fs_algorithm`` is given without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting.
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    # One nominal/non-nominal flag per feature, in schema order.
    attr_set = [schema.is_nominal(i)
                for i in range(len(schema.feature_names))]
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier (feature selection is part of the timed section)
        selector = None
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y, attr_set)
        # Elapsed time is "now minus start"; the reverse order yields a
        # negative duration.
        train_time = time.time() - train_start
        print('ff')  # progress marker emitted after each fit

        # Apply the same feature selection to the held-out data.
        if selector is not None:
            test_X = selector.transform(test_X)
        # This classifier's predict() is called with a single example.
        predictions = [classifier.predict(row) for row in test_X]
        scores = classifier.predict_proba(test_X)
        score_shape = np.shape(scores)
        if len(score_shape) > 1 and score_shape[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

        # NOTE(review): the collapsed source does not show whether these two
        # prints ran once per fold or once after the loop; treated as per
        # fold here -- confirm.
        print(classifier.size())
        print(classifier.depth())

    print(' Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    # NOTE(review): the original snippet ends with a dangling ''' (the opener
    # of a disabled block whose contents are not visible here); it is dropped
    # so this function parses on its own -- confirm nothing relied on it.
def main(**options):
    """Train and evaluate a classifier on every fold and print a summary.

    For each fold the root test of the learned classifier is printed
    (feature name, plus ``<= split`` when ``classifier.root.split`` is
    truthy). After all folds, accuracy and the mean of ``classifier.size()``
    and ``classifier.depth()`` are printed.

    Options consumed here (all remaining options, plus the dataset
    ``schema``, are forwarded to ``get_classifier``):

    * ``dataset_directory`` -- forwarded to ``get_dataset``; defaults to '.'.
    * ``dataset`` -- required; forwarded to ``get_dataset``.
    * ``k`` -- required; forwarded to ``get_folds``.
    * ``fs_algorithm`` -- optional key into ``FS_ALGORITHMS``.
    * ``fs_features`` -- passed as ``n`` to the selected feature selector.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``, or
            ``fs_algorithm`` is given without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting.
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    # put schema in the options so the classifier has it
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    sizes = []
    depths = []

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        classifier = get_classifier(**options)

        # Train classifier (feature selection is part of the timed section)
        selector = None
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y)
        # Elapsed time is "now minus start" (the reverse order yields a
        # negative duration), taken right after fit() so the reporting below
        # is not counted as training time.
        train_time = time.time() - train_start

        # Print the first test of each learned classifier
        first_test = classifier.root.feature
        print_str = schema.feature_names[first_test]
        # NOTE(review): a truthiness check also skips a split value of 0 --
        # confirm whether "is not None" was intended.
        if classifier.root.split:
            print_str += " <= %f" % classifier.root.split
        print("First test: %s" % print_str)

        sizes.append(classifier.size())
        depths.append(classifier.depth())

        # Apply the same feature selection to the held-out data.
        if selector is not None:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        score_shape = np.shape(scores)
        if len(score_shape) > 1 and score_shape[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print(' Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print(' Average Size: %.03f' % np.mean(sizes))
    print(' Average Depth: %.03f' % np.mean(depths))
    # NOTE(review): the original snippet ends with a dangling ''' (the opener
    # of a disabled block whose contents are not visible here); it is dropped
    # so this function parses on its own -- confirm nothing relied on it.