k = map(int, k.strip().split(',')) PRINTER("loaded k-list: " + str(k)) from mlknn import mlknn_tensembled mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_tensembled.MlknnTEnsembled( train_gen, lambda sample, k: get_neighbours(sample, k, train_gen), k, get_labels_of_record_arg, lambda x: 1, printer) label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x) from mltools.ml_hierarchical import MlHierarchical classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record) PRINTER("Time taken for training:" + str(start - time())) PRINTER("------------------------") PRINTER("---Testing classifier---") PRINTER("------------------------") test_generator = read_pickle(load_test_generator) labels = read_pickle(load_labels_path) classify_oracle = mc2lmc_tomka_blad from mltools.multilabel_evaluate import multilabel_evaluate_printresults PRINTER("-----------RESULTS-----------") multilabel_evaluate_printresults(lambda: test_generator, classify_oracle, classifier.__getattribute__('classify'), len(labels), [('full label', lambda x: x), ('half label', lambda x: x[:3]), ('low label', lambda x: x[:2])], labels)
def main(train_generator_list, labels, elements_count, classifier_name, k, smoothing_param, distancematrix, test_generator):
    """Train the ML-kNN variant selected by classifier_name and evaluate it.

    Parameters:
        train_generator_list -- training records; handed to the classifier
            constructor and to find_closest_points_sorted.
        labels -- only len(labels) is used (passed to multilabel_evaluate).
        elements_count -- unused here; kept for interface compatibility.
        classifier_name -- one of 'mlknn_basic', 'mlknn_threshold',
            'mlknn_tensembled', 'mlknn-basic-tree', 'mlknn-threshold-tree',
            'mlknn-tensembled-tree'.
        k -- value convertible with int() for the basic/threshold variants;
            a comma-separated string of integers for the tensembled variants.
        smoothing_param -- forwarded to the basic/threshold classifiers.
        distancematrix -- argument forwarded to fread_smatrix.
        test_generator -- records the trained classifier is evaluated on.

    Returns:
        The tuple (accuracy, precision, recall, hammingloss, subset01loss,
        fmeasure) exactly as produced by multilabel_evaluate.

    Raises:
        ValueError -- if classifier_name is not one of the supported names.
    """
    PRINTER("Finding label list...")
    get_labels_of_record = mc2lmc_tomka_blad

    PRINTER("Loading distance matrix...")
    import sys
    # main() is called once per fold; do not grow sys.path on every call.
    if r'../' not in sys.path:
        sys.path.append(r'../')
    from data_io.matrix_io import fread_smatrix
    (rows, cols, data) = fread_smatrix(distancematrix)
    # Map record identifiers to their row / column position in the matrix.
    id2rowind = dict((rec_id, ind) for ind, rec_id in enumerate(rows))
    id2colind = dict((rec_id, ind) for ind, rec_id in enumerate(cols))

    PRINTER("Training classifier...")
    from time import time

    def printer(x):
        # Relies on a module-level `import logging`.
        logging.info('['+classifier_name+']'+x)

    def distance(a, b):
        # Records are looked up in the matrix by their 'an' field.
        try:
            return data[id2rowind[a['an']]][id2colind[b['an']]]
        except (KeyError, IndexError):
            # NOTE(review): the fallback indexes rows with the column index of
            # b and columns with the row index of a. It is kept exactly as it
            # was; confirm this is the intended lookup.
            return data[id2colind[b['an']]][id2rowind[a['an']]]

    def neighbours_in(train_gen):
        # Neighbour search bound to one training set; the sample itself is
        # passed as the third argument, as in the original code.
        return lambda sample, k: find_closest_points_sorted(sample, train_gen, [sample], k, distance)

    start = time()
    if classifier_name in ('mlknn_basic', 'mlknn-basic-tree'):
        k = int(k)
        from mlknn import mlknn_basic

        def mlknn_callable(train_gen, get_labels_of_record_arg):
            return mlknn_basic.MlknnBasic(train_gen, neighbours_in(train_gen), k, smoothing_param,
                                          get_labels_of_record_arg, lambda x: 1, printer)
    elif classifier_name in ('mlknn_threshold', 'mlknn-threshold-tree'):
        k = int(k)
        from mlknn import mlknn_threshold

        def mlknn_callable(train_gen, get_labels_of_record_arg):
            return mlknn_threshold.MlknnThreshold(train_gen, neighbours_in(train_gen), k, smoothing_param,
                                                  get_labels_of_record_arg, lambda x: 1, printer)
    elif classifier_name in ('mlknn_tensembled', 'mlknn-tensembled-tree'):
        # Build a real list so that logging and reuse behave the same whether
        # map() returns a list or an iterator.
        k = [int(item) for item in k.strip().split(',')]
        PRINTER("loaded k-list: "+str(k))
        from mlknn import mlknn_tensembled

        def mlknn_callable(train_gen, get_labels_of_record_arg):
            return mlknn_tensembled.MlknnTEnsembled(train_gen, neighbours_in(train_gen), k,
                                                    get_labels_of_record_arg, lambda x: 1, printer)
    else:
        # Previously an unknown name surfaced only later, as an unbound
        # `classifier` variable.
        raise ValueError("unknown classifier_name: %r" % (classifier_name,))

    if classifier_name.endswith('-tree'):
        # Hierarchical variant: one classifier per label-prefix level.
        label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
        from mltools.ml_hierarchical import MlHierarchical
        classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record)
    else:
        classifier = mlknn_callable(train_generator_list, get_labels_of_record)

    # Elapsed time is now - start (the original logged a negative number).
    PRINTER("Time taken for training:"+str(time()-start))
    PRINTER("------------------------")
    PRINTER("---Testing classifier---")
    PRINTER("------------------------")

    classify_oracle = mc2lmc_tomka_blad
    from mltools.multilabel_evaluate import multilabel_evaluate, multilabel_evaluate_printresults
    label_views = [('full label', lambda x: x),
                   ('half label', lambda x: x[:3]),
                   ('low label', lambda x: x[:2])]
    accuracy, precision, recall, hammingloss, subset01loss, fmeasure = multilabel_evaluate(
        lambda: test_generator, classify_oracle, classifier.classify, len(labels), label_views)
    PRINTER("-----------RESULTS-----------")
    multilabel_evaluate_printresults(accuracy, precision, recall, hammingloss, subset01loss, fmeasure, PRINTER)
    return accuracy, precision, recall, hammingloss, subset01loss, fmeasure
PRINTERMAIN("save_elements_count_path: "+save_elements_count_path) PRINTERMAIN("filtered_by: "+str(filtered_by)) """ from main_train_classifier_distmat import main #curr_accuracy, curr_precision, curr_recall, curr_hammingloss, curr_subset01loss, curr_fmeasure measures = [[] for _ in xrange(6)]#6 measures for train_generator, test_generator, elements_count, labels, elements_count in gen_train_test_kfold(fname, codeprefixlen, mincodeoccurences, filtered_by, kfold): from choose_best_k import evaluate_k_kfold train_elements_count = len(train_generator) k_evaluation = evaluate_k_kfold(labels, labelsset, lambda: train_generator, train_elements_count, classifier_name, k, smoothing_param, distancematrix, kfold) #sys.exit(1) sub_measures = main(train_generator, labels, elements_count, classifier_name, k, smoothing_param, distancematrix, test_generator) for i, sub_m in enumerate(sub_measures): measures[i].append(sub_m) #summarize, each : final_measures = [{} for _ in xrange(6)] for ind, final_measure in enumerate(final_measures): for key in measures[0][0].keys(): final_measure[key] = 0 for key in measures[0][0].keys(): for measure in measures[ind]: final_measure[key] += measure[key] for key in measures[0][0].keys(): final_measure[key] /= len(measures[0]) from mltools.multilabel_evaluate import multilabel_evaluate_printresults PRINTERMAIN("---FINAL RESULTS---") def PRINTER_PARAM(x): print x multilabel_evaluate_printresults(*final_measures)
kfold = int(sys.argv[8]) except: print '8th argument: how many folds.' sys.exit(1) try: filtered_by = sys.argv[9:] except: print '8th argument: field names which have to occur for the record to be considered.' sys.exit(1) """ PRINTERMAIN("Input arguments:") PRINTERMAIN("fname: "+fname) PRINTERMAIN("codeprefixlen: "+str(codeprefixlen)) PRINTERMAIN("mincodeoccurences: "+str(mincodeoccurences)) PRINTERMAIN("save_train_generator_path: "+save_train_generator_path) PRINTERMAIN("save_test_generator_path: "+save_test_generator_path) PRINTERMAIN("save_labels_path: "+save_labels_path) PRINTERMAIN("save_elements_count_path: "+save_elements_count_path) PRINTERMAIN("filtered_by: "+str(filtered_by)) """ labels, labelsset, prefix_code_generator, elements_count = load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by) from main_train_classifier_distmat import main #curr_accuracy, curr_precision, curr_recall, curr_hammingloss, curr_subset01loss, curr_fmeasure final_measures = evaluate_k_fold(classifier_name, k, smoothing_param, distancematrix, kfold, labels, labelsset, prefix_code_generator, elements_count) from mltools.multilabel_evaluate import multilabel_evaluate_printresults PRINTERMAIN("---FINAL RESULTS---") def PRINTER_PARAM(x): print x multilabel_evaluate_printresults(*(final_measures+[PRINTER_PARAM]))
PRINTER("load_hierarchical_path: "+str(load_hierarchical_path)) PRINTER("load_train_generator: "+str(load_train_generator)) PRINTER("lenlabels_path: "+str(lenlabels_path)) log_level = logging.INFO logging.basicConfig(level=log_level) from tools.pickle_tools import read_pickle hierarhical_mlknn = read_pickle(load_hierarchical_path) test_generator = read_pickle(load_train_generator) lenlabels = read_pickle(lenlabels_path) #print "Finding out if the ML-hierarchical has internal data..." #check_internal_data(hierarhical_mlknn) print "----------------------------------------------------" #print "MLKNN:" #print "PRINTING TEST SAMPLES:" #for i in test_generator: # print classify_oracle(i) classify_oracle = lambda x: mc2lmc_tomka_blad(x) multilabel_evaluate_printresults(lambda: test_generator, classify_oracle, hierarhical_mlknn.classify, lenlabels, {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]}) #print "----------------------------------------------------" #print "STUPID KNN:" #multilabel_evaluate_printresults(test_generator, classify_oracle, hierarhical_mlknn.classify_stupid, len(labels), # #{'full label': lambda x: x, 'short label': lambda x: x[:1]}) # {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]})
k, smoothing_param, get_labels_of_record_arg, lambda x:1, printer) elif classifier_name == 'mlknn-tensembled-tree': k = map(int, k.strip().split(',')) PRINTER("loaded k-list: "+str(k)) from mlknn import mlknn_tensembled mlknn_callable = lambda train_gen, get_labels_of_record_arg: mlknn_tensembled.MlknnTEnsembled(train_gen, lambda sample, k: get_neighbours(sample, k, train_gen), k, get_labels_of_record_arg, lambda x:1, printer) label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x) from mltools.ml_hierarchical import MlHierarchical classifier = MlHierarchical(train_generator_list, mlknn_callable, label_mappings, get_labels_of_record) PRINTER("Time taken for training:"+str(start-time())) PRINTER("------------------------") PRINTER("---Testing classifier---") PRINTER("------------------------") test_generator = read_pickle(load_test_generator) labels = read_pickle(load_labels_path) classify_oracle = mc2lmc_tomka_blad from mltools.multilabel_evaluate import multilabel_evaluate_printresults PRINTER("-----------RESULTS-----------") multilabel_evaluate_printresults(lambda: test_generator, classify_oracle, classifier.__getattribute__('classify'), len(labels), [('full label', lambda x: x), ('half label', lambda x: x[:3]), ('low label', lambda x: x[:2])], labels)
train_elements_count = len(train_generator) k_evaluation = evaluate_k_kfold(labels, labelsset, lambda: train_generator, train_elements_count, classifier_name, k, smoothing_param, distancematrix, kfold) #sys.exit(1) sub_measures = main(train_generator, labels, elements_count, classifier_name, k, smoothing_param, distancematrix, test_generator) for i, sub_m in enumerate(sub_measures): measures[i].append(sub_m) #summarize, each : final_measures = [{} for _ in xrange(6)] for ind, final_measure in enumerate(final_measures): for key in measures[0][0].keys(): final_measure[key] = 0 for key in measures[0][0].keys(): for measure in measures[ind]: final_measure[key] += measure[key] for key in measures[0][0].keys(): final_measure[key] /= len(measures[0]) from mltools.multilabel_evaluate import multilabel_evaluate_printresults PRINTERMAIN("---FINAL RESULTS---") def PRINTER_PARAM(x): print x multilabel_evaluate_printresults(*final_measures)
filtered_by = sys.argv[9:] except: print '8th argument: field names which have to occur for the record to be considered.' sys.exit(1) """ PRINTERMAIN("Input arguments:") PRINTERMAIN("fname: "+fname) PRINTERMAIN("codeprefixlen: "+str(codeprefixlen)) PRINTERMAIN("mincodeoccurences: "+str(mincodeoccurences)) PRINTERMAIN("save_train_generator_path: "+save_train_generator_path) PRINTERMAIN("save_test_generator_path: "+save_test_generator_path) PRINTERMAIN("save_labels_path: "+save_labels_path) PRINTERMAIN("save_elements_count_path: "+save_elements_count_path) PRINTERMAIN("filtered_by: "+str(filtered_by)) """ labels, labelsset, prefix_code_generator, elements_count = load_labels_codegen_elemcnt( fname, codeprefixlen, mincodeoccurences, filtered_by) from main_train_classifier_distmat import main #curr_accuracy, curr_precision, curr_recall, curr_hammingloss, curr_subset01loss, curr_fmeasure final_measures = evaluate_k_fold(classifier_name, k, smoothing_param, distancematrix, kfold, labels, labelsset, prefix_code_generator, elements_count) from mltools.multilabel_evaluate import multilabel_evaluate_printresults PRINTERMAIN("---FINAL RESULTS---") def PRINTER_PARAM(x): print x multilabel_evaluate_printresults(*(final_measures + [PRINTER_PARAM]))
labels_path = sys.argv[3] except: print '3d argument expected: path to a pickled labels list.' sys.exit(1) try: classify_method_name = sys.argv[4] except: print '4th argument expected: classify method name.' sys.exit(1) #PRINTER("Input arguments:") #PRINTER("load_classifier_path: "+str(load_classifier_path)) #PRINTER("load_test_generator: "+str(load_test_generator)) #PRINTER("labels_path: "+str(labels_path)) #PRINTER("classify_method_name: "+str(classify_method_name)) from tools.pickle_tools import read_pickle classifier = read_pickle(load_classifier_path) test_generator = read_pickle(load_test_generator) labels = read_pickle(labels_path) #print "Finding out if the ML-hierarchical has internal data..." #check_internal_data(hierarhical_mlknn) classify_oracle = mc2lmc_tomka_blad #print "----------------------------------------------------" #print "Hierachical MLKNN:" PRINTER("-----------RESULTS-----------") multilabel_evaluate_printresults(lambda: test_generator, classify_oracle, classifier.__getattribute__(classify_method_name), len(labels), {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]}, labels)
log_level = logging.INFO logging.basicConfig(level=log_level) from tools.pickle_tools import read_pickle hierarhical_mlknn = read_pickle(load_hierarchical_path) test_generator = read_pickle(load_train_generator) lenlabels = read_pickle(lenlabels_path) #print "Finding out if the ML-hierarchical has internal data..." #check_internal_data(hierarhical_mlknn) print "----------------------------------------------------" #print "MLKNN:" #print "PRINTING TEST SAMPLES:" #for i in test_generator: # print classify_oracle(i) classify_oracle = lambda x: mc2lmc_tomka_blad(x) multilabel_evaluate_printresults( lambda: test_generator, classify_oracle, hierarhical_mlknn.classify, lenlabels, { 'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2] }) #print "----------------------------------------------------" #print "STUPID KNN:" #multilabel_evaluate_printresults(test_generator, classify_oracle, hierarhical_mlknn.classify_stupid, len(labels), # #{'full label': lambda x: x, 'short label': lambda x: x[:1]}) # {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]})
# One label mapping per hierarchy level: 2-character prefix, 3-character
# prefix, full label.
label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)

PRINTER("Training hierarchical mlknn...")
from time import time
start = time()
hierarhical_mlknn = MlHierarchical(train_generator, mlknn_callable, label_mappings, get_labels_of_record)
# Elapsed time is now - start (the original logged a negative number).
PRINTER("time taken for training:" + str(time() - start))

PRINTER("Testing hierarchical mlknn fractional...")
test_generator = read_pickle(load_test_generator)
labels = read_pickle(load_labels_path)

#print "Finding out if the ML-hierarchical has internal data..."
#check_internal_data(hierarhical_mlknn)

classify_oracle = mc2lmc_tomka_blad

from mltools.multilabel_evaluate import multilabel_evaluate_printresults
PRINTER("-----------RESULTS-----------")
multilabel_evaluate_printresults(
    lambda: test_generator,
    classify_oracle,
    hierarhical_mlknn.classify,
    len(labels),
    {
        'full label': lambda x: x,
        'half label': lambda x: x[:3],
        'low label': lambda x: x[:2],
    },
    labels)

#from tools.pickle_tools import save_pickle
#save_pickle(hierarhical_mlknn, save_classifier_path)
get_labels_of_record = mc2lmc_tomka_blad
# Factory called by MlHierarchical to build one MlKnn per training subset.
mlknn_callable = lambda train_gen, get_labels_of_record_arg: MlKnn(
    train_gen, zbldistance, find_closest_points, k, smoothingparam, get_labels_of_record_arg)
# One label mapping per hierarchy level: 2-character prefix, 3-character
# prefix, full label.
label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)

PRINTER("Training hierarchical mlknn...")
from time import time
start = time()
hierarhical_mlknn = MlHierarchical(train_generator, mlknn_callable, label_mappings, get_labels_of_record)
# Elapsed time is now - start (the original logged a negative number).
PRINTER("time taken for training:"+str(time()-start))

PRINTER("Testing hierarchical mlknn...")
test_generator = read_pickle(load_test_generator)
labels = read_pickle(load_labels_path)

#print "Finding out if the ML-hierarchical has internal data..."
#check_internal_data(hierarhical_mlknn)

classify_oracle = mc2lmc_tomka_blad

from mltools.multilabel_evaluate import multilabel_evaluate_printresults
PRINTER("-----------RESULTS-----------")
multilabel_evaluate_printresults(
    lambda: test_generator,
    classify_oracle,
    hierarhical_mlknn.classify,
    len(labels),
    {
        'full label': lambda x: x,
        'half label': lambda x: x[:3],
        'low label': lambda x: x[:2],
    },
    labels)

#from tools.pickle_tools import save_pickle
#save_pickle(hierarhical_mlknn, save_classifier_path)