def classify(self, sample, excluding=None, closest=3):
    """
    Classify sample by finding the 'closest' nearest points and
    concatenating their classification codes.

    @param sample: record to classify
    @param excluding: records to skip during the neighbour search;
        defaults to no exclusions (fixed: a mutable ``[]`` default is
        shared between calls and would leak state if ever mutated)
    @param closest: number of nearest neighbours to consider
    @return: concatenation of the label codes of the nearest neighbours
    """
    if excluding is None:
        excluding = []
    # Distance parameterised by this classifier's learned weights/shifts.
    f = lambda x, y: dist(x, y, self.weights, self.shifts)
    best_elems = find_closest_points(sample, self.frecords, excluding, closest, f)
    # mc2lmc_tomka_blad(r) yields the label codes of one neighbour;
    # reduce(+) concatenates them (raises on an empty neighbour list,
    # as the original did).
    return reduce(lambda x, y: x + y, map(mc2lmc_tomka_blad, best_elems))
def classify(self, sample, excluding=None, closest=3):
    """
    Classify sample, finding the 'closest' number of closest points and
    assigning their classification codes.

    @param sample: record to classify
    @param excluding: records excluded from the neighbour search;
        ``None`` means no exclusions (replaces the shared-mutable
        ``[]`` default, which is a classic Python pitfall)
    @param closest: how many nearest neighbours to use
    @return: concatenated label codes of the chosen neighbours
    """
    if excluding is None:
        excluding = []
    # Weighted distance using the trained weights and shifts.
    f = lambda x, y: dist(x, y, self.weights, self.shifts)
    best_elems = find_closest_points(sample, self.frecords, excluding, closest, f)
    # Concatenate each neighbour's label codes into one result.
    return reduce(lambda x, y: x + y, map(mc2lmc_tomka_blad, best_elems))
mlknn_callable = lambda train_gen: mlknn.MlKnn(train_gen, zbldistance, find_closest_points.find_closest_points, k, smoothingparam) label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x) record_mappings = (lambda x: gen_1record_prefixed(x, 2), lambda x: gen_1record_prefixed(x, 3), lambda x: x) hierarhical_mlknn = ml_hierarchical.MlHierarchical(train_generator, mlknn_callable, label_mappings, record_mappings) from tools.pickle_tools import save_pickle save_pickle(hierarhical_mlknn.mltree.content, save_hierarchical_path+"mlknn") save_pickle(hierarhical_mlknn, save_hierarchical_path) save_pickle(list(train_generator()), save_train_generator_path) save_pickle(len(labels), save_lenlabels_path) classify_oracle = lambda x: mc2lmc_tomka_blad(x) print "----------------------------------------------------" print "MLKNN:" print "PRINTING TEST SAMPLES:" for i in test_generator(): print classify_oracle(i) multilabel_evaluate_printresults(test_generator, classify_oracle, hierarhical_mlknn.classify, len(labels), {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]}) #print "----------------------------------------------------" #print "STUPID KNN:" #multilabel_evaluate_printresults(test_generator, classify_oracle, hierarhical_mlknn.classify_stupid, len(labels), # #{'full label': lambda x: x, 'short label': lambda x: x[:1]}) # {'full label': lambda x: x, 'half label': lambda x: x[:3], 'low label': lambda x: x[:2]})
# CLI driver: build a weighted random-label classifier and pickle it.
#
# argv[1]: pickled list of training records
# argv[2]: pickled label list (only its length is read)
# argv[3]: pickled element count
# argv[4]: output path for the pickled classifier
if len(sys.argv) < 5:
    PRINTER("Not enough of argument!")
    # sys.exit instead of bare exit(): exit() comes from the site module
    # and is not guaranteed to exist in non-interactive runs.
    sys.exit(1)
load_train_generator_path = sys.argv[1]
load_labels_path = sys.argv[2]
load_elements_count_path = sys.argv[3]
save_classifier_path = sys.argv[4]

PRINTER("Input arguments:")
PRINTER("load_train_generator_path: " + str(load_train_generator_path))
PRINTER("load_labels_path: " + str(load_labels_path))
PRINTER("load_elements_count_path: " + str(load_elements_count_path))
PRINTER("save_classifier_path: " + str(save_classifier_path))

from tools.pickle_tools import read_pickle
train_generator_list = read_pickle(load_train_generator_path)
lenlabels = len(read_pickle(load_labels_path))
elements_count = read_pickle(load_elements_count_path)

# Wrap the materialised list so it can be re-iterated like a generator
# factory, as the classifier expects.
train_generator = lambda: train_generator_list
get_labels_of_record = mc2lmc_tomka_blad
# NOTE(review): kept as a lambda to match the module's existing
# convention — confirm it survives the project's pickle_tools.
classify_oracle = lambda x: mc2lmc_tomka_blad(x)

random_classif = WeightedRandomLabelClassifier(train_generator,
                                               get_labels_of_record,
                                               classify_oracle)

from tools.pickle_tools import save_pickle
save_pickle(random_classif, save_classifier_path)
from __future__ import division import random import sys sys.path.append(r'../') from data_io.zbl_record_generators import mc2lmc_tomka_blad def PRINTER(x): #pass import logging logging.info(x) #print x# classify_oracle = lambda x: mc2lmc_tomka_blad( x) #because function assigned outside from a class doesn't pickle! class RandomLabelClassifier(object): ''' Assign a random subset of labels to a given sample. Takes use of the information on how many labels have been assigned by oracle. ''' def __init__(self, frecords, get_labels_of_record, find_all_labels, classify_oracle): ''' Constructor. @type frecords: generator @param frecords: generator returning records, is used to calculate parameters (probabilities)
Classifier returning a random label assuming a uniform distribution. ''' from __future__ import division import random import sys sys.path.append(r'../') from data_io.zbl_record_generators import mc2lmc_tomka_blad def PRINTER(x): #pass import logging logging.info(x) #print x# classify_oracle = lambda x: mc2lmc_tomka_blad(x) #because function assigned outside from a class doesn't pickle! class RandomLabelClassifier(object): ''' Assign a random subset of labels to a given sample. Takes use of the information on how many labels have been assigned by oracle. ''' def __init__(self, frecords, get_labels_of_record, find_all_labels, classify_oracle): ''' Constructor. @type frecords: generator @param frecords: generator returning records, is used to calculate parameters (probabilities)
def get_posterior_probabilities_quicktrain(self, min_label_occurence):
    '''
    Compute the posterior probabilities P(Ej | Hb) for MLKNN.

    Trains only on a prefix of the data set: records are consumed until
    every label has occurred at least min_label_occurence times, then
    the loop stops early.
    #todo: do shuffling when choosing elements.

    @param min_label_occurence: minimum occurrences of each label before
        training stops early
    @return: nested dict peh[label][i][b] — smoothed probability of a
        record having i neighbours carrying `label`, conditioned on the
        record itself having the label (b=True) or not (b=False)
    '''
    from collections import defaultdict
    # Preparation: per-label histograms over neighbour counts 0..k.
    c = {}           # c[label][i]: records WITH label seen with i label-neighbours
    c_prim = {}      # c_prim[label][i]: records WITHOUT label, i label-neighbours
    labels_cnt = {}  # labels still below min_label_occurence -> count so far
    for label in self.labels:
        c[label] = {}
        c_prim[label] = {}
        for i in xrange(self.k + 1):
            # number of elements of a given label which have i neighbours
            # of that same label
            c[label][i] = 0
            c_prim[label][i] = 0
        labels_cnt[label] = 0
    # Accumulate counts record by record.
    elem_cnt = 0  # todel: debug progress counter
    for r in self.frecords():
        # Stop once every label is saturated (saturated labels are
        # popped from labels_cnt below).
        if len(labels_cnt) == 0:
            break
        labels_codes = mc2lmc_tomka_blad(r)
        elem_cnt += 1  # todel
        if elem_cnt % 100 == 1:  # todel: periodic progress trace
            print elem_cnt  # todel
            print "labels_cnt:", labels_cnt
        # Check whether this record still contributes to an unsaturated
        # label; update the per-label occurrence counters.
        is_important = False
        for l in labels_codes:
            if l in labels_cnt:
                is_important = True
                labels_cnt[l] += 1
                if labels_cnt[l] >= min_label_occurence:
                    labels_cnt.pop(l)  # label saturated — stop tracking it
        if is_important:
            # d[code]: occurrences of `code` among classify_stupid's output
            # (presumably the labels of r's nearest neighbours — confirm
            # against classify_stupid).
            d = defaultdict(lambda: 0)
            for code in self.classify_stupid(r):
                d[code] += 1
            for code in self.labels:
                if code in labels_codes:
                    c[code][d[code]] += 1
                else:
                    c_prim[code][d[code]] += 1
    # Compute the final smoothed probability values.
    peh = {}
    for code in self.labels:
        peh[code] = {}
        for i in xrange(self.k + 1):
            peh[code][i] = {}
    for code in self.labels:
        sum_c = sum(c[code].itervalues())
        sum_c_prim = sum(c_prim[code].itervalues())
        for i in xrange(self.k + 1):
            # Laplace-style smoothing with self.smoothing_param over
            # the k+1 possible neighbour counts.
            peh[code][i][True] = (self.smoothing_param + c[code][i]) / (self.smoothing_param * (self.k + 1) + sum_c)
            peh[code][i][False] = (self.smoothing_param + c_prim[code][i]) / (self.smoothing_param * (self.k + 1) + sum_c_prim)
    return peh
def get_posterior_probabilities_quicktrain(self, min_label_occurence):
    '''
    Compute the posterior probabilities P(Ej | Hb).

    Training runs only on part of the data set: records are read until
    each label occurs at least min_label_occurence times.
    #todo: do shuffling when choosing elements.

    @param min_label_occurence: per-label occurrence threshold that ends
        the training pass early
    @return: peh[label][i][b] — smoothed probability that a record has i
        neighbours with `label`, given that it does (b=True) / does not
        (b=False) carry the label itself
    '''
    from collections import defaultdict
    # Preparation of the per-label counting tables.
    c = {}           # counts for records that DO carry the label
    c_prim = {}      # counts for records that do NOT carry the label
    labels_cnt = {}  # labels not yet seen min_label_occurence times
    for label in self.labels:
        c[label] = {}
        c_prim[label] = {}
        for i in xrange(self.k + 1):
            # number of elements of a given label which have i neighbours
            # of the same label
            c[label][i] = 0
            c_prim[label][i] = 0
        labels_cnt[label] = 0
    # Main pass over the records.
    elem_cnt = 0  # todel: debug-only progress counter
    for r in self.frecords():
        # All labels saturated -> training data is sufficient, stop.
        if len(labels_cnt) == 0:
            break
        labels_codes = mc2lmc_tomka_blad(r)
        elem_cnt += 1  # todel
        if elem_cnt % 100 == 1:  # todel: progress printout
            print elem_cnt  # todel
            print "labels_cnt:", labels_cnt
        # Does this record improve any still-unsaturated label? Update
        # the occurrence counters while checking.
        is_important = False
        for l in labels_codes:
            if l in labels_cnt:
                is_important = True
                labels_cnt[l] += 1
                if labels_cnt[l] >= min_label_occurence:
                    labels_cnt.pop(l)  # reached the threshold — drop it
        if is_important:
            # d[code]: how often `code` appears in classify_stupid(r)
            # (assumed to be neighbour labels — TODO confirm).
            d = defaultdict(lambda: 0)
            for code in self.classify_stupid(r):
                d[code] += 1
            for code in self.labels:
                if code in labels_codes:
                    c[code][d[code]] += 1
                else:
                    c_prim[code][d[code]] += 1
    # Turn the raw counts into smoothed probabilities.
    peh = {}
    for code in self.labels:
        peh[code] = {}
        for i in xrange(self.k + 1):
            peh[code][i] = {}
    for code in self.labels:
        sum_c = sum(c[code].itervalues())
        sum_c_prim = sum(c_prim[code].itervalues())
        for i in xrange(self.k + 1):
            # Smoothing with self.smoothing_param across k+1 bins.
            peh[code][i][True] = (self.smoothing_param + c[code][i]) / (self.smoothing_param * (self.k + 1) + sum_c)
            peh[code][i][False] = (self.smoothing_param + c_prim[code][i]) / (self.smoothing_param * (self.k + 1) + sum_c_prim)
    return peh