labels = jrs_io.load_labels(open(labels_path)) single_labels = list(set(reduce(lambda l1,l2: l1+l2, (ll for ll in labels)))) #training_labels = labels[:training_size] n = len(labels) print n," multi-labels sets loaded (",len(single_labels),"single labels:",single_labels,")..." print "Sample five labels:", labels[:5] print "------------------------------------------" print "Extracting label occurrence vectors" label2occurrences = dict( (label,label_ocur(labels, label)) for label in single_labels ) #print label2occurrences print "------------------------------------------" print "Loading features from file:", features_matrix_path f = open(features_matrix_path) features = jrs_io.load_data(f, cast_method = float, numrows = LOAD_MAX_ROWS) print "","loaded", len(features),"x",len(features[0]) print "------------------------------------------" print "Calculating and reporting to:", out_path fout = open(out_path, "w") for label in single_labels: print "","considering label:",label label_occurrences_vector = label2occurrences[label] indval_colix = [] for colix in xrange(len(features[0])): indval,precision,recall = INDICATOR(label_occurrences_vector, extract_col(features, colix)) indval_colix.append( (indval,colix,precision,recall) ) fout.write(str(label)+";\t")
print "Loading labels' file:", labels_path labels = jrs_io.load_labels(open(labels_path)) single_labels = list(set(reduce(lambda l1,l2: l1+l2, (ll for ll in labels)))) n = len(labels) print n," multi-labels sets loaded (",len(single_labels),"single labels:",single_labels,")..." print "------------------------------------------" print "Extracting label occurrence vectors" label2occurrences = dict( (label,label_ocur(labels, label)) for label in single_labels ) #print label2occurrences print "------------------------------------------" print "Loading features from file:", features_matrix_path f = open(features_matrix_path) features = jrs_io.load_data(f, cast_method = int, numrows = LOAD_MAX_ROWS) print "","loaded", len(features),"x",len(features[0]) print "------------------------------------------" print "Loading features-ind file",featuresind_path print "Loading negfeaturesind_path file",negfeaturesind_path f1 = open(featuresind_path) f2 = open(negfeaturesind_path) f1lines = f1.readlines() f2lines = f2.readlines() if len(f1lines)!=len(f2lines): print "ERROR. len IND != len NEG-IND" sys.exit(-1) for i, (line, line2) in enumerate(izip(f1lines, f2lines)): label = str(i+1)
print "Loading labels' file:", labels_path labels = jrs_io.load_labels(open(labels_path)) n = len(labels) print "",n," labels' sets loaded." order = range(n) random.shuffle(order) print "Random order:", order[:30],"..." print "Shuffling labels..." labels_shuffled = [labels[ix] for ix in order] jrs_io.store_labels(open(labels_path+"_shuffled","w"), labels_shuffled) print "Loading distances' file:", distance_matrix_path distances = jrs_io.load_data(open(distance_matrix_path), lambda x: x) try: print "",len(distances), "x",len(distances[0]) except: pass print "Extending order..." order = order + range(n, len(distances)) print "Extended order:", order print "Shuffling columns" distances_tmp = [] for row in distances: new_row = [ row[ix] for ix in order ] distances_tmp.append(new_row) print "Shuffling rows" distances_shuffled = []
print "Loading labels' file:", labels_path labels = jrs_io.load_labels(open(labels_path)) n = len(labels) print "", n, " labels' sets loaded." order = range(n) random.shuffle(order) print "Random order:", order[:30], "..." print "Shuffling labels..." labels_shuffled = [labels[ix] for ix in order] jrs_io.store_labels(open(labels_path + "_shuffled", "w"), labels_shuffled) print "Loading distances' file:", distance_matrix_path distances = jrs_io.load_data(open(distance_matrix_path), lambda x: x) try: print "", len(distances), "x", len(distances[0]) except: pass print "Extending order..." order = order + range(n, len(distances)) print "Extended order:", order print "Shuffling columns" distances_tmp = [] for row in distances: new_row = [row[ix] for ix in order] distances_tmp.append(new_row)
print " sample counts:", sorted(list(label2count.iteritems()))[:10] print " sample `friend`-labels:", sorted(list(label2size.iteritems()))[:10] print " sample pairlabel2count:", sorted(list( pairlabel2count.iteritems()))[:50] print "------------------------------------------" print "------------------------------------------" print "------------------------------------------" avg_label_count = float(sum(len(l) for l in labels)) / len(labels) print "Avg labels per object:", avg_label_count print "------------------------------------------" print "Loading distances' file:", distance_matrix_path distances = jrs_io.load_data(open(distance_matrix_path), cast_method, numrows=LOAD_ROWS_FROM_FILE) try: print "", len(distances), "x", len(distances[0]) except: pass #print "Sample distances:", distances[:5][:5] print "------------------------------------------" #KLASYFIKATOR Ensembled Strongest Fractional Knn print "Building Ensembled Strongest FractionKNN..." training_single_labels = list( set(reduce(lambda l1, l2: l1 + l2, (ll for ll in training_labels)))) print " training labels:", training_single_labels print " extracting submatrix..." training_distances = jrs_io.extract_submatrix(distances, training_range[0],
print "------------------------------------------" #print "Building tree using pairlabel2count & label2count" #outpath = "/tmp/pairlabel2dist_avg_tree" #pairwisecount_tree(label2count, pairlabel2count, outpath) #print "------------------------------------------" print "------------------------------------------" print "------------------------------------------" avg_label_count = float(sum(len(l) for l in labels)) / len(labels) print "Avg labels per object:",avg_label_count print "------------------------------------------" print "Loading distances' file:", distance_matrix_path distances = jrs_io.load_data(open(distance_matrix_path), cast_method, numrows = LOAD_ROWS_FROM_FILE) try: print "",len(distances), "x",len(distances[0]) except: pass #print "Sample distances:", distances[:5][:5] print "------------------------------------------" #print "Building tree on sample vs. sample distances avg" #outpath = "training_distances_avg_tree" #print " extracting submatrix..." #training_distances = jrs_io.extract_submatrix(distances, training_range[0], training_range[1], training_range[0], training_range[1]) #dist_tree(training_distances, training_labels, outpath) #dist_tree(distances, labels, outpath) #print "------------------------------------------" #print "------------------------------------------" #print "------------------------------------------"