def find_best_beta(labels, features, cv, scorer, min_vals, max_vals): dim = len(features[0]) beta_set = [x/10.0 for x in range(1,10)] perfs = [[] for i in range(len(beta_set))] n = len(labels) positive_index = [i for i in range(n) if 1 == labels[i]] negative_index = [i for i in range(n) if -1 == labels[i]] num_positive = len(positive_index) num_negative = len(negative_index) if num_positive < cv: message = "[find_best_beta] %d positive examples, insufficient for %d-fold cross-validation" % (len(positive_index), cv) raise Exception(message) for folder in range(cv): print ("[find_best_beta] %d <- %s" % (folder, "-".join(map(str, [i for i in range(cv) if i!=folder])))) labels_val = [1 for i in range(num_positive) if i%cv == folder] + [-1 for i in range(num_negative) if i%cv == folder] features_val = ([features[positive_index[i]] for i in range(num_positive) if i%cv == folder] + [features[negative_index[i]] for i in range(num_negative) if i%cv == folder]) labels_train = [1 for i in range(num_positive) if i%cv != folder] + [-1 for i in range(num_negative) if i%cv != folder] features_train = ([features[positive_index[i]] for i in range(num_positive) if i%cv != folder] + [features[negative_index[i]] for i in range(num_negative) if i%cv != folder]) assert(len(labels_val) == len(features_val)) assert(len(labels_train) == len(features_train)) assert((len(labels_val)+len(labels_train)) == n) for index,beta in enumerate(beta_set): model = hiksvm_train(labels_train, features_train, beta=beta) #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50) fikmodel = svm_to_fiksvm([model], 1, [1.0], feat_dim=dim, min_vals=min_vals, max_vals=max_vals, num_bins=50) results = [(labels_val[i], fikmodel.predict(features_val[i])) for i in range(len(labels_val))] results.sort(key=lambda v:(v[1]), reverse=True) sorted_labels = [x[0] for x in results] perf = scorer.score(sorted_labels) print "[find_best_beta] folder %d, beta %g -> %s=%g" % (folder, beta, scorer.name(), perf) 
perfs[index].append(perf) ranklist = [(beta_set[index], np.mean(perfs[index])) for index in range(len(beta_set))] ranklist.sort(key=lambda v:(v[1]), reverse=True) print "[find_best_beta] done", ranklist best_beta = ranklist[0][0] cv_score = ranklist[0][1] return best_beta, cv_score
def process(options, trainCollection, trainAnnotationName, feature):
    """Train one class-weighted HIK SVM per concept and save it as a fiksvm model.

    For every concept of `trainAnnotationName` whose model file is missing (or
    must be overwritten), honoring the numjobs/job partitioning: train a
    weighted SVM, convert it to the quantized fiksvm form, attach the Platt
    parameters (A, B), and write it under Models/.

    Returns the number of concepts handled by this job.
    """
    import re
    # matches tuning output such as "best_C=1.0, a=-1.23, b=0.45"
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5  # positive-class share of the total class weight

    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'
    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)

    # keep only concepts that still need a model, then take this job's share
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}
    # per-dimension feature ranges used to quantize the HIK kernel
    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = map(float, str.split(f.readline()))
        params['max_vals'] = map(float, str.split(f.readline()))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            # 'with' so the handle is closed instead of leaked
            with open(param_file) as fin:
                m = p.search(fin.readline().strip())
            if m is None:
                raise ValueError('cannot parse best parameters from %s' % param_file)
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')
        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName,
                                            concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        # Count classes from y (the examples that actually have features), not
        # from the raw annotation labels, so the weights match what svm_train
        # sees.  num_pos/num_neg also avoid shadowing the numpy alias 'np'.
        num_pos = len([1 for lab in y if 1 == lab])
        num_neg = len([1 for lab in y if -1 == lab])
        if num_pos == 0 or num_neg == 0:
            raise ValueError('%s: need both positive and negative examples (%d pos, %d neg)'
                             % (concept, num_pos, num_neg))
        wp = float(beta) * (num_pos + num_neg) / num_pos
        wn = (1.0 - beta) * (num_pos + num_neg) / num_neg
        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
        model = svm_train(y, vectors, svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the saved file and verify the Platt parameters survived the
        # round trip (asserting on the in-memory model, as before, proved
        # nothing about the file on disk)
        reloaded = fiksvm_load_model(model_file_name)
        assert (abs(reloaded.get_probAB()[0] - A) < 1e-6)
        assert (abs(reloaded.get_probAB()[1] - B) < 1e-6)
    return len(todo)
# Per-dimension feature ranges; used below to quantize the HIK kernel.
# NOTE(review): relies on Python 2 map() returning lists — confirm before porting.
with open(minmax_file, 'r') as f:
    min_vals = map(float, str.split(f.readline()))
    max_vals = map(float, str.split(f.readline()))

# Training annotations: map image names to labels and align them with the
# feature vectors actually present in the training feature file.
[names,labels] = readAnnotationsFrom(collection=trainCollection, annotationName=trainAnnotationName, concept=targetConcept, rootpath=rootpath)
name2label = dict(zip(names,labels))
(renamed, vectors) = train_feat_file.read(names)
relabeled = [name2label[x] for x in renamed]  # label is either 1 or -1

# Test annotations: only the name -> label map is needed for scoring.
[names,labels] = readAnnotationsFrom(collection=testCollection, annotationName=testAnnotationName, concept=targetConcept, rootpath=rootpath)
test2label = dict(zip(names,labels))

for beta in [0.5]: #[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    #model = hiksvm_train(relabeled, vectors, beta=beta)
    # cross-validate beta on the training data; returns the winning model too
    cv = 3
    best_beta, cv_score, model = hiksvm_train_cv(relabeled, vectors, cv, scorer, min_vals, max_vals)
    print best_beta, cv_score
    #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50)
    # convert the exact HIK SVM into the fast 50-bin fiksvm form
    fikmodel = svm_to_fiksvm([model], 1, [1.0], feat_dim=feat_dim, min_vals=min_vals, max_vals=max_vals, num_bins=50)
    results = classifyLargeData(fikmodel, testImageSet, test_feat_file, prob_output=True)
    print results[:5]
    # results look like (name, score) pairs, presumably ranked by score —
    # TODO(review): confirm classifyLargeData's ordering contract
    sorted_labels = [test2label[x] for x,y in results]
    score = scorer.score(sorted_labels)
    print "beta", beta, "AP", score
def process(options, trainCollection, trainAnnotationName, feature):
    """Train one class-weighted HIK SVM per concept and save it as a fiksvm model.

    Skips concepts whose model file already exists (unless --overwrite) and
    processes only this job's 1-in-numjobs share.  Each model is converted to
    the quantized fiksvm form, given its Platt parameters (A, B), and written
    under Models/.  Returns the number of concepts handled.
    """
    import re
    # matches tuning output such as "best_C=1.0, a=-1.23, b=0.45"
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5  # positive-class share of the total class weight

    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'
    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)

    # keep only concepts that still need a model, then take this job's share
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}
    # per-dimension feature ranges used to quantize the HIK kernel
    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = map(float, str.split(f.readline()))
        params['max_vals'] = map(float, str.split(f.readline()))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            # 'with' so the handle is closed instead of leaked
            with open(param_file) as fin:
                m = p.search(fin.readline().strip())
            if m is None:
                raise ValueError('cannot parse best parameters from %s' % param_file)
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')
        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName,
                                            concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        # Count classes from y (the examples that actually have features), not
        # from the raw annotation labels, so the weights match what svm_train
        # sees.  num_pos/num_neg also avoid shadowing the numpy alias 'np'.
        num_pos = len([1 for lab in y if 1 == lab])
        num_neg = len([1 for lab in y if -1 == lab])
        if num_pos == 0 or num_neg == 0:
            raise ValueError('%s: need both positive and negative examples (%d pos, %d neg)'
                             % (concept, num_pos, num_neg))
        wp = float(beta) * (num_pos + num_neg) / num_pos
        wn = (1.0 - beta) * (num_pos + num_neg) / num_neg
        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
        model = svm_train(y, vectors, svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the saved file and verify the Platt parameters survived the
        # round trip (asserting on the in-memory model, as before, proved
        # nothing about the file on disk)
        reloaded = fiksvm_load_model(model_file_name)
        assert (abs(reloaded.get_probAB()[0] - A) < 1e-6)
        assert (abs(reloaded.get_probAB()[1] - B) < 1e-6)
    return len(todo)
def find_best_beta(labels, features, cv, scorer, min_vals, max_vals): dim = len(features[0]) beta_set = [x / 10.0 for x in range(1, 10)] perfs = [[] for i in range(len(beta_set))] n = len(labels) positive_index = [i for i in range(n) if 1 == labels[i]] negative_index = [i for i in range(n) if -1 == labels[i]] num_positive = len(positive_index) num_negative = len(negative_index) if num_positive < cv: message = "[find_best_beta] %d positive examples, insufficient for %d-fold cross-validation" % ( len(positive_index), cv) raise Exception(message) for folder in range(cv): print( "[find_best_beta] %d <- %s" % (folder, "-".join(map(str, [i for i in range(cv) if i != folder])))) labels_val = [1 for i in range(num_positive) if i % cv == folder] + [ -1 for i in range(num_negative) if i % cv == folder ] features_val = ([ features[positive_index[i]] for i in range(num_positive) if i % cv == folder ] + [ features[negative_index[i]] for i in range(num_negative) if i % cv == folder ]) labels_train = [1 for i in range(num_positive) if i % cv != folder] + [ -1 for i in range(num_negative) if i % cv != folder ] features_train = ([ features[positive_index[i]] for i in range(num_positive) if i % cv != folder ] + [ features[negative_index[i]] for i in range(num_negative) if i % cv != folder ]) assert (len(labels_val) == len(features_val)) assert (len(labels_train) == len(features_train)) assert ((len(labels_val) + len(labels_train)) == n) for index, beta in enumerate(beta_set): model = hiksvm_train(labels_train, features_train, beta=beta) #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50) fikmodel = svm_to_fiksvm([model], 1, [1.0], feat_dim=dim, min_vals=min_vals, max_vals=max_vals, num_bins=50) results = [(labels_val[i], fikmodel.predict(features_val[i])) for i in range(len(labels_val))] results.sort(key=lambda v: (v[1]), reverse=True) sorted_labels = [x[0] for x in results] perf = scorer.score(sorted_labels) print "[find_best_beta] folder %d, beta %g -> %s=%g" % ( folder, beta, 
scorer.name(), perf) perfs[index].append(perf) ranklist = [(beta_set[index], np.mean(perfs[index])) for index in range(len(beta_set))] ranklist.sort(key=lambda v: (v[1]), reverse=True) print "[find_best_beta] done", ranklist best_beta = ranklist[0][0] cv_score = ranklist[0][1] return best_beta, cv_score
# Test annotations: only the name -> label map is needed for scoring.
[names, labels] = readAnnotationsFrom(collection=testCollection, annotationName=testAnnotationName, concept=targetConcept, rootpath=rootpath)
test2label = dict(zip(names, labels))

for beta in [0.5]: #[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    #model = hiksvm_train(relabeled, vectors, beta=beta)
    # cross-validate beta on the training data; returns the winning model too
    cv = 3
    best_beta, cv_score, model = hiksvm_train_cv(relabeled, vectors, cv, scorer, min_vals, max_vals)
    print best_beta, cv_score
    #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50)
    # convert the exact HIK SVM into the fast 50-bin fiksvm form
    fikmodel = svm_to_fiksvm([model], 1, [1.0], feat_dim=feat_dim, min_vals=min_vals, max_vals=max_vals, num_bins=50)
    results = classifyLargeData(fikmodel, testImageSet, test_feat_file, prob_output=True)
    print results[:5]
    # results look like (name, score) pairs, presumably ranked by score —
    # TODO(review): confirm classifyLargeData's ordering contract
    sorted_labels = [test2label[x] for x, y in results]
    score = scorer.score(sorted_labels)
    print "beta", beta, "AP", score