def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    rootpath = options.rootpath
    modelName = options.model

    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    concepts = [concepts[i] for i in range(len(concepts)) if (i % options.numjobs + 1) == options.job]

    feat_file = None  # BigFile(os.path.join(rootpath, trainCollection, "FeatureData", feature))

    for concept in concepts:
        modelfile = os.path.join(rootpath, trainCollection, 'Models', modelAnnotationName, feature, modelName, '%s.model' % concept)
        model = load_model(modelfile)
        (A0, B0) = model.get_probAB()
        if abs(A0) > 1e-8 and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (A0, B0))
            continue

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))

        # lazy-load the feature file only when at least one concept needs (re-)estimation
        if not feat_file:
            feat_file = BigFile(os.path.join(rootpath, trainCollection, "FeatureData", feature))

        results = classify_large_data(model, names, feat_file, prob_output=False)
        labels = [name2label[x[0]] for x in results]
        dec_values = [x[1] for x in results]
        printStatus(INFO, "%s +%d -%d" % (concept, len([x for x in labels if x == 1]), len([x for x in labels if x == -1])))

        [A, B] = sigmoid_train(dec_values, labels)
        model.set_probAB(A, B)
        save_model(modelfile, model)
        (A1, B1) = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (A0, A1, B0, B1))
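# A minimal sketch (not part of the original script) of how the fitted (A, B) pair is
# typically applied at prediction time, assuming the standard Platt-scaling convention
# used by LIBSVM's sigmoid_train: a raw decision value is mapped to a probability via
# 1 / (1 + exp(A*dec + B)). The helper name below is hypothetical.
import math

def sigmoid_predict_sketch(dec_value, A, B):
    """Hypothetical helper: convert an SVM decision value to a probability."""
    fApB = dec_value * A + B
    # guard against overflow, as LIBSVM does
    if fApB >= 0:
        return math.exp(-fApB) / (1.0 + math.exp(-fApB))
    return 1.0 / (1.0 + math.exp(fApB))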
def process(options, collection, feature):
    rootpath = options.rootpath
    tpp = options.tpp
    k = 1000  # options.k
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite

    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)

    hitlists = buildHitLists(collection, tpp, rootpath)
    printStatus(INFO, 'nr of tags: %d' % len(hitlists))

    vob = sorted(hitlists.keys())
    vob = [vob[i] for i in range(len(vob)) if i % numjobs == job - 1]
    printStatus(INFO, 'working on %d-%d: %d tags' % (numjobs, job, len(vob)))

    for tag_idx, tag in enumerate(vob):
        resultdir = os.path.join(rootpath, collection, 'FeatureIndex', feature, tag[:2], tag)
        binfile = os.path.join(resultdir, 'feature.bin')
        if checkToSkip(binfile, overwrite):
            continue

        hitlist = hitlists[tag]
        hitlist = hitlist[:k]  # keep at most 1000 images per tag
        renamed, vecs = feat_file.read(hitlist)

        makedirsforfile(binfile)
        np.array(vecs).astype(np.float32).tofile(binfile)

        idfile = os.path.join(resultdir, 'id.txt')
        fw = open(idfile, 'w')
        fw.write(' '.join(renamed))
        fw.close()

        shapefile = os.path.join(resultdir, 'shape.txt')
        fw = open(shapefile, 'w')
        fw.write('%d %d' % (len(renamed), len(vecs[0])))
        fw.close()

        if tag_idx % 1e3 == 0:
            printStatus(INFO, '%d - %s, %d images' % (tag_idx, tag, len(hitlist)))
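# A minimal sketch (not part of the original code) of how the per-tag index written
# above could be read back, assuming exactly the layout produced by this function:
# feature.bin holds float32 vectors in row-major order, id.txt holds space-separated
# image ids, and shape.txt holds "<nr_of_images> <feat_dim>". The helper is hypothetical.
import os
import numpy as np

def load_tag_index_sketch(resultdir):
    """Hypothetical helper that returns (ids, matrix) for one tag directory."""
    with open(os.path.join(resultdir, 'shape.txt')) as f:
        nr_of_images, feat_dim = map(int, f.readline().split())
    with open(os.path.join(resultdir, 'id.txt')) as f:
        ids = f.readline().strip().split()
    vecs = np.fromfile(os.path.join(resultdir, 'feature.bin'), dtype=np.float32)
    return ids, vecs.reshape((nr_of_images, feat_dim))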
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    blocksize = options.blocksize
    donefile = options.donefile
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite
    taggerType = options.tagger
    noise = options.noise
    testset = options.testset
    if not testset:
        testset = testCollection

    modelName = taggerType
    if 'pretagvote' == taggerType and noise > 1e-3:
        modelName += '-noise%.2f' % noise
    if 'pqtagvote' == taggerType:
        nnName = "l2knn"
    else:
        nnName = distance + "knn"

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testset, trainCollection, annotationName,
                              modelName, '%s,%s,%d' % (feature, nnName, k), 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += ".%d.%d" % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    if donefile:
        doneset = set([x.split()[0] for x in open(donefile) if x.strip()])
    else:
        doneset = set()
    printStatus(INFO, "%d images have been done already, and they will be ignored" % len(doneset))

    workingSet = readImageSet(testCollection, testset, rootpath)
    workingSet = [x for x in workingSet if x not in doneset]
    workingSet = [workingSet[i] for i in range(len(workingSet)) if (i % numjobs + 1) == job]

    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
    test_feat_file = BigFile(test_feat_dir)

    tagger = NAME_TO_TAGGER[taggerType](trainCollection, annotationName, feature, distance, rootpath=rootpath)
    tagger.k = k
    tagger.noise = noise

    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, len(workingSet), resultfile))

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0.0
    test_time = 0.0
    start = 0
    done = 0

    while start < len(workingSet):
        end = min(len(workingSet), start + blocksize)
        printStatus(INFO, 'tagging images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, vectors = test_feat_file.read(workingSet[start:end])
        nr_images = len(renamed)
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * nr_images
        for i in range(nr_images):
            tagvotes = tagger.predict(content=vectors[i], context='%s,%s' % (testCollection, renamed[i]))
            output[i] = '%s %s\n' % (renamed[i], " ".join(["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        done += len(output)

    fw.close()
    printStatus(INFO, '%d images tagged, read time %g seconds, test time %g seconds' % (done, read_time, test_time))
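# A minimal sketch (not part of the original script) of how one line of the
# id.tagvotes.txt file produced above could be parsed back, assuming the format
# written by this function: "<image_id> <tag1> <score1> <tag2> <score2> ...".
# The helper name is hypothetical.
def parse_tagvotes_line_sketch(line):
    """Hypothetical helper: return (image_id, [(tag, score), ...])."""
    elems = line.strip().split()
    image_id = elems[0]
    tagvotes = [(elems[i], float(elems[i + 1])) for i in range(1, len(elems), 2)]
    return image_id, tagvotes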
def process(options, trainCollection, annotationfile, feature, modelName):
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  # options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    trainAnnotationNames = [x.strip() for x in open(annotationfile).readlines() if x.strip() and not x.strip().startswith('#')]
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print '%s does not exist' % conceptfile
            return 0

    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t in range(1, len(trainAnnotationNames) + 1):
            names, labels = readAnnotationsFrom(trainCollection, trainAnnotationNames[t - 1], concept, skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]
            # positive / negative counts used for automatic class weighting
            np = len([1 for lab in labels if 1 == lab])
            nn = len([1 for lab in labels if -1 == lab])
            wp = float(beta) * (np + nn) / np
            wn = (1.0 - beta) * (np + nn) / nn

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')

            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    outputName = '%s,%s' % (feature, modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job]
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, nr_of_test_images, resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName, '%s.model' % concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0
        #(pA,pB) = model.get_probAB()

    feat_file = BigFile(os.path.join(rootpath, testCollection, "FeatureData", feature))

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0
    test_time = 0
    start = 0
    done = 0

    while start < nr_of_test_images:
        end = min(nr_of_test_images, start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, test_X = feat_file.read(test_imset[start:end])
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            if prob_output:
                scores = [models[c].predict_probability(test_X[i]) for c in range(nr_of_concepts)]
            else:
                scores = [models[c].predict(test_X[i]) for c in range(nr_of_concepts)]
                #dec_value = sigmoid_predict(dec_value, A=pA, B=pB)
            tagvotes = sorted(zip(concepts, scores), key=lambda v: v[1], reverse=True)
            output[i] = '%s %s\n' % (renamed[i], " ".join(["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        fw.flush()
        done += len(output)

    # done
    printStatus(INFO, "%d done. read time %g seconds, test_time %g seconds" % (done, read_time, test_time))
    fw.close()
    return done
from basic.constant import ROOT_PATH
rootpath = ROOT_PATH

conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', trainAnnotationName)
from basic.annotationtable import writeConceptsTo
writeConceptsTo(test_tags, trainCollection, trainAnnotationName)

cmd = '%s/util/imagesearch/obtain_labeled_examples.py %s %s' % (parent_dir, trainCollection, conceptfile)
os.system('python ' + cmd)

train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
from util.simpleknn.bigfile import BigFile
train_feat_file = BigFile(train_feat_dir)
feat_dim = train_feat_file.ndims

from basic.util import readImageSet
test_imset = readImageSet(testCollection)
test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
test_feat_file = BigFile(test_feat_dir)
#test_renamed, test_vectors = test_feat_file.read(test_imset)

from model_based.dataengine.positiveengine import PositiveEngine
from model_based.dataengine.negativeengine import NegativeEngine

pe = PositiveEngine(trainCollection)
ne = NegativeEngine(trainCollection)

for tag in test_tags:
def process(options, trainCollection, baseAnnotationName, startAnnotationName, feature, modelName):
    global train_model, compress_model, save_model

    assert(modelName in ['fik', 'fastlinear'])
    if 'fik' == modelName:
        from model_based.svms.fiksvm.svmutil import svm_train as train_model
        from model_based.svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from model_based.svms.fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from model_based.svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from model_based.svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from model_based.svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    rootpath = options.rootpath
    overwrite = options.overwrite

    params = {'rootpath': rootpath, 'trainCollection': trainCollection, 'baseAnnotationName': baseAnnotationName,
              'startAnnotationName': startAnnotationName, 'feature': feature, 'model': modelName,
              'strategy': options.strategy, 'iterations': options.iterations, 'npr': options.npr,
              'nr_bins': options.nr_bins}

    concepts = readConcepts(trainCollection, startAnnotationName, rootpath)
    newAnnotationName = get_new_annotation_name(params)
    newModelName = get_model_name(params)
    modeldir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, newModelName)
    todo = [concept for concept in concepts if overwrite or os.path.exists(os.path.join(modeldir, '%s.txt' % concept)) is False]
    activeConcepts = [todo[i] for i in range(len(todo)) if (i % options.numjobs + 1) == options.job]

    params['feat_file'] = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    if 'fik' == modelName:
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))

    s_time = time.time()

    for concept in activeConcepts:
        printStatus(INFO, 'processing %s' % concept)
        modelfile = os.path.join(modeldir, '%s.model' % concept)
        if checkToSkip(modelfile, overwrite):
            continue
        new_model = NegativeBootstrap.learn(concept, params)
        makedirsforfile(modelfile)
        printStatus(INFO, 'save model to %s' % modelfile)
        save_model(modelfile, new_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(activeConcepts), ' '.join(activeConcepts)))
    printStatus(INFO, 'models stored at %s' % modeldir)
    printStatus(INFO, '%g seconds in total' % timecost)
testCollection = 'mirflickr08'

from basic.constant import ROOT_PATH
rootpath = ROOT_PATH

conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', trainAnnotationName)
from basic.annotationtable import writeConceptsTo
writeConceptsTo(test_tags, trainCollection, trainAnnotationName)

cmd = '%s/util/imagesearch/obtain_labeled_examples.py %s %s' % (parent_dir, trainCollection, conceptfile)
os.system('python ' + cmd)

train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
from util.simpleknn.bigfile import BigFile
train_feat_file = BigFile(train_feat_dir)
feat_dim = train_feat_file.ndims

from basic.util import readImageSet
test_imset = readImageSet(testCollection)
test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
test_feat_file = BigFile(test_feat_dir)
#test_renamed, test_vectors = test_feat_file.read(test_imset)

from model_based.dataengine.positiveengine import PositiveEngine
from model_based.dataengine.negativeengine import NegativeEngine

pe = PositiveEngine(trainCollection)
ne = NegativeEngine(trainCollection)
def process(options, testCollection, trainCollection, feature):
    rootpath = options.rootpath
    overwrite = options.overwrite
    tpp = options.tpp
    doRandomwalk = 1  # options.doRandomwalk
    uniqueUser = 0  # options.uniqueUser
    k = 1000  # options.k
    numjobs = options.numjobs
    job = options.job

    #resultfile = os.path.join(rootpath, testCollection, "tagrel", testCollection, trainCollection,
    #                          "%s,tagrank%d%d,%d,%s" % (feature, doRandomwalk, uniqueUser, k, tpp), "id.tagvotes.txt")
    resultfile = os.path.join(rootpath, testCollection, "tagrel", testCollection, trainCollection,
                              '%s,tagrank,%s' % (feature, tpp), 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile = resultfile + '.%d.%d' % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    try:
        doneset = set([str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()
    printStatus(INFO, "done set: %d" % len(doneset))

    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [testImageSet[i] for i in range(len(testImageSet)) if (i % numjobs + 1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs, job, len(testImageSet), resultfile))

    testreader = TagReader(testCollection, rootpath=rootpath)
    test_feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
    block_size = 100

    tagranking = TagRanking(trainCollection, feature=feature, k=k, rootpath=rootpath)

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0
    nr_of_blocks = len(testImageSet) / block_size
    if nr_of_blocks * block_size < len(testImageSet):
        nr_of_blocks += 1

    for block_index in range(nr_of_blocks):
        start = block_index * block_size
        end = min(len(testImageSet), start + block_size)
        subset = testImageSet[start:end]
        if not subset:
            break

        renamed, features = test_feat_file.read(subset)
        printStatus(INFO, '%d - %d: %d images' % (start, end, len(subset)))

        output = []
        for i in range(len(renamed)):
            qry_id = renamed[i]
            qry_tags = testreader.get(qry_id)
            qry_vec = features[i]
            tagvotes = tagranking.estimate(qry_vec, qry_tags)  #, uniqueUser=uniqueUser, doRandomwalk=doRandomwalk)
            newline = "%s %s" % (qry_id, " ".join(["%s %g" % (x[0], x[1]) for x in tagvotes]))
            output.append(newline + "\n")
            done += 1
            #printStatus(INFO, '%d %s %s' % (done, qry_id, ' '.join(['%s:%g' % (x[0], x[1]) for x in tagvotes[:3]])))

        fw.write("".join(output))
        fw.flush()

    fw.close()
    printStatus(INFO, 'done')
                    'tags': ''} for x in selected]
        resp['content'] = content[:max_hits]
        return render.index(resp)


class ImageSearch:
    def POST(self):
        input = web.input()
        raise web.seeother('/?query=%s' % input.query)


if __name__ == "__main__":
    app = web.application(urls, globals())

    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
    web.test_feat_file = BigFile(test_feat_dir)

    from knntagger import TagVoteTagger as ImageTagger
    web.tagger = ImageTagger(trainCollection, rootpath)

    train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    web.searcher = load_model(train_feat_dir)
    web.searcher.set_distance(distance)

    web.imset = readImageSet(testCollection, testCollection, rootpath)

    app.run()
def process(options, trainCollection, trainAnnotationName, feature):
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    best_param_dir = options.best_param_dir
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    beta = 0.5

    modelName = 'fastlinear'
    if best_param_dir:
        modelName += '-tuned'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    if not todo:
        return 0

    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))

    feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        # no bias term added by setting "-B -1"
        svm_params = '-w1 %g -w-1 %g -s 2 -B -1 -q' % (wp * C, wn * C)
        model = liblinear_train(y, vectors, svm_params)
        newmodel = liblinear_to_fastlinear([model], [1.0], feat_file.ndims)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fastlinear_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        fastlinear_load_model(model_file_name)
        assert(abs(newmodel.get_probAB()[0] - A) < 1e-6)
        assert(abs(newmodel.get_probAB()[1] - B) < 1e-6)

    return len(todo)
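# A minimal sketch (not part of the original script) of the first line that the
# tuned-parameter files under best_param_dir are assumed to contain, inferred from
# the regular expression above; the concrete numbers below are made up for illustration.
import re

p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')
example_line = 'best_C=2, a=-1.57, b=0.08'  # hypothetical content of <best_param_dir>/<concept>.txt
m = p.search(example_line)
assert m and float(m.group('C')) == 2.0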
def process(options, trainCollection, testCollection, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    blocksize = options.blocksize
    uniqueUser = options.uu
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite
    testset = options.testset
    if not testset:
        testset = testCollection

    searchMethod = distance + 'knn'
    if uniqueUser:
        searchMethod += ",uu"

    tagfile = os.path.join(rootpath, trainCollection, 'TextData', 'id.userid.lemmtags.txt')
    im2user = {}
    for line in open(tagfile):
        im, userid, tags = line.split('\t')
        im2user[im] = userid

    resultdir = os.path.join(rootpath, testCollection, "SimilarityIndex", testset, trainCollection,
                             "%s,%s,%d" % (feature, searchMethod, k))

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')
    nr_of_images, feat_dim = map(int, open(shape_file).readline().split())
    nr_of_images = len(open(id_file).readline().strip().split())

    searcher = imagesearch.load_model(os.path.join(feat_dir, 'feature.bin'), feat_dim, nr_of_images, id_file)
    searcher.set_distance(distance)

    workingSet = readImageSet(testCollection, testset, rootpath=rootpath)
    workingSet = [workingSet[i] for i in range(len(workingSet)) if (i % numjobs + 1) == job]
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, len(workingSet), resultdir))

    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
    test_feat_file = BigFile(test_feat_dir)

    read_time = 0
    knn_time = 0
    start = 0
    done = 0
    filtered = 0

    while start < len(workingSet):
        end = min(len(workingSet), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, vectors = test_feat_file.read(workingSet[start:end])
        read_time += time.time() - s_time
        nr_images = len(renamed)

        s_time = time.time()
        for i in range(nr_images):
            resultfile = os.path.join(resultdir, renamed[i][-2:], '%s.txt' % renamed[i])
            if checkToSkip(resultfile, overwrite):
                continue
            knn = searcher.search_knn(vectors[i], max_hits=max(3000, k * 3))
            if uniqueUser:
                removed, newknn = unique_user_constraint(knn, im2user, k)
                filtered += removed
                knn = newknn
            else:
                knn = knn[:k]
            assert(len(knn) >= k)
            writeRankingResults(knn, resultfile)
            done += 1

        printStatus(INFO, 'job %d-%d: %d done, filtered neighbors %d' % (numjobs, job, done, filtered))
        start = end

    printStatus(INFO, 'job %d-%d: %d done, filtered neighbors %d' % (numjobs, job, done, filtered))
    sys.exit(0)

nr_of_images_list = []
feat_dim_list = []
feat_files = []

for feature in srcfeatures:
    shapefile = os.path.join(rootpath, collection, 'FeatureData', feature, 'shape.txt')
    nr_of_images, feat_dim = map(int, open(shapefile).readline().strip().split())
    nr_of_images_list.append(nr_of_images)
    feat_dim_list.append(feat_dim)
    feat_files.append(BigFile(os.path.join(rootpath, collection, 'FeatureData', feature)))

#assert(nr_of_images_list[0] == nr_of_images_list[1])
new_feat_dim = sum(feat_dim_list)

imset = readImageSet(collection, collection, rootpath)
nr_of_images = len(imset)
blocksize = 1000

makedirsforfile(binary_file)
fw = open(binary_file, 'wb')
new_imset = []
start = 0

while start < nr_of_images:
    end = min(nr_of_images, start + blocksize)