def process(options, collection, annotationName, simdir, resultfile):
    rootpath = options.rootpath
    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)
    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print('%d instances, %d concepts to dump -> %s' % (image_num, concept_num, resultfile))

    scores = np.zeros((image_num, concept_num)) - 1e4

    for c_id, concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        ranklist = readRankingResults(simfile)
        for im, score in ranklist:
            idx = im2index[im]
            scores[idx, c_id] = score

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'concepts': concepts, 'id_images': map(int, id_images), 'scores': scores}, output, -1)
    output.close()

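# For reference, a minimal sketch of how the pickle written by process() above
# can be consumed downstream. The dict keys ('concepts', 'id_images', 'scores')
# match the dump; the file path and the top-3 choice are illustrative only.
import pickle
import numpy as np

with open('scores.pkl', 'rb') as f:   # any resultfile produced by process()
    data = pickle.load(f)

concepts = data['concepts']           # list of concept names
scores = data['scores']               # (image_num, concept_num); -1e4 marks "no score"
top3 = np.argsort(-scores[0])[:3]     # best three concepts for the first image
print([(concepts[c], scores[0, c]) for c in top3])
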
def __init__(self, collection, annotationName, feature, distance,
             tpp=DEFAULT_TPP, rootpath=ROOT_PATH, k=DEFAULT_K):
    self.rootpath = rootpath
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))
    self.imset = readImageSet(collection, collection, rootpath)
    self.nr_of_images = len(self.imset)
    self.knndir = os.path.join(collection, '%s,%sknn,1500' % (feature, distance))
    self.k = k
    self.noise = 0
    self._load_tag_data(collection, tpp, rootpath)
    printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting"
                % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))

def process(options, testCollection, trainCollection, tagsimMethod):
    rootpath = options.rootpath
    overwrite = options.overwrite
    testsetName = options.testset if options.testset else testCollection
    tpp = options.tpp
    numjobs = options.numjobs
    job = options.job
    useWnVob = 1

    outputName = tagsimMethod + '-wn' if useWnVob else tagsimMethod

    if tagsimMethod == 'wns':
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, outputName, 'id.tagvotes.txt')
    else:
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, trainCollection, outputName, 'id.tagvotes.txt')

    if numjobs > 1:
        resultfile = resultfile.replace("id.tagvotes.txt", "id.tagvotes.%d.%d.txt" % (numjobs, job))

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    makedirsforfile(resultfile)

    try:
        doneset = set([str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()
    printStatus(INFO, "done set: %d" % len(doneset))

    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [testImageSet[i] for i in range(len(testImageSet)) if (i % numjobs + 1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs, job, len(testImageSet), resultfile))

    testreader = TagReader(testCollection, rootpath=rootpath)

    if tagsimMethod == "wns":
        tagrel = SIM_TO_TAGREL["wns"](trainCollection, useWnVob, "wup", rootpath)
    else:
        tagrel = SIM_TO_TAGREL[tagsimMethod](trainCollection, useWnVob, rootpath)

    done = 0
    fw = open(resultfile, "w")

    for qry_id in testImageSet:
        qry_tags = testreader.get(qry_id)
        tagvotes = tagrel.estimate(qry_tags)
        newline = qry_id + " " + " ".join(["%s %s" % (tag, niceNumber(vote, 8)) for (tag, vote) in tagvotes])
        fw.write(newline + "\n")
        done += 1
        if done % 1000 == 0:
            printStatus(INFO, "%d done" % done)

    # done
    fw.close()
    printStatus(INFO, "%d done" % done)

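# Several scripts in this section emit the same id.tagvotes.txt format written
# above: one line per image, "<image_id> <tag1> <score1> <tag2> <score2> ...".
# A minimal sketch of a reader for that format (the filename is illustrative):
def read_tagvotes(filename):
    tagvotes = {}
    for line in open(filename):
        elems = line.strip().split()
        tagvotes[elems[0]] = [(elems[i], float(elems[i + 1]))
                              for i in range(1, len(elems), 2)]
    return tagvotes
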
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    assert(modelName.startswith('fastlinear'))
    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk

    outputName = '%s,%s' % (feature, modelName)
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection,
                              trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job]
    test_imset = set(test_imset)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, nr_of_test_images, resultfile))

    ma = ModelArray(trainCollection, trainAnnotationName, feature, modelName, rootpath=rootpath)

    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0
    feat_file.open()

    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue
        res = ma.predict([_vec], prob=0)
        tagvotes = res[0]
        if topk > 0:
            tagvotes = tagvotes[:topk]
        newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
        fw.write(newline)
        done += 1
        if done % 1e4 == 0:
            printStatus(INFO, "%d done" % done)

    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % done)
    return done

def process(options, testCollection):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)

    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, embedding_name, pY0, 'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name, rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)

    printStatus(INFO, 'tagging %d images' % len(imset))
    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    start = 0
    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end))
        todo = imset[start:end]
        if not todo:
            break

        renamed, vectors = feat_file.read(todo)
        output = []
        for _id, _vec in zip(renamed, vectors):
            im_vec = i2v.embedding(_vec)
            pred = tagger.predict(im_vec, topk=options.r)
            output.append('%s %s\n' % (_id, ' '.join(['%s %s' % (x[0], x[1]) for x in pred])))
        start = end
        fw.write(''.join(output))

    fw.close()

def process(options, collection):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, collection, "tagrel", collection, 'tagpos,%s' % tpp, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    imset = readImageSet(collection, collection, rootpath)
    printStatus(INFO, 'working on %d test images -> %s' % (len(imset), resultfile))

    reader = TagReader(collection, tpp=tpp, rootpath=rootpath)
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")
    output = []
    done = 0

    for im in imset:
        tags = reader.get(im)
        tagSet = set()
        tagSeq = []
        for tag in str.split(tags):
            if tag not in tagSet:
                tagSeq.append(tag)
                tagSet.add(tag)
        assert(len(tagSeq) == len(tagSet))

        nr_tags = len(tagSeq)
        tagvotes = [(tagSeq[i], 1.0 - float(i) / nr_tags) for i in range(nr_tags)]
        newline = "%s %s" % (im, " ".join(["%s %g" % (x[0], x[1]) for x in tagvotes]))
        output.append(newline + "\n")
        done += 1

        if len(output) % 1e4 == 0:
            printStatus(INFO, '%d %s %s' % (done, im, ' '.join(['%s:%g' % (x[0], x[1]) for x in tagvotes[:3]])))
            fw.write("".join(output))
            fw.flush()
            output = []

    if output:
        fw.write("".join(output))
    fw.close()
    printStatus(INFO, 'done')

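# Worked example of the tag-position prior computed above: with nr_tags = 4,
# tag i is scored 1.0 - i/nr_tags, so earlier tags are assumed more relevant.
tagSeq = ['dog', 'park', 'summer', 'friends']
tagvotes = [(tagSeq[i], 1.0 - float(i) / len(tagSeq)) for i in range(len(tagSeq))]
print(tagvotes)  # [('dog', 1.0), ('park', 0.75), ('summer', 0.5), ('friends', 0.25)]
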
def __init__(self, collection, annotationName, feature, distance,
             tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    self.rootpath = rootpath
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))
    self.imset = readImageSet(collection, collection, rootpath)
    self.nr_of_images = len(self.imset)
    self.knndir = os.path.join(collection, '%s,%sknn,uu,1500' % (feature, distance))
    self.k = DEFAULT_K
    self.noise = 0
    self._load_tag_data(collection, tpp, rootpath)
    printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting"
                % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))

def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite
    random = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)

    # rows follow the line order of id.userid.lemmtags.txt; id_images is
    # rebuilt below in that same order
    tagmatrix = np.zeros((len(id_images), len(concepts)))
    id_images = []
    tag2idx = dict(zip(concepts, xrange(len(concepts))))

    with open(os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            tags = tags.split()
            if len(tags) > 0:
                tags = [(tag2idx.get(x, -1), y) for x, y in zip(tags, xrange(len(tags)))]
                idx = np.array([x[0] for x in tags])
                vals = 1. / (1. + np.array([x[1] for x in tags]))  # score 1/(1+position)
                tagmatrix[cnt, idx] = vals
            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if random:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts': concepts, 'id_images': map(int, id_images), 'scores': tagmatrix},
                    f, pickle.HIGHEST_PROTOCOL)

def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts': concepts, 'id_images': map(int, id_images), 'scores': tagmatrix},
                    f, pickle.HIGHEST_PROTOCOL)

def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection,
                              '%s,%s,%f' % (feature, nnName, k_ratio), 'laplacianI.mat')
    if checkToSkip(resultfile, overwrite):
        return 0

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()
    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Allocating I,J,V arrays')
    I = np.zeros((K_neighs * tot_images * 2))
    J = np.zeros((K_neighs * tot_images * 2))
    V = np.zeros((K_neighs * tot_images * 2))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        try:
            neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]),
                                       rootpath, K_neighs * 2, feature, distance)
            # remove images with features but not in the working set
            NNrow = []
            NNDrow = []
            new_neighs = []
            for x in neighbors:
                try:
                    NNrow.append(bisect_index(workingSet, x[0]))
                    NNDrow.append(x[1])
                    new_neighs.append(x)
                except ValueError:
                    pass
            NNrow = np.array(NNrow)
            NNDrow = np.array(NNDrow)
            neighbors = new_neighs[0:K_neighs]
        except ValueError:
            printStatus(INFO, 'ERROR: id_img %s has non-standard format!' % (workingSet[i]))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (workingSet[i], len(neighbors), K_neighs))
            sys.exit(1)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))

        for k in xrange(K_neighs):
            if i != int(NNrow[k]):  # zero on the diagonal for a later step
                I[n_entries] = i
                J[n_entries] = int(NNrow[k])
                V[n_entries] = NNDrow[k]
                n_entries += 1
                I[n_entries] = int(NNrow[k])
                J[n_entries] = i
                V[n_entries] = NNDrow[k]
                n_entries += 1

    I = I[0:n_entries]
    J = J[0:n_entries]
    V = V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    ind = np.lexsort((V, J, I))
    I = I[ind]
    J = J[ind]
    V = V[ind]
    a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
    b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    del a
    _, idx = np.unique(b, return_index=True)
    del b
    I = I[idx]
    J = J[idx]
    V = V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    sigma = np.median(V) ** 2.
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    V = np.exp(-V / sigma)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    new_diag = matrix.sum(axis=0).T
    V = -V

    I_add = np.zeros((tot_images))
    J_add = np.zeros((tot_images))
    V_add = np.zeros((tot_images))
    for i, v in enumerate(new_diag):
        I_add[i] = i
        J_add[i] = i
        V_add[i] = v

    I = np.append(I, I_add)
    J = np.append(J, J_add)
    V = np.append(V, V_add)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()
    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity': matrix, 'sigma': sigma})

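# Tiny dense illustration of what the sparse assembly above computes:
# L = D - W, where W holds Gaussian-kernel weights exp(-d/sigma) over the
# symmetrized kNN graph and D is the diagonal of the kernel's column sums.
# The 3x3 distance matrix below is made up purely for illustration.
import numpy as np

D2 = np.array([[0., 1., 4.],
               [1., 0., 2.],
               [4., 2., 0.]])           # pairwise distances
sigma = np.median(D2[D2 > 0]) ** 2.     # same estimate as above: squared median distance
W = np.exp(-D2 / sigma) * (D2 > 0)      # kernel weights, zero diagonal
L = np.diag(W.sum(axis=0)) - W          # unnormalized graph Laplacian
print(L)
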
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    overwrite = options.overwrite
    testset = testCollection
    onlytest = options.onlytest

    nnName = distance + "knn"
    resultfile_train = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection,
                                    '%s,%s,%d' % (feature, nnName, k), 'nn_train.h5')
    resultfile_test = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection,
                                   annotationName, '%s,%s,%d' % (feature, nnName, k), 'nn_test.h5')

    if (not onlytest and checkToSkip(resultfile_train, overwrite)) or checkToSkip(resultfile_test, overwrite):
        return 0

    testSet = readImageSet(testCollection, testset, rootpath)
    trainSet = readImageSet(trainCollection, trainCollection, rootpath)
    testSet.sort()
    trainSet.sort()

    #train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    #train_feat_file = BigFile(train_feat_dir)

    tagger = NAME_TO_TAGGER["preknn"](trainCollection, annotationName, feature, distance,
                                      rootpath=rootpath, k=1001)
    printStatus(INFO, '%d test images, %d train images' % (len(testSet), len(trainSet)))

    # allocate train -> train nearest neighbors
    if not onlytest:
        printStatus(INFO, 'Allocating NN, NND matrices')
        NN = np.zeros((len(trainSet), k + 1), dtype=np.int32)
        NND = np.zeros((len(trainSet), k + 1))

        printStatus(INFO, 'Filling NN, NND matrices')
        for i, id_img in enumerate(trainSet):
            neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (trainCollection, id_img))
            if len(neighbors) < k + 1:
                printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k + 1))
                sys.exit(1)

            NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
            NNDrow = np.array([x[1] for x in neighbors])
            NN[i, :] = NNrow[0:k + 1]
            NND[i, :] = NNDrow[0:k + 1]

            if i % 1000 == 0:
                printStatus(INFO, '%d / %d images' % (i, len(trainSet)))

        printStatus(INFO, 'Saving train matrices to file %s' % (resultfile_train))
        makedirsforfile(resultfile_train)
        fout = h5py.File(resultfile_train, 'w')
        fout['NN'] = NN
        fout['NND'] = NND
        fout['trainSet'] = trainSet
        fout['concepts'] = tagger.concepts
        fout.close()
        del NN
        del NND

    # allocate test -> train nearest neighbors
    printStatus(INFO, 'Allocating NNT, NNDT matrices')
    NNT = np.zeros((len(testSet), k), dtype=np.int32)
    NNDT = np.zeros((len(testSet), k))

    printStatus(INFO, 'Filling NNT, NNDT matrices')
    for i, id_img in enumerate(testSet):
        neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (testCollection, id_img))
        if len(neighbors) < k:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k))
            sys.exit(1)

        NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
        NNDrow = np.array([x[1] for x in neighbors])
        NNT[i, :] = NNrow[0:k]
        NNDT[i, :] = NNDrow[0:k]

        if i % 1000 == 0:
            printStatus(INFO, '%d / %d images' % (i, len(testSet)))

    printStatus(INFO, 'Saving test matrices to file %s' % (resultfile_test))
    makedirsforfile(resultfile_test)
    fout = h5py.File(resultfile_test, 'w')
    fout['NNT'] = NNT
    fout['NNDT'] = NNDT
    fout['trainSet'] = trainSet
    fout['testSet'] = testSet
    fout['concepts'] = tagger.concepts
    fout.close()

def process(options, testCollection, trainCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    overwrite = options.overwrite
    testset = testCollection
    forcetrainmodel = options.trainmodel
    modelName = "tagprop"
    nnName = distance + "knn"

    printStatus(INFO, "Starting TagProp %s,%s,%s,%s,%s" % (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = os.path.join(outputpkl)
    resultfile_tagprop = os.path.join(rootpath, testCollection, 'TagProp-Prediction', testset, trainCollection,
                                      annotationName, modelName, '%s,%s,%s,%d' % (feature, nnName, variant, k),
                                      'prediction.mat')

    # if checkToSkip(resultfile, overwrite) or checkToSkip(resultfile_tagprop, overwrite):
    #     return 0

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData', 'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, "Tag matrix file not found at %s Did you run wordnet_frequency_tags.py?" % (tagmatrix_file))
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection,
                                     '%s,%s,%d' % (feature, nnName, k), 'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(INFO, "Matlab train neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (train_neighs_file))
        sys.exit(1)

    # do we need to perform learning?
    train_model_file = os.path.join(rootpath, trainCollection, 'TagProp-models',
                                    '%s,%s,%s,%d' % (feature, nnName, variant, k), 'model.mat')
    # if os.path.exists(train_model_file) and not forcetrainmodel:
    if False:
        printStatus(INFO, "model for %s available at %s" % (trainCollection, train_model_file))
    else:
        printStatus(INFO, "starting learning model for %s" % (trainCollection))
        makedirsforfile(train_model_file)

        script = """
        tagprop_path = '%s/model_based/tagprop/TagProp/';
        addpath(tagprop_path);
        tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
        tagmatrix = sparse(tagmatrix);
        NN = h5read('%s', '/NN');
        NN = NN(2:end, :);
        NN = double(NN);
        """ % (survey_code, tagmatrix_file, train_neighs_file)

        if variant == 'dist' or variant == 'distsigmoids':
            script += """
            NND = h5read('%s', '/NND');
            NND = NND(2:end, :);
            NND = reshape(NND, 1, size(NND,1), size(NND,2));
            NND = double(NND);
            """ % train_neighs_file

        if variant == 'rank':
            script += """
            m = tagprop_learn(NN,[],tagmatrix);
            """
        elif variant == 'ranksigmoids':
            script += """
            m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true);
            """
        elif variant == 'dist':
            script += """
            m = tagprop_learn(NN,NND,tagmatrix,'type','dist');
            """
        elif variant == 'distsigmoids':
            script += """
            m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true);
            """

        script += """
        save('%s', 'm', '-v7.3');
        """ % train_model_file

    # we perform prediction
    printStatus(INFO, "starting prediction")

    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection,
                                    annotationName, '%s,%s,%d' % (feature, nnName, k), 'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(INFO, "Matlab test neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (test_neighs_file))
        sys.exit(1)

    script += """
    tagprop_path = '%s/model_based/tagprop/TagProp/';
    addpath(tagprop_path);
    load('%s');
    tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
    tagmatrix = sparse(tagmatrix);
    NNT = h5read('%s', '/NNT');
    NNT = double(NNT);
    """ % (survey_code, train_model_file, tagmatrix_file, test_neighs_file)

    if variant == 'dist' or variant == 'distsigmoids':
        script += """
        NNDT = h5read('%s', '/NNDT');
        NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));
        NNDT = double(NNDT);
        """ % test_neighs_file

    script += """
    P = tagprop_predict(NNT,[],m)';
    save('%s', '-v7.3');
    exit;
    """ % resultfile_tagprop

    # learning and prediction run as a single MATLAB session
    makedirsforfile(resultfile_tagprop)
    call_matlab(script)

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()

    # concepts mapping
    tagprop_output = h5py.File(resultfile_tagprop, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)

    final_tagmatrix = tagprop_output['P'][:][:, mapping]
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts': concepts, 'id_images': id_images, 'scores': final_tagmatrix},
                    f, pickle.HIGHEST_PROTOCOL)

def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    #blocksize = options.blocksize
    topk = options.topk

    outputName = '%s,%s' % (feature, modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection,
                              trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job]
    test_imset = set(test_imset)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, nr_of_test_images, resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName,
                                       feature, modelName, '%s.model' % concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0
        #(pA,pB) = model.get_probAB()

    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0
    feat_file.open()

    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue
        if prob_output:
            scores = [models[c].predict_probability(_vec) for c in range(nr_of_concepts)]
        else:
            scores = [models[c].predict(_vec) for c in range(nr_of_concepts)]
        tagvotes = sorted(zip(concepts, scores), key=lambda v: v[1], reverse=True)
        if topk > 0:
            tagvotes = tagvotes[:topk]
        newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
        fw.write(newline)
        done += 1
        if done % 1e4 == 0:
            printStatus(INFO, "%d done" % done)

    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % done)
    return done

def process(options, workingCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    modelName = "robustpca"
    nnName = distance + "knn"

    printStatus(INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f"
                % (workingCollection, annotationName, feature, nnName, k_ratio, lambda1, lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = os.path.join(outputpkl)
    resultfile_robustpca = os.path.join(rootpath, workingCollection, 'RobustPCA-Prediction',
                                        '%s,%s,%f,%f,%f,%d' % (feature, nnName, lambda1, lambda2, k_ratio, rawtagmatrix),
                                        'prediction.mat')

    if checkToSkip(resultfile_robustpca, overwrite):
        only_dump = True
    else:
        only_dump = False

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'RobustPCA',
                                      '%s,%s,%f' % (feature, nnName, DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?" % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, 'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection,
                                   '%s,%s,%f' % (feature, nnName, k_ratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, "LaplacianI file not found at %s Did you run laplacian_images.py?" % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT', '%f' % (ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(INFO, "LaplacianT file not found at %s Did you run laplacian_tags.py?" % (laplacianT_file))
        sys.exit(1)

    # begin learning
    script = """
    rpca_path = 'transduction_based/robustpca/';
    addpath(rpca_path);
    addpath([rpca_path, 'fast_svd/']);
    tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
    load('%s');
    load('%s');
    lambda1 = %f;
    lambda2 = %f;
    maxIters = 50;
    precision = 1e-4;
    mu_start = 1.;
    parpool('local', 4);
    [P,E] = robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
    """ % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1, lambda2)

    script += """
    delete(gcp);
    save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
    exit;
    """ % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testset_id_images = readImageSet(workingCollection.split('+')[1], workingCollection.split('+')[1], rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # concepts mapping
    robustpca_output = h5py.File(resultfile_robustpca, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)
    predicted_tagmatrix = robustpca_output['P'][:, mapping]

    if outputonlytest:
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert(final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts': concepts, 'id_images': id_images, 'scores': final_tagmatrix},
                    f, pickle.HIGHEST_PROTOCOL)

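# The dump steps above align the full training vocabulary with the evaluation
# concepts via getVocabMap. A hypothetical sketch of such a mapping, assuming
# every concept occurs exactly once in the vocabulary (the real helper may
# differ in details):
def get_vocab_map(vocab, concepts):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    return [word2idx[c] for c in concepts]  # KeyError if a concept is missing
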
import sys, os, random

from basic.common import ROOT_PATH
from basic.util import readImageSet
from simpleknn.bigfile import BigFile

if __name__ == '__main__':
    rootpath = ROOT_PATH
    collection = sys.argv[1]
    feature = sys.argv[2]

    imset = readImageSet(collection, collection)
    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)

    imset = random.sample(imset, 50)
    #imset = imset[:5]
    renamed, vectors = feat_file.read(imset)

    for name, vec in zip(renamed, vectors):
        print name, sum(vec), sum(vec[:64]), sum(vec[64:])

def process(options, trainCollection, testCollection, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    blocksize = options.blocksize
    uniqueUser = options.uu
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite
    testset = options.testset
    if not testset:
        testset = testCollection

    searchMethod = distance + 'knn'
    if uniqueUser:
        searchMethod += ",uu"

    tagfile = os.path.join(rootpath, trainCollection, 'TextData', 'id.userid.lemmtags.txt')
    im2user = {}
    for line in open(tagfile):
        im, userid, tags = line.split('\t')
        im2user[im] = userid

    resultdir = os.path.join(rootpath, testCollection, "SimilarityIndex", testset, trainCollection,
                             "%s,%s,%d" % (feature, searchMethod, k))

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')
    nr_of_images, feat_dim = map(int, open(shape_file).readline().split())
    nr_of_images = len(open(id_file).readline().strip().split())

    searcher = imagesearch.load_model(os.path.join(feat_dir, 'feature.bin'), feat_dim, nr_of_images, id_file)
    searcher.set_distance(distance)

    workingSet = readImageSet(testCollection, testset, rootpath=rootpath)
    workingSet = [workingSet[i] for i in range(len(workingSet)) if (i % numjobs + 1) == job]
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, len(workingSet), resultdir))

    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
    test_feat_file = BigFile(test_feat_dir)

    read_time = 0
    knn_time = 0
    start = 0
    done = 0
    filtered = 0

    while start < len(workingSet):
        end = min(len(workingSet), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, vectors = test_feat_file.read(workingSet[start:end])
        read_time += time.time() - s_time
        nr_images = len(renamed)

        s_time = time.time()
        for i in range(nr_images):
            resultfile = os.path.join(resultdir, renamed[i][-2:], '%s.txt' % renamed[i])
            if checkToSkip(resultfile, overwrite):
                continue
            knn = searcher.search_knn(vectors[i], max_hits=max(3000, k * 3))
            if uniqueUser:
                removed, newknn = unique_user_constraint(knn, im2user, k)
                filtered += removed
                knn = newknn
            else:
                knn = knn[:k]
            assert(len(knn) >= k)
            writeRankingResults(knn, resultfile)
            done += 1
        printStatus(INFO, 'job %d-%d: %d done, filtered neighbors %d' % (numjobs, job, done, filtered))
        start = end

    printStatus(INFO, 'job %d-%d: %d done, filtered neighbors %d' % (numjobs, job, done, filtered))

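# Hypothetical sketch of the unique-user filter invoked above: keep at most
# one neighbor per uploader until k survive, and report how many were dropped.
# The actual unique_user_constraint may differ in its details.
def unique_user_constraint(knn, im2user, k):
    seen_users = set()
    kept = []
    removed = 0
    for im, dist in knn:
        user = im2user.get(im, im)  # fall back to the image id if the user is unknown
        if user in seen_users:
            removed += 1
            continue
        seen_users.add(user)
        kept.append((im, dist))
        if len(kept) == k:
            break
    return removed, kept
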
testCollection = 'voc2008val'
testAnnotationName = 'conceptsvoc2008val.txt'
feature = 'dsift'
modelName = 'fastlinear'
modelName = 'fik50'
metric = 'AP'
scorer = getScorer(metric)

if modelName.startswith('fik'):
    from fiksvm.fiksvm import fiksvm_load_model as load_model
else:
    from fastlinear.fastlinear import fastlinear_load_model as load_model

test_imset = readImageSet(testCollection, testCollection, rootpath=rootpath)
test_feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
test_renamed, test_vectors = test_feat_file.read(test_imset)

concepts = readConcepts(testCollection, testAnnotationName, rootpath=rootpath)

print('### %s' % os.path.join(trainCollection, 'Models', trainAnnotationName, feature, modelName))

results = []
for concept in concepts:
    model_file_name = os.path.join(rootpath, trainCollection, 'Models',

def process(options, testCollection, trainCollection, feature):
    rootpath = options.rootpath
    overwrite = options.overwrite
    tpp = options.tpp
    doRandomwalk = 1  #options.doRandomwalk
    uniqueUser = 0  #options.uniqueUser
    k = 1000  #options.k
    numjobs = options.numjobs
    job = options.job

    #resultfile = os.path.join(rootpath, testCollection, "tagrel", testCollection, trainCollection,
    #                          "%s,tagrank%d%d,%d,%s" % (feature,doRandomwalk,uniqueUser,k,tpp), "id.tagvotes.txt")
    resultfile = os.path.join(rootpath, testCollection, "tagrel", testCollection, trainCollection,
                              '%s,tagrank,%s' % (feature, tpp), 'id.tagvotes.txt')

    if numjobs > 1:
        resultfile = resultfile + '.%d.%d' % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    try:
        doneset = set([str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()
    printStatus(INFO, "done set: %d" % len(doneset))

    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [testImageSet[i] for i in range(len(testImageSet)) if (i % numjobs + 1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs, job, len(testImageSet), resultfile))

    testreader = TagReader(testCollection, rootpath=rootpath)
    test_feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
    block_size = 100

    tagranking = TagRanking(trainCollection, feature=feature, k=k, rootpath=rootpath)

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0
    nr_of_blocks = len(testImageSet) / block_size
    if nr_of_blocks * block_size < len(testImageSet):
        nr_of_blocks += 1

    for block_index in range(nr_of_blocks):
        start = block_index * block_size
        end = min(len(testImageSet), start + block_size)
        subset = testImageSet[start:end]
        if not subset:
            break

        renamed, features = test_feat_file.read(subset)
        printStatus(INFO, '%d - %d: %d images' % (start, end, len(subset)))

        output = []
        for i in range(len(renamed)):
            qry_id = renamed[i]
            qry_tags = testreader.get(qry_id)
            qry_vec = features[i]
            tagvotes = tagranking.estimate(qry_vec, qry_tags)  #, uniqueUser=uniqueUser, doRandomwalk=doRandomwalk)
            newline = "%s %s" % (qry_id, " ".join(["%s %g" % (x[0], x[1]) for x in tagvotes]))
            output.append(newline + "\n")
            done += 1
            #printStatus(INFO, '%d %s %s' % (done,qry_id,' '.join(['%s:%g' % (x[0],x[1]) for x in tagvotes[:3]])))

        fw.write("".join(output))
        fw.flush()

    fw.close()
    printStatus(INFO, 'done')

rootpath = ROOT_PATH

conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', trainAnnotationName)
from basic.annotationtable import writeConceptsTo
writeConceptsTo(test_tags, trainCollection, trainAnnotationName)

cmd = '%s/util/imagesearch/obtain_labeled_examples.py %s %s' % (parent_dir, trainCollection, conceptfile)
os.system('python ' + cmd)

train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
from util.simpleknn.bigfile import BigFile
train_feat_file = BigFile(train_feat_dir)
feat_dim = train_feat_file.ndims

from basic.util import readImageSet
test_imset = readImageSet(testCollection)
test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
test_feat_file = BigFile(test_feat_dir)
#test_renamed, test_vectors = test_feat_file.read(test_imset)

from model_based.dataengine.positiveengine import PositiveEngine
from model_based.dataengine.negativeengine import NegativeEngine

pe = PositiveEngine(trainCollection)
ne = NegativeEngine(trainCollection)

for tag in test_tags:
    pos_set = pe.sample(tag, 100)
    neg_set = ne.sample(tag, 100)
    names = pos_set + neg_set

def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite
    laplaciankratio = options.laplaciankratio

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'RobustPCA',
                              '%s,%s,%f' % (feature, nnName, k_ratio), 'tagmatrix.h5')
    if checkToSkip(resultfile, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, 'Tagmatrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
        sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection,
                                   '%s,%s,%f' % (feature, nnName, laplaciankratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, 'LaplacianI file not found in %s Did you run laplacian_images.py?' % (laplacianI_file))
        sys.exit(1)

    tagmatrix_data = h5py.File(tagmatrix_file, 'r')
    tagmatrix = tagmatrix_data['tagmatrix'][:]
    printStatus(INFO, 'tagmatrix.shape = %s' % (str(tagmatrix.shape)))

    laplacian_data = scipy.io.loadmat(laplacianI_file)
    sigma = laplacian_data['sigma']
    printStatus(INFO, 'Sigma^2 = %f' % (sigma))

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()
    assert(np.all(workingSet == list(tagmatrix_data['id_images'][:])))

    tot_images = len(workingSet)
    printStatus(INFO, '%d images in %s' % (tot_images, workingCollection))
    printStatus(INFO, 'Mean images per tag = %f' % (np.mean(tagmatrix.sum(axis=0))))

    K_neighs = int(math.floor(np.mean(tagmatrix.sum(axis=0)) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Starting the propagation pre-processing')
    tagmatrix_new = np.zeros(tagmatrix.shape)
    for i in xrange(tot_images):
        neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]),
                                   rootpath, K_neighs * 2, feature, distance)

        # remove images with features but not in the working set
        NNrow = []
        NNDrow = []
        new_neighs = []
        for x in neighbors:
            try:
                NNrow.append(bisect_index(workingSet, x[0]))
                NNDrow.append(x[1])
                new_neighs.append(x)
            except ValueError:
                pass
        NNrow = np.array(NNrow)
        NNDrow = np.array(NNDrow)
        neighbors = new_neighs[0:K_neighs]

        C = np.sum(np.exp(-(NNDrow) / sigma))
        tagmatrix_new[i, :] = np.sum((np.exp(-(NNDrow) / sigma).T * tagmatrix[NNrow]) / C, axis=0)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))

    # save output
    printStatus(INFO, 'Saving propagated tagmatrix to %s' % resultfile)
    makedirsforfile(resultfile)
    fout = h5py.File(resultfile, 'w')
    fout['tagmatrix'] = tagmatrix_new
    fout['vocab'] = tagmatrix_data['vocab'][:]
    fout['id_images'] = workingSet
    fout.close()

render = web.template.render('templates/')

pwd = os.path.dirname(os.path.realpath(__file__))
config = json.load(open(os.path.join(pwd, 'config.json')))

max_hits = config['max_hits']
rootpath = config['rootpath']
collection = config['collection']
rankMethod = config['rankMethod']
annotationName = config['annotationName']
metric = config['metric']
scorer = getScorer(metric)

simdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection, rankMethod)
imset = readImageSet(collection, collection, rootpath)


class index:
    def GET(self):
        input = web.input(query=None)
        resp = {'status': 0, 'hits': 0, 'random': [], 'tagrel': [], 'metric': metric, 'perf': 0}
        if input.query:

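# The web.py front-end above reads its settings from config.json. An
# illustrative example with exactly the keys the code accesses (all values
# below are assumptions, not taken from the repository):
#
# {
#     "max_hits": 1000,
#     "rootpath": "/home/user/VisualSearch",
#     "collection": "train10k",
#     "rankMethod": "tagvoting",
#     "annotationName": "concepts.txt",
#     "metric": "AP"
# }
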
# shape file
with open(new_shape_file, 'w') as fout:
    imA, featA = open(coll1_shape_file).read().strip().split(" ")
    imB, featB = open(coll2_shape_file).read().strip().split(" ")
    assert featA == featB
    fout.write('%d %d' % (int(imA) + int(imB), int(featA)))

# copy and concatenate features
file(new_features_file, 'wb').write(
    file(coll1_features_file, 'rb').read() + file(coll2_features_file, 'rb').read())

# copy Annotations
shutil.copytree("%s/%s/Annotations" % (datapath, coll1),
                "%s/%s+%s/Annotations" % (datapath, coll1, coll2))

# read ids
testset_id_images = readImageSet(coll2, coll2, datapath)
testset_id_images = set(map(int, testset_id_images))
train_id_images = readImageSet(coll1, coll1, datapath)
train_id_images = set(map(int, train_id_images))

base_new_id = max(testset_id_images.union(train_id_images)) + 1
duplicates = testset_id_images.intersection(train_id_images)
duplicates = dict([(x, x + base_new_id) for x in duplicates])
print "Found %d duplicates." % len(duplicates)

# read id.txt
coll1_featid_file = "%s/%s/FeatureData/%s/id.txt" % (datapath, coll1, feature)
coll2_featid_file = "%s/%s/FeatureData/%s/id.txt" % (datapath, coll2, feature)
new_featid_file = "%s/%s+%s/FeatureData/%s/id.txt" % (datapath, coll1, coll2, feature)

def test_tagging(self):
    corpus = 'flickr4m'
    word2vec_model = 'tagvec500'
    testCollection = 'imagenet2hop-random2k'
    imset = readImageSet(testCollection, testCollection, rootpath)
    feature = 'dascaffeprob'
    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
    blocksize = 1000
    scorers = [HitScorer(n) for n in [1, 2, 5, 10]]
    overwrite = 1

    for embedding_model in str.split('conse conse2 hierse hierse2'):
        embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)

        for synset_name in str.split('imagenet1k imagenet1k2hop'):
            if 'imagenet1k' == synset_name:
                label_file = 'data/ilsvrc12/synsets.txt'
            else:
                label_file = 'data/ilsvrc12/synsets2hop.txt'

            params = '%s %s --embedding %s --word2vec %s --corpus %s --overwrite %d' % (
                label_file, synset_name, embedding_model, word2vec_model, corpus, overwrite)
            os.system('python build_synset_vec.py %s' % params)
            shape_file = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name, 'shape.txt')
            self.assertTrue(os.path.exists(shape_file), msg="%s is not ready" % synset_name)

        synset_name = 'imagenet1k'
        label_file = 'data/ilsvrc12/synsets.txt'
        label2vec_dir = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)
        i2v = Image2Vec(label_file, label2vec_dir)

        tagger = ZeroshotTagger(embedding_name=embedding_name)
        printStatus(INFO, 'tagging %d images' % len(imset))

        start = 0
        overall_perf = [0.0] * len(scorers)
        nr_of_images = 0

        while start < len(imset):
            end = min(len(imset), start + blocksize)
            renamed, vectors = feat_file.read(imset[start:end])

            for _id, _vec in zip(renamed, vectors):
                truth = set([_id.split('_')[0]])
                im_vec = i2v.embedding(_vec)
                pred = tagger.predict(im_vec)
                sorted_labels = [int(x[0] in truth) for x in pred]
                perf = [scorer.score(sorted_labels) for scorer in scorers]
                overall_perf = [overall_perf[i] + perf[i] for i in range(len(scorers))]
                nr_of_images += 1

            start = end

        res = [x / nr_of_images for x in overall_perf]
        print '_' * 100
        print embedding_name
        print ' '.join([x.name() for x in scorers])
        print ' '.join(['%.3f' % x for x in res])
        print '_' * 100

def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    outputName = '%s,%s' % (feature, modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection,
                              trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job]
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, nr_of_test_images, resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName,
                                       feature, modelName, '%s.model' % concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0
        #(pA,pB) = model.get_probAB()

    feat_file = BigFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0
    test_time = 0
    start = 0
    done = 0

    while start < nr_of_test_images:
        end = min(nr_of_test_images, start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, test_X = feat_file.read(test_imset[start:end])
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            if prob_output:
                scores = [models[c].predict_probability(test_X[i]) for c in range(nr_of_concepts)]
            else:
                scores = [models[c].predict(test_X[i]) for c in range(nr_of_concepts)]
                #dec_value = sigmoid_predict(dec_value, A=pA, B=pB)
            tagvotes = sorted(zip(concepts, scores), key=lambda v: v[1], reverse=True)
            output[i] = '%s %s\n' % (renamed[i], " ".join(["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
        test_time += time.time() - s_time
        start = end

        fw.write(''.join(output))
        fw.flush()
        done += len(output)

    # done
    printStatus(INFO, "%d done. read time %g seconds, test_time %g seconds" % (done, read_time, test_time))
    fw.close()
    return done

def process(options, workingCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    modelName = "robustpca"
    nnName = distance + "knn"

    printStatus(INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" %
                (workingCollection, annotationName, feature, nnName, k_ratio, lambda1, lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = outputpkl
    resultfile_robustpca = os.path.join(
        rootpath, workingCollection, 'RobustPCA-Prediction',
        '%s,%s,%f,%f,%f,%d' % (feature, nnName, lambda1, lambda2, k_ratio, rawtagmatrix),
        'prediction.mat')

    # if the .mat prediction already exists, skip the learning and only dump the pkl
    if checkToSkip(resultfile_robustpca, overwrite):
        only_dump = True
    else:
        only_dump = False

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'RobustPCA',
                                      '%s,%s,%f' % (feature, nnName, DEFAULT_K_PROP),
                                      'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, "Tag matrix file not found at %s. Did you run robustpca_preprocessing.py?" % tagmatrix_file)
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData',
                                      'lemm_wordnet_freq_tags.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, 'Tag matrix file not found at %s. Did you run wordnet_frequency_tags.py?' % tagmatrix_file)
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI',
                                   workingCollection,
                                   '%s,%s,%f' % (feature, nnName, k_ratio),
                                   'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, "LaplacianI file not found at %s. Did you run laplacian_images.py?" % laplacianI_file)
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT',
                                   '%f' % ratio_cs, 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(INFO, "LaplacianT file not found at %s. Did you run laplacian_tags.py?" % laplacianT_file)
        sys.exit(1)

    # begin learning
    script = """
        rpca_path = 'transduction_based/robustpca/';
        addpath(rpca_path);
        addpath([rpca_path, 'fast_svd/']);
        tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
        load('%s');
        load('%s');
        lambda1 = %f;
        lambda2 = %f;
        maxIters = 50;
        precision = 1e-4;
        mu_start = 1.;
        parpool('local', 4);
        [P,E] = robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
    """ % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1, lambda2)

    script += """
        delete(gcp);
        save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
        exit;
    """ % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testset_id_images = readImageSet(workingCollection.split('+')[1],
                                         workingCollection.split('+')[1], rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # map the tag vocabulary onto the concept list
    robustpca_output = h5py.File(resultfile_robustpca, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)
    predicted_tagmatrix = robustpca_output['P'][:, mapping]

    if outputonlytest:
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert final_tagmatrix.shape[0] == idx.shape[0]
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts': concepts,
                     'id_images': id_images,
                     'scores': final_tagmatrix},
                    f, pickle.HIGHEST_PROTOCOL)
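# A minimal sketch (not part of the pipeline) of reading back the pkl dumped
# above and ranking images for one concept; the path and the concept name
# ('dog') are hypothetical.
import cPickle as pickle
import numpy as np

data = pickle.load(open('prediction.pkl', 'rb'))
c = data['concepts'].index('dog')
ranked = np.argsort(-np.asarray(data['scores'])[:, c])
top10 = [data['id_images'][i] for i in ranked[:10]]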
import sys, os, random

from basic.common import ROOT_PATH
from basic.util import readImageSet
from simpleknn.bigfile import BigFile

if __name__ == '__main__':
    rootpath = ROOT_PATH
    collection = sys.argv[1]
    feature = sys.argv[2]

    imset = readImageSet(collection, collection)
    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)

    # sanity-check a random sample of 50 feature vectors
    imset = random.sample(imset, 50)
    #imset = imset[:5]
    renamed, vectors = feat_file.read(imset)

    for name, vec in zip(renamed, vectors):
        print name, sum(vec), sum(vec[:64]), sum(vec[64:])
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    blocksize = options.blocksize
    donefile = options.donefile
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite
    taggerType = options.tagger
    noise = options.noise
    testset = options.testset
    if not testset:
        testset = testCollection

    modelName = taggerType
    if 'pretagvote' == taggerType and noise > 1e-3:
        modelName += '-noise%.2f' % noise
    if 'pqtagvote' == taggerType:
        nnName = "l2knn"
    else:
        nnName = distance + "knn"

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testset,
                              trainCollection, annotationName, modelName,
                              '%s,%s,%d' % (feature, nnName, k), 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += ".%d.%d" % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    if donefile:
        doneset = set([x.split()[0] for x in open(donefile) if x.strip()])
    else:
        doneset = set()
    printStatus(INFO, "%d images have been done already, and they will be ignored" % len(doneset))

    # keep only the not-yet-done images assigned to this job
    workingSet = readImageSet(testCollection, testset, rootpath)
    workingSet = [x for x in workingSet if x not in doneset]
    workingSet = [workingSet[i] for i in range(len(workingSet)) if (i % numjobs + 1) == job]

    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
    test_feat_file = BigFile(test_feat_dir)

    tagger = NAME_TO_TAGGER[taggerType](trainCollection, annotationName, feature,
                                        distance, rootpath=rootpath)
    tagger.k = k
    tagger.noise = noise

    printStatus(INFO, "working on %d-%d, %d test images -> %s" %
                (numjobs, job, len(workingSet), resultfile))

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0.0
    test_time = 0.0
    start = 0
    done = 0

    while start < len(workingSet):
        end = min(len(workingSet), start + blocksize)
        printStatus(INFO, 'tagging images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, vectors = test_feat_file.read(workingSet[start:end])
        nr_images = len(renamed)
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * nr_images
        for i in range(nr_images):
            tagvotes = tagger.predict(content=vectors[i],
                                      context='%s,%s' % (testCollection, renamed[i]))
            output[i] = '%s %s\n' % (renamed[i], " ".join(
                ["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        done += len(output)

    fw.close()
    printStatus(INFO, '%d images tagged, read time %g seconds, test time %g seconds' %
                (done, read_time, test_time))
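# A toy sketch of the numjobs/job sharding used above: job j of n handles
# every index i with i % n + 1 == j, so the shards are disjoint and together
# cover the whole image set.
items = list('abcdefg')
numjobs, job = 3, 2
shard = [items[i] for i in range(len(items)) if i % numjobs + 1 == job]
assert shard == ['b', 'e']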
from basic.annotationtable import writeConceptsTo

writeConceptsTo(test_tags, trainCollection, trainAnnotationName)

cmd = '%s/util/imagesearch/obtain_labeled_examples.py %s %s' % (parent_dir, trainCollection, conceptfile)
os.system('python ' + cmd)

train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)

from util.simpleknn.bigfile import BigFile
train_feat_file = BigFile(train_feat_dir)
feat_dim = train_feat_file.ndims

from basic.util import readImageSet
test_imset = readImageSet(testCollection, testCollection)
test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
test_feat_file = BigFile(test_feat_dir)
#test_renamed, test_vectors = test_feat_file.read(test_imset)

from model_based.dataengine.positiveengine import PositiveEngine
from model_based.dataengine.negativeengine import NegativeEngine

pe = PositiveEngine(trainCollection)
ne = NegativeEngine(trainCollection)

# build a labeled training set of 100 positive and 100 negative examples per tag
for tag in test_tags:
    pos_set = pe.sample(tag, 100)
    neg_set = ne.sample(tag, 100)
    names = pos_set + neg_set
    labels = [1] * len(pos_set) + [-1] * len(neg_set)
if __name__ == '__main__':
    rootpath = ROOT_PATH
    embedding_model = 'hierse2'
    embedding_name = 'flickr4m,tagvec500,%s' % embedding_model

    tagger = ZeroshotTagger(embedding_name=embedding_name)

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', 'imagenet1k', embedding_name)

    from im2vec import Image2Vec
    i2v = Image2Vec(label_file, label2vec_dir)

    from basic.util import readImageSet
    testCollection = 'imagenet2hop'
    imset = readImageSet(testCollection, 'random100k', rootpath)

    feature = 'dascaffeprob'
    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))

    blocksize = 1000
    start = 0

    from eval import HitScorer
    scorers = [HitScorer(n) for n in [1, 2, 5, 10]]
    overall_perf = [0.0] * len(scorers)
    nr_of_images = 0

    while start < len(imset):
        end = min(len(imset), start + blocksize)
        renamed, vectors = feat_file.read(imset[start:end])
def process(options, testCollection, trainCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    overwrite = options.overwrite
    testset = testCollection
    forcetrainmodel = options.trainmodel
    modelName = "tagprop"
    nnName = distance + "knn"

    printStatus(INFO, "Starting TagProp %s,%s,%s,%s,%s" %
                (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = outputpkl
    resultfile_tagprop = os.path.join(rootpath, testCollection, 'TagProp-Prediction',
                                      testset, trainCollection, annotationName, modelName,
                                      '%s,%s,%s,%d' % (feature, nnName, variant, k),
                                      'prediction.mat')
    if checkToSkip(resultfile, overwrite) or checkToSkip(resultfile_tagprop, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData',
                                  'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, "Tag matrix file not found at %s. Did you run wordnet_frequency_tags.py?" % tagmatrix_file)
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data',
                                     trainCollection, '%s,%s,%d' % (feature, nnName, k),
                                     'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(INFO, "Matlab train neighbors file not found at %s. Did you run prepare_tagprop_data.py?" % train_neighs_file)
        sys.exit(1)

    # do we need to perform learning?
    train_model_file = os.path.join(rootpath, trainCollection, 'TagProp-models',
                                    '%s,%s,%s,%d' % (feature, nnName, variant, k),
                                    'model.mat')
    if os.path.exists(train_model_file) and not forcetrainmodel:
        printStatus(INFO, "model for %s available at %s" % (trainCollection, train_model_file))
    else:
        printStatus(INFO, "starting learning model for %s" % trainCollection)
        makedirsforfile(train_model_file)

        script = """
            tagprop_path = 'model_based/tagprop/TagProp/';
            addpath(tagprop_path);
            tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
            tagmatrix = sparse(tagmatrix);
            NN = h5read('%s', '/NN');
            NN = NN(2:end, :);
            NN = double(NN);
        """ % (tagmatrix_file, train_neighs_file)

        if variant == 'dist' or variant == 'distsigmoids':
            script += """
            NND = h5read('%s', '/NND');
            NND = NND(2:end, :);
            NND = reshape(NND, 1, size(NND,1), size(NND,2));
            NND = double(NND);
            """ % train_neighs_file

        if variant == 'rank':
            script += """
            m = tagprop_learn(NN,[],tagmatrix);
            """
        elif variant == 'ranksigmoids':
            script += """
            m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true);
            """
        elif variant == 'dist':
            script += """
            m = tagprop_learn(NN,NND,tagmatrix,'type','dist');
            """
        elif variant == 'distsigmoids':
            script += """
            m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true);
            """

        script += """
            save('%s', 'm', '-v7.3');
            exit;
        """ % train_model_file

        call_matlab(script)

    # we perform prediction
    printStatus(INFO, "starting prediction")

    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data',
                                    testset, trainCollection, annotationName,
                                    '%s,%s,%d' % (feature, nnName, k), 'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(INFO, "Matlab test neighbors file not found at %s. Did you run prepare_tagprop_data.py?" % test_neighs_file)
        sys.exit(1)

    script = """
        tagprop_path = 'model_based/tagprop/TagProp/';
        addpath(tagprop_path);
        load('%s');
        tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
        tagmatrix = sparse(tagmatrix);
        NNT = h5read('%s', '/NNT');
        NNT = double(NNT);
    """ % (train_model_file, tagmatrix_file, test_neighs_file)

    # the distance-based variants pass the neighbor distances to the predictor
    if variant == 'dist' or variant == 'distsigmoids':
        script += """
        NNDT = h5read('%s', '/NNDT');
        NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));
        NNDT = double(NNDT);
        P = tagprop_predict(NNT,NNDT,m)';
        """ % test_neighs_file
    else:
        script += """
        P = tagprop_predict(NNT,[],m)';
        """

    script += """
        save('%s', '-v7.3');
        exit;
    """ % resultfile_tagprop

    makedirsforfile(resultfile_tagprop)
    call_matlab(script)

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()

    # map the tag vocabulary onto the concept list
    tagprop_output = h5py.File(resultfile_tagprop, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)
    final_tagmatrix = tagprop_output['P'][:][:, mapping]

    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts': concepts,
                     'id_images': id_images,
                     'scores': final_tagmatrix},
                    f, pickle.HIGHEST_PROTOCOL)
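# Note on NN = NN(2:end, :) in the training script above: for a training
# image the nearest neighbor returned by the k-NN search is the image itself
# at distance ~0, so the first row is dropped before learning. A toy
# illustration with assumed data:
neighbors = [('img42', 0.0), ('img7', 0.3), ('img13', 0.5)]  # query is img42
neighbors = neighbors[1:]  # discard the self-match, keep the true neighbors
assert neighbors[0][0] == 'img7'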
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    outputName = '%s,%s' % (feature, modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection,
                              trainCollection, trainAnnotationName, outputName,
                              'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job]
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" %
                (numjobs, job, nr_of_test_images, resultfile))

    # load one model per concept
    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath, trainCollection, 'Models',
                                       trainAnnotationName, feature, modelName,
                                       '%s.model' % concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0
        #(pA,pB) = model.get_probAB()

    feat_file = BigFile(os.path.join(rootpath, testCollection, "FeatureData", feature))

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0
    test_time = 0
    start = 0
    done = 0

    while start < nr_of_test_images:
        end = min(nr_of_test_images, start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, test_X = feat_file.read(test_imset[start:end])
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            if prob_output:
                scores = [models[c].predict_probability(test_X[i]) for c in range(nr_of_concepts)]
            else:
                scores = [models[c].predict(test_X[i]) for c in range(nr_of_concepts)]
                #dec_value = sigmoid_predict(dec_value, A=pA, B=pB)
            tagvotes = sorted(zip(concepts, scores), key=lambda v: v[1], reverse=True)
            output[i] = '%s %s\n' % (renamed[i], " ".join(
                ["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        fw.flush()
        done += len(output)

    # done
    printStatus(INFO, "%d done. read time %g seconds, test_time %g seconds" %
                (done, read_time, test_time))
    fw.close()
    return done
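# The commented-out sigmoid_predict call above refers to Platt scaling: a raw
# SVM decision value s is mapped to a probability 1 / (1 + exp(A*s + B)),
# where (A, B) are fitted on held-out data (cf. the commented get_probAB()).
# A minimal sketch with hypothetical parameters:
import math

def sigmoid_predict(dec_value, A=-1.0, B=0.0):
    return 1.0 / (1.0 + math.exp(A * dec_value + B))

assert abs(sigmoid_predict(0.0) - 0.5) < 1e-9  # zero margin -> probability 0.5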
    for t in count_tags.copy():
        # drop tags that are not wordnet lemmas, are too short, or do not denote "things"
        if wn.morphy(t) is None or len(t) < 3 or not validateAnnotation(thing_synsets, wn.synsets(t)): #or not t in vocabulary_50:
            del count_tags[t]

    print "N tags post wordnet filter: ", len(count_tags)
    vocab = count_tags.keys()
    #print count_tags
    return vocab, count_tags

##############

workingSet = os.path.split(os.path.realpath(os.path.curdir))[1]
id_images = readImageSet(workingSet, workingSet, ROOT_PATH)
id_images.sort()
#id_images = map(int, id_images)

resultfile = os.path.join('TextData', "lemm_wordnet_freq_tags.h5")
if os.path.exists(resultfile):
    print "File %s already exists. Aborting..." % resultfile
    sys.exit(1)

tags_file = os.path.join('TextData', "id.userid.lemmtags.txt")

if len(sys.argv) > 1:
    # reuse the vocabulary of another collection when one is given
    print "Getting vocabulary from %s" % sys.argv[1]
    otherCollection = h5py.File(sys.argv[1], 'r')
    vocab = list(otherCollection['vocab'])
    otherCollection.close()
else:
import sys
import h5py
import numpy as np
import cPickle as pickle

from basic.common import ROOT_PATH
from basic.util import readImageSet, bisect_index, getVocabMap
from basic.annotationtable import readConcepts

tagmatrix_file = h5py.File(sys.argv[1], 'r')
pkl_file = open(sys.argv[2], 'wb')
workingCollection = sys.argv[3]
annotationName = sys.argv[4]
rootpath = ROOT_PATH

id_images = tagmatrix_file['id_images']
concepts = readConcepts(workingCollection, annotationName, rootpath)

# the test part of a merged "train+test" collection
testset_id_images = readImageSet(workingCollection.split('+')[1],
                                 workingCollection.split('+')[1], rootpath)
testset_id_images.sort()

if not type(id_images[0]) is str:
    id_images = map(str, id_images)
if not type(testset_id_images[0]) is str:
    testset_id_images = map(str, testset_id_images)

mapping = getVocabMap(list(tagmatrix_file['vocab'][:]), concepts)
predicted_tagmatrix = tagmatrix_file['tagmatrix'][:, mapping]

print "predicted_tagmatrix.shape = ", predicted_tagmatrix.shape
print "len(id_images) = ", len(id_images)
print "len(testset_id_images) = ", len(testset_id_images)
trainCollection = 'voc2008train'
trainAnnotationName = 'conceptsvoc2008train.txt'
testCollection = 'voc2008val'
testset = testCollection
testAnnotationName = 'conceptsvoc2008val.txt'

modelName = 'fik50'
#modelName = 'fastlinear'

if 'fastlinear' == modelName:
    from fastlinear.fastlinear import fastlinear_load_model as load_model
else:
    from fiksvm.fiksvm import fiksvm_load_model as load_model

scorer = getScorer(metric)

imset = readImageSet(testCollection, testset, rootpath=rootpath)
concepts = readConcepts(testCollection, testAnnotationName, rootpath=rootpath)

feat_dir = os.path.join(rootpath, testCollection, "FeatureData", feature)
feat_file = BigFile(feat_dir)
_renamed, _vectors = feat_file.read(imset)

nr_of_images = len(_renamed)
nr_of_concepts = len(concepts)

mAP = 0.0
models = [None] * len(concepts)

stream = StreamFile(feat_dir)

for i, concept in enumerate(concepts):
nr_of_images_list = []
feat_dim_list = []
feat_files = []

for feature in srcfeatures:
    shapefile = os.path.join(rootpath, collection, "FeatureData", feature, "shape.txt")
    nr_of_images, feat_dim = map(int, open(shapefile).readline().strip().split())
    nr_of_images_list.append(nr_of_images)
    feat_dim_list.append(feat_dim)
    feat_files.append(BigFile(os.path.join(rootpath, collection, "FeatureData", feature)))

# assert(nr_of_images_list[0] == nr_of_images_list[1])
new_feat_dim = sum(feat_dim_list)

imset = readImageSet(collection, collection, rootpath)
nr_of_images = len(imset)
blocksize = 1000

makedirsforfile(binary_file)
fw = open(binary_file, "wb")

new_imset = []
start = 0

while start < nr_of_images:
    end = min(nr_of_images, start + blocksize)
    printStatus(INFO, "processing images from %d to %d" % (start, end - 1))

    renamed_0, vecs_0 = feat_files[0].read(imset[start:end])
    renamed_1, vecs_1 = feat_files[1].read(imset[start:end])
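# A self-contained sketch (toy data; assumes the BigFile layout of
# consecutive float32 vectors) of the per-image concatenation this loop
# performs: stack the two feature vectors and append the result to the
# open binary file.
import numpy as np

v0 = np.array([0.1, 0.2], dtype=np.float32)  # vector from srcfeatures[0]
v1 = np.array([0.3], dtype=np.float32)       # vector from srcfeatures[1]
merged = np.hstack([v0, v1])                 # new_feat_dim = 2 + 1
assert merged.shape == (3,)
# merged.tofile(fw) would append it to binary_file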
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    overwrite = options.overwrite
    testset = testCollection
    onlytest = options.onlytest

    nnName = distance + "knn"
    resultfile_train = os.path.join(rootpath, trainCollection, 'TagProp-data',
                                    trainCollection, '%s,%s,%d' % (feature, nnName, k),
                                    'nn_train.h5')
    resultfile_test = os.path.join(rootpath, testCollection, 'TagProp-data',
                                   testset, trainCollection, annotationName,
                                   '%s,%s,%d' % (feature, nnName, k), 'nn_test.h5')
    if (not onlytest and checkToSkip(resultfile_train, overwrite)) or checkToSkip(resultfile_test, overwrite):
        return 0

    testSet = readImageSet(testCollection, testset, rootpath)
    trainSet = readImageSet(trainCollection, trainCollection, rootpath)
    testSet.sort()
    trainSet.sort()

    #train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    #train_feat_file = BigFile(train_feat_dir)
    tagger = NAME_TO_TAGGER["preknn"](trainCollection, annotationName, feature,
                                      distance, rootpath=rootpath, k=1001)
    printStatus(INFO, '%d test images, %d train images' % (len(testSet), len(trainSet)))

    # allocate train -> train nearest neighbors
    if not onlytest:
        printStatus(INFO, 'Allocating NN, NND matrices')
        # k+1 columns: the nearest neighbor of a train image is the image itself
        NN = np.zeros((len(trainSet), k + 1), dtype=np.int32)
        NND = np.zeros((len(trainSet), k + 1))

        printStatus(INFO, 'Filling NN, NND matrices')
        for i, id_img in enumerate(trainSet):
            neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (trainCollection, id_img))
            if len(neighbors) < k + 1:
                printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k + 1))
                sys.exit(1)
            NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
            NNDrow = np.array([x[1] for x in neighbors])
            NN[i, :] = NNrow[0:k + 1]
            NND[i, :] = NNDrow[0:k + 1]
            if i % 1000 == 0:
                printStatus(INFO, '%d / %d images' % (i, len(trainSet)))

        printStatus(INFO, 'Saving train matrices to file %s' % resultfile_train)
        makedirsforfile(resultfile_train)
        fout = h5py.File(resultfile_train, 'w')
        fout['NN'] = NN
        fout['NND'] = NND
        fout['trainSet'] = trainSet
        fout['concepts'] = tagger.concepts
        fout.close()
        del NN
        del NND

    # allocate test -> train nearest neighbors
    printStatus(INFO, 'Allocating NNT, NNDT matrices')
    NNT = np.zeros((len(testSet), k), dtype=np.int32)
    NNDT = np.zeros((len(testSet), k))

    printStatus(INFO, 'Filling NNT, NNDT matrices')
    for i, id_img in enumerate(testSet):
        neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (testCollection, id_img))
        if len(neighbors) < k:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k))
            sys.exit(1)
        NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
        NNDrow = np.array([x[1] for x in neighbors])
        NNT[i, :] = NNrow[0:k]
        NNDT[i, :] = NNDrow[0:k]
        if i % 1000 == 0:
            printStatus(INFO, '%d / %d images' % (i, len(testSet)))

    printStatus(INFO, 'Saving test matrices to file %s' % resultfile_test)
    makedirsforfile(resultfile_test)
    fout = h5py.File(resultfile_test, 'w')
    fout['NNT'] = NNT
    fout['NNDT'] = NNDT
    fout['trainSet'] = trainSet
    fout['testSet'] = testSet
    fout['concepts'] = tagger.concepts
    fout.close()
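# A minimal sketch of the behavior assumed from basic.util.bisect_index as
# used above: an O(log n) exact lookup in a sorted id list that raises
# ValueError when the id is absent (which the propagation code catches).
from bisect import bisect_left

def bisect_index_sketch(sorted_list, item):
    pos = bisect_left(sorted_list, item)
    if pos == len(sorted_list) or sorted_list[pos] != item:
        raise ValueError('%s not found' % item)
    return pos

ids = sorted(['10', '25', '3'])  # note: ids sort as strings
assert bisect_index_sketch(ids, '25') == 1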
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    assert modelName.startswith('fastlinear')

    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk

    outputName = '%s,%s' % (feature, modelName)
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection,
                              trainCollection, trainAnnotationName, outputName,
                              'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job]
    test_imset = set(test_imset)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" %
                (numjobs, job, nr_of_test_images, resultfile))

    ma = ModelArray(trainCollection, trainAnnotationName, feature, modelName, rootpath=rootpath)

    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0
    feat_file.open()
    # stream over all features, keeping only the images assigned to this job
    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue
        res = ma.predict([_vec], prob=0)
        tagvotes = res[0]
        if topk > 0:
            tagvotes = tagvotes[:topk]
        newline = '%s %s\n' % (_id, " ".join(
            ["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
        fw.write(newline)
        done += 1
        if done % 10000 == 0:
            printStatus(INFO, "%d done" % done)

    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % done)
    return done
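# A possible post-processing step (assumed, not part of this script): when a
# run is split with numjobs > 1, the per-job shards named with the
# ".numjobs.job" suffix used above can simply be concatenated back into a
# single id.tagvotes.txt, since each image id occurs in exactly one shard.
numjobs = 4
with open('id.tagvotes.txt', 'w') as fw:
    for job in range(1, numjobs + 1):
        fw.write(open('id.tagvotes.txt.%d.%d' % (numjobs, job)).read())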