def process(options, source_dir, feat_dim, imsetfile, result_dir):
    """Dump the features of the requested images into a flat binary file.

    Writes two files under ``result_dir``:
    ``feature.bin`` (float32 vectors, in the order returned by
    ``BigFile.read``) and ``id.txt`` (the matching ids, space separated).

    Args:
        options: object providing ``overwrite`` and ``blocksize``.
        source_dir: directory handed to ``BigFile``.
        feat_dim: feature dimensionality handed to ``BigFile``.
        imsetfile: text file with one image id per line.
        result_dir: directory receiving ``feature.bin`` and ``id.txt``.

    Exits the interpreter with status 0 when ``checkToSkip`` reports that the
    result should not be regenerated.
    """
    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    # Context managers close every handle even if reading/writing fails.
    with open(imsetfile) as fr:
        imset = [line.strip() for line in fr]
    print('requested %d' % len(imset))

    featurefile = BigFile(source_dir, feat_dim)

    makedirsforfile(resultfile)
    done = []
    with open(resultfile, 'wb') as fw:
        start = 0
        while start < len(imset):
            end = min(len(imset), start + options.blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
            renamed, vectors = featurefile.read(imset[start:end])
            for vec in vectors:
                np.array(vec, dtype=np.float32).tofile(fw)
            # ids are recorded in the same order as the vectors just written
            done += renamed
            start = end

    # every id must have been written exactly once
    assert(len(done) == len(set(done)))

    resultfile = os.path.join(result_dir, 'id.txt')
    with open(resultfile, 'w') as fw:
        fw.write(' '.join(done))

    print('%d requested, %d obtained' % (len(imset), len(done)))
class Synset2Vec:
    """Map a WordNet synset id (wnid) to the word2vec vector of one of its words."""

    def __init__(self, corpus=DEFAULT_W2V_CORPUS, w2v_name=DEFAULT_W2V,
                 wnid2words_file=DEFAULT_WNID2WORDS_FILE, rootpath=ROOT_PATH):
        """Open the word2vec store and load the wnid -> words table.

        Args:
            corpus: corpus name; used as a path component and in the log line.
            w2v_name: name of the word2vec model directory.
            wnid2words_file: pickle file holding the wnid -> words mapping.
            rootpath: root directory under which the word2vec data is stored.
        """
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', w2v_name)
        self.word2vec = BigFile(word2vec_dir)
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the handle deterministically.
        with open(wnid2words_file, 'rb') as fh:
            self.wnid2words = pickle.load(fh)
        logger.info('w2v(%s): %d words, %d dims', corpus,
                    self.word2vec.shape()[0], self.get_feat_dim())

    def get_feat_dim(self):
        """Return the dimensionality reported by the word2vec store."""
        return self.word2vec.ndims

    def explain(self, wnid):
        """Return the raw words entry stored for ``wnid`` (KeyError if unknown)."""
        return self.wnid2words[wnid]

    def _mapping(self, query_wnid):
        """Return the vector of the first synset word that has one, else None.

        The comma-separated words of the synset are lower-cased, stripped, and
        spaces and hyphens are replaced by underscores before the lookup.
        """
        words = self.wnid2words[query_wnid].lower()
        candidates = [w.strip().replace(' ', '_').replace('-', '_')
                      for w in words.split(',')]
        for w in candidates:
            vec = self.word2vec.read_one(w)
            # NOTE(review): relies on read_one returning a falsy value for an
            # unknown word - confirm against BigFile.read_one.
            if vec:
                return vec
        return None

    def embedding(self, wnid):
        """Public entry point; same result as ``_mapping``."""
        return self._mapping(wnid)
def __init__(self, Y0=DEFAULT_Y0, label_vec_name=DEFAULT_LABEL_VEC_NAME, rootpath=ROOT_PATH):
    """Load the label list of ``Y0`` and the embedding vector of every label.

    Args:
        Y0: label-set name; selects ``data/synsets_<Y0>.txt`` next to this
            module and the ``synset2vec/<Y0>`` directory under ``rootpath``.
        label_vec_name: name of the label-vector directory.
        rootpath: root directory of the data layout.

    Sets ``labels``, ``nr_of_labels``, ``feat_dim`` and ``label_vectors``
    (``None`` for a label the vector store did not return).
    """
    label_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              'data/synsets_%s.txt' % Y0)
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, label_vec_name)

    # Close the label file explicitly and build a real list so len()/indexing
    # work regardless of whether map() is lazy.
    with open(label_file) as fh:
        self.labels = [line.strip() for line in fh]
    self.nr_of_labels = len(self.labels)

    feat_file = BigFile(label2vec_dir)
    renamed, vectors = feat_file.read(self.labels)
    name2index = dict(zip(renamed, range(len(renamed))))
    self.feat_dim = feat_file.ndims

    # Re-align the returned vectors with the order of self.labels.
    self.label_vectors = [
        np.array(vectors[name2index[label]]) if label in name2index else None
        for label in self.labels
    ]

    nr_of_inactive_labels = sum(1 for x in self.label_vectors if x is None)
    logger.info('#active_labels=%d, embedding_size=%d',
                self.nr_of_labels - nr_of_inactive_labels, self.feat_dim)
def process(options, testCollection):
    """Tag every image of ``testCollection`` and write ``id.tagvotes.txt``.

    Each output line is ``<image id> <tag> <score> <tag> <score> ...`` with the
    ``options.r`` predictions returned by ``ZeroshotTagger.predict``.

    Args:
        options: object providing ``overwrite``, ``rootpath``, ``Y0``, ``Y1``,
            ``pY0``, ``r``, ``w2v_corpus``, ``w2v`` and ``embedding``.
        testCollection: name of the image collection to tag.

    Returns:
        0 when the result file already exists and ``overwrite`` is off,
        otherwise None.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    batch_size = 2000

    label_vec_name = '%s,%s,%s' % (options.w2v_corpus, options.w2v,
                                   options.embedding)
    for synset_name in [Y0, Y1]:
        assert (os.path.exists(
            os.path.join(rootpath, 'synset2vec', synset_name, label_vec_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging',
                           testCollection, pY0, label_vec_name,
                           'id.tagvotes.txt')
    if os.path.exists(resfile) and not overwrite:
        logger.info('%s exists. quit', resfile)
        return 0

    # Pass rootpath so the label vectors are loaded from the same root whose
    # synset2vec directories were checked above.
    i2v = Image2Vec(Y0=Y0, label_vec_name=label_vec_name, rootpath=rootpath)
    tagger = ZeroshotTagger(Y1=Y1, label_vec_name=label_vec_name,
                            rootpath=rootpath)

    imset = utility.readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)

    logger.info('tagging %d images', len(imset))
    utility.makedirsforfile(resfile)
    logger.info('save results to %s', resfile)

    # The context manager closes the result file even if a batch fails.
    with open(resfile, 'w') as fw:
        start = 0
        while start < len(imset):
            end = min(len(imset), start + batch_size)
            logger.info('processing images from %d to %d', start, end)

            renamed, vectors = feat_file.read(imset[start:end])
            output = []
            for _id, _vec in zip(renamed, vectors):
                im_vec = i2v.embedding(_vec)
                pred = tagger.predict(im_vec, topk=r)
                output.append('%s %s\n' % (
                    _id, ' '.join(['%s %s' % (x[0], x[1]) for x in pred])))
            start = end
            fw.write(''.join(output))
def get_we_parameter(vocabulary, word2vec_file):
    """Build the initial word-embedding matrix for ``vocabulary``.

    Row 0 is all zeros (reserved for masking via pad_sequences); row ``i + 1``
    belongs to the i-th vocabulary word. A word whose pre-trained vector cannot
    be read, or whose vector does not have ``ndims`` entries, gets a vector
    drawn uniformly from [-1, 1).

    Args:
        vocabulary: iterable of words (surrounding whitespace is stripped).
        word2vec_file: location handed to ``BigFile``.

    Returns:
        numpy array with ``len(vocabulary) + 1`` rows.
    """
    print('getting inital word embedding ...')
    w2v_reader = BigFile(word2vec_file)
    ndims = w2v_reader.ndims

    # Reserve 0 for masking via pad_sequences
    we = [np.array([0] * ndims)]

    fail_counter = 0
    for word in vocabulary:
        word = word.strip()
        try:
            vec = np.array(w2v_reader.read_one(word))
        except Exception:
            # The failure mode of read_one for an unknown word is not visible
            # here, so any read error is treated as "no pre-trained vector".
            vec = None
        # Validate against the store's own dimensionality rather than a
        # hard-coded 500, and without assert (which is stripped under -O).
        if vec is None or vec.shape != (ndims,):
            vec = np.random.uniform(-1, 1, ndims)
            fail_counter += 1
        we.append(vec)

    print("%d words out of %d words cannot find pre-trained word2vec vector" % (fail_counter, len(vocabulary)))
    return np.array(we)
class Synset2Vec:
    """Map a WordNet synset id (wnid) to the word2vec vector of one of its words."""

    def __init__(self, corpus=DEFAULT_W2V_CORPUS, w2v_name=DEFAULT_W2V,
                 wnid2words_file=DEFAULT_WNID2WORDS_FILE, rootpath=ROOT_PATH):
        """Open the word2vec store and load the wnid -> words table.

        Args:
            corpus: corpus name; used as a path component and in the log line.
            w2v_name: name of the word2vec model directory.
            wnid2words_file: pickle file holding the wnid -> words mapping.
            rootpath: root directory under which the word2vec data is stored.
        """
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', w2v_name)
        self.word2vec = BigFile(word2vec_dir)
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the handle deterministically.
        with open(wnid2words_file, 'rb') as fh:
            self.wnid2words = pickle.load(fh)
        logger.info('w2v(%s): %d words, %d dims', corpus,
                    self.word2vec.shape()[0], self.get_feat_dim())

    def get_feat_dim(self):
        """Return the dimensionality reported by the word2vec store."""
        return self.word2vec.ndims

    def explain(self, wnid):
        """Return the raw words entry stored for ``wnid`` (KeyError if unknown)."""
        return self.wnid2words[wnid]

    def _mapping(self, query_wnid):
        """Return the vector of the first synset word that has one, else None.

        The comma-separated words of the synset are lower-cased, stripped, and
        spaces and hyphens are replaced by underscores before the lookup.
        """
        words = self.wnid2words[query_wnid].lower()
        candidates = [w.strip().replace(' ', '_').replace('-', '_')
                      for w in words.split(',')]
        for w in candidates:
            vec = self.word2vec.read_one(w)
            # NOTE(review): relies on read_one returning a falsy value for an
            # unknown word - confirm against BigFile.read_one.
            if vec:
                return vec
        return None

    def embedding(self, wnid):
        """Public entry point; same result as ``_mapping``."""
        return self._mapping(wnid)
def __init__(self, datafile, ndims=0, L1_normalize=0, L2_normalize=0):
    """Initialise the base encoder and open the pre-trained word2vec store.

    When ``ndims`` is 0 the dimensionality is taken from the store; otherwise
    the store must agree with ``self.ndims`` (AssertionError if it does not).
    """
    Text2Vec.__init__(self, datafile, ndims, L1_normalize, L2_normalize)
    self.word2vec = BigFile(datafile)

    if ndims == 0:
        # No dimensionality requested: adopt the one of the store.
        self.ndims = self.word2vec.ndims
        return

    stored_dims = self.word2vec.ndims
    assert stored_dims == self.ndims, "feat dimension is not match %d != %d" % (
        stored_dims, self.ndims)
def __init__(self, corpus, modelName, wnid2words_file='data/wnid2words.pkl', rootpath=ROOT_PATH):
    """Load the wnid -> words table and open the word2vec store.

    Args:
        corpus: corpus name, used as a path component.
        modelName: name of the word2vec model directory.
        wnid2words_file: pickle file holding the wnid -> words mapping.
        rootpath: root directory under which the word2vec data is stored.
    """
    printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
    word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', modelName)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle deterministically.
    with open(wnid2words_file, 'rb') as fh:
        self.wnid2words = pickle.load(fh)
    self.word2vec = BigFile(word2vec_dir)
def process(options, image_collection, pY0):
    """Embed the ``pY0`` feature of every image and write ``id.feature.txt``.

    Each output line is ``<image id> <v1> <v2> ...`` where the values come from
    ``Image2Vec.embedding(feature, k)``.

    Args:
        options: object providing ``rootpath``, ``overwrite``, ``k``,
            ``batch_size``, ``subset``, ``Y0`` and ``label_vec_name``.
        image_collection: name of the image collection.
        pY0: name of the input feature directory.

    Returns:
        Number of images written, or 0 when the result already exists and
        ``overwrite`` is off.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    k = options.k
    batch_size = options.batch_size
    subset = options.subset if options.subset else image_collection
    Y0 = options.Y0
    label_vec_name = options.label_vec_name
    new_feature = '%s,%s,%s' % (Y0, label_vec_name, pY0)

    resfile = os.path.join(rootpath, image_collection, 'FeatureData',
                           new_feature, 'id.feature.txt')
    if os.path.exists(resfile) and not overwrite:
        logger.info('%s exists. quit', resfile)
        return 0

    imsetfile = os.path.join(rootpath, image_collection, 'ImageSets',
                             '%s.txt' % subset)
    with open(imsetfile) as fr:
        imset = [line.strip() for line in fr]
    logger.info('%d images to do', len(imset))

    feat_file = BigFile(
        os.path.join(rootpath, image_collection, 'FeatureData', pY0))
    im2vec = Image2Vec(Y0, label_vec_name, rootpath)

    utility.makedirsforfile(resfile)

    read_time = 0
    run_time = 0
    start = 0
    done = 0

    # The context manager closes the result file even if a batch fails.
    with open(resfile, 'w') as fw:
        while start < len(imset):
            end = min(len(imset), start + batch_size)
            logger.info('processing images from %d to %d', start, end - 1)

            s_time = time.time()
            renamed, test_X = feat_file.read(imset[start:end])
            read_time += time.time() - s_time

            s_time = time.time()
            output = []
            for name, feat in zip(renamed, test_X):
                vec = im2vec.embedding(feat, k)
                output.append('%s %s\n' % (name, " ".join(map(str, vec))))
            run_time += time.time() - s_time

            start = end
            fw.write(''.join(output))
            done += len(output)

    logger.info("%d done. read time %g seconds, run_time %g seconds", done,
                read_time, run_time)
    return done
def __init__(self, corpus=DEFAULT_W2V_CORPUS, w2v_name=DEFAULT_W2V,
             wnid2words_file=DEFAULT_WNID2WORDS_FILE, rootpath=ROOT_PATH):
    """Open the word2vec store and load the wnid -> words table.

    Args:
        corpus: corpus name; used as a path component and in the log line.
        w2v_name: name of the word2vec model directory.
        wnid2words_file: pickle file holding the wnid -> words mapping.
        rootpath: root directory under which the word2vec data is stored.
    """
    word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', w2v_name)
    self.word2vec = BigFile(word2vec_dir)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle deterministically.
    with open(wnid2words_file, 'rb') as fh:
        self.wnid2words = pickle.load(fh)
    logger.info('w2v(%s): %d words, %d dims', corpus,
                self.word2vec.shape()[0], self.get_feat_dim())
def __init__(self, label_source, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
    """Wire up the image encoder, the query encoder and the image feature store.

    The label id file and label vectors are looked up under
    ``data/<label_source>`` relative to the working directory.
    """
    source_dir = os.path.join('data', label_source)
    label_vec_path = os.path.join(source_dir, 'label_vec')
    label_id_file = os.path.join(source_dir, 'label.txt')

    self.im2vec = Image2Vec(label_id_file, label_vec_path)
    self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
    self.img_feats = BigFile(feat_path)
def __init__(self, db_file):
    """Load the sentence pool, its search index and the image feature store.

    Reads ``self.shape_file``, ``self.sent_file``, ``self.sent_feat_dir``,
    ``self.sent_id_file``, ``self.rootpath``, ``self.img_collection`` and
    ``self.vis_feat``; these must already be available on the instance (they
    are not set here).

    Args:
        db_file: stored unchanged as ``self.db_file``.
    """
    # The shape file's first line holds "<number of sentences> <feature dim>".
    with open(self.shape_file) as fh:
        self.nr_of_sents, self.feat_dim = map(int, fh.readline().split())
    with open(self.sent_file) as fh:
        self.sent_pool = [line.strip() for line in fh]

    self.sent_searcher = load_model(
        os.path.join(self.sent_feat_dir, 'feature.bin'),
        self.feat_dim, self.nr_of_sents, self.sent_id_file)
    self.sent_searcher.set_distance('cosine')

    feat_dir = os.path.join(self.rootpath, self.img_collection,
                            "FeatureData", self.vis_feat)
    self.vis_feat_file = BigFile(feat_dir)

    imageset_path = os.path.join(self.rootpath, self.img_collection,
                                 "ImageSets", "%s.txt" % self.img_collection)
    # Lines are kept as read (including their trailing newline).
    with open(imageset_path, 'r') as fh:
        self.imageSet = fh.readlines()

    self.db_file = db_file
def process(options, testCollection):
    """Tag every image of ``testCollection`` and write ``id.tagvotes.txt``.

    Each output line is ``<image id> <tag> <score> <tag> <score> ...`` with the
    ``options.r`` predictions returned by ``ZeroshotTagger.predict``.

    Args:
        options: object providing ``overwrite``, ``rootpath``, ``corpus``,
            ``word2vec``, ``embedding``, ``Y0``, ``Y1``, ``pY0`` and ``r``.
        testCollection: name of the image collection to tag.

    Returns:
        0 when ``checkToSkip`` says the result must not be regenerated,
        otherwise None.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging',
                           testCollection, embedding_name, pY0,
                           'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name,
                            rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)

    printStatus(INFO, 'tagging %d images' % len(imset))
    makedirsforfile(resfile)

    # The context manager closes the result file even if a batch fails.
    with open(resfile, 'w') as fw:
        start = 0
        while start < len(imset):
            end = min(len(imset), start + blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end))

            renamed, vectors = feat_file.read(imset[start:end])
            output = []
            for _id, _vec in zip(renamed, vectors):
                im_vec = i2v.embedding(_vec)
                pred = tagger.predict(im_vec, topk=r)
                output.append('%s %s\n' % (_id, ' '.join(['%s %s'%(x[0],x[1]) for x in pred])))
            start = end
            fw.write(''.join(output))
def process(options, testCollection):
    """Tag every image of ``testCollection`` and write ``id.tagvotes.txt``.

    Each output line is ``<image id> <tag> <score> <tag> <score> ...`` with the
    ``options.r`` predictions returned by ``ZeroshotTagger.predict``.

    Args:
        options: object providing ``overwrite``, ``rootpath``, ``Y0``, ``Y1``,
            ``pY0``, ``r``, ``w2v_corpus``, ``w2v`` and ``embedding``.
        testCollection: name of the image collection to tag.

    Returns:
        0 when the result file already exists and ``overwrite`` is off,
        otherwise None.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    batch_size = 2000

    label_vec_name = '%s,%s,%s' % (options.w2v_corpus, options.w2v, options.embedding)
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, label_vec_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging',
                           testCollection, pY0, label_vec_name,
                           'id.tagvotes.txt')
    if os.path.exists(resfile) and not overwrite:
        logger.info('%s exists. quit', resfile)
        return 0

    # Pass rootpath so the label vectors are loaded from the same root whose
    # synset2vec directories were checked above.
    i2v = Image2Vec(Y0=Y0, label_vec_name=label_vec_name, rootpath=rootpath)
    tagger = ZeroshotTagger(Y1=Y1, label_vec_name=label_vec_name, rootpath=rootpath)

    imset = utility.readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)

    logger.info('tagging %d images', len(imset))
    utility.makedirsforfile(resfile)
    logger.info('save results to %s', resfile)

    # The context manager closes the result file even if a batch fails.
    with open(resfile, 'w') as fw:
        start = 0
        while start < len(imset):
            end = min(len(imset), start + batch_size)
            logger.info('processing images from %d to %d', start, end)

            renamed, vectors = feat_file.read(imset[start:end])
            output = []
            for _id, _vec in zip(renamed, vectors):
                im_vec = i2v.embedding(_vec)
                pred = tagger.predict(im_vec, topk=r)
                output.append('%s %s\n' % (_id, ' '.join(['%s %s'%(x[0],x[1]) for x in pred])))
            start = end
            fw.write(''.join(output))
def __init__(self, model_path, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
    """Load a pickled model and compile its query-vs-images scoring function.

    Args:
        model_path: pickle file containing the trained model object.
        corpus: corpus name forwarded to ``Query2Vec``.
        word2vec_model: word2vec model name forwarded to ``Query2Vec``.
        feat_path: location of the image features, handed to ``BigFile``.
        rootpath: root directory forwarded to ``Query2Vec``.
    """
    self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
    self.img_feats = BigFile(feat_path)

    print(model_path)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle deterministically.
    with open(model_path, 'rb') as fh:
        devise_model = cPickle.load(fh)

    words_vec = T.matrix(dtype=theano.config.floatX)
    img_vec = T.matrix(dtype=theano.config.floatX)

    # compile a predictor function
    self.predict_model = theano.function(
        inputs=[words_vec, img_vec],
        outputs=devise_model.predict_score_one2many(words_vec, img_vec),
        allow_input_downcast=True)
def process(options, label_file, label2vec_dir, testCollection, feature, new_feature):
    """Embed ``feature`` of every image and write it as ``new_feature``.

    Each line of ``id.feature.txt`` is ``<image id> <v1> <v2> ...`` where the
    values come from ``Image2Vec.embedding(feature, k)`` formatted by
    ``niceNumber(x, 6)``.

    Args:
        options: object providing ``rootpath``, ``overwrite``, ``k``,
            ``blocksize`` and ``subset``.
        label_file: label list handed to ``Image2Vec``.
        label2vec_dir: label-vector directory handed to ``Image2Vec``.
        testCollection: name of the image collection.
        feature: name of the input feature directory.
        new_feature: name of the output feature directory.

    Returns:
        Number of images written, or 0 when ``checkToSkip`` says to skip.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    k = options.k
    blocksize = options.blocksize
    subset = options.subset if options.subset else testCollection

    resfile = os.path.join(rootpath, testCollection, 'FeatureData',
                           new_feature, 'id.feature.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    imsetfile = os.path.join(rootpath, testCollection, 'ImageSets', '%s.txt' % subset)
    with open(imsetfile) as fr:
        imset = [line.strip() for line in fr]
    printStatus(INFO, '%d images to do' % len(imset))

    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
    im2vec = Image2Vec(label_file, label2vec_dir)

    makedirsforfile(resfile)

    read_time = 0
    run_time = 0
    start = 0
    done = 0

    # The context manager closes the result file even if a batch fails.
    with open(resfile, 'w') as fw:
        while start < len(imset):
            end = min(len(imset), start + blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

            s_time = time.time()
            renamed, test_X = feat_file.read(imset[start:end])
            read_time += time.time() - s_time

            s_time = time.time()
            output = []
            for name, feat in zip(renamed, test_X):
                vec = im2vec.embedding(feat, k)
                output.append('%s %s\n' % (name, " ".join([niceNumber(x,6) for x in vec])))
            run_time += time.time() - s_time

            start = end
            fw.write(''.join(output))
            done += len(output)

    printStatus(INFO, "%d done. read time %g seconds, run_time %g seconds" % (done, read_time, run_time))
    return done
def __init__(self, label_file, label2vec_dir):
    """Load the label list and the embedding vector of every label.

    Args:
        label_file: text file with one label per line.
        label2vec_dir: directory handed to ``BigFile`` holding label vectors.

    Sets ``labels``, ``nr_of_labels``, ``feat_dim`` and ``label_vectors``
    (``None`` for a label the vector store did not return).
    """
    # Close the label file explicitly and build a real list so len()/indexing
    # work regardless of whether map() is lazy.
    with open(label_file) as fh:
        self.labels = [line.strip() for line in fh]
    self.nr_of_labels = len(self.labels)

    feat_file = BigFile(label2vec_dir)
    renamed, vectors = feat_file.read(self.labels)
    name2index = dict(zip(renamed, range(len(renamed))))
    self.feat_dim = feat_file.ndims

    # Re-align the returned vectors with the order of self.labels.
    self.label_vectors = [
        np.array(vectors[name2index[label]]) if label in name2index else None
        for label in self.labels
    ]

    nr_of_inactive_labels = sum(1 for x in self.label_vectors if x is None)
    printStatus(INFO, '#active_labels=%d, embedding_size=%d' % (self.nr_of_labels - nr_of_inactive_labels, self.feat_dim))
def __init__(self, model_path, bow_path, feat_path):
    """Load the bag-of-words transformers and model, then compile the scorer.

    Args:
        model_path: pickle file containing the trained model object.
        bow_path: pickle file containing the pair
            ``(count_vect, tf_transformer)``.
        feat_path: location of the image features, handed to ``BigFile``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context managers close both handles deterministically.
    with open(bow_path, 'rb') as fh:
        self.count_vect, self.tf_transformer = cPickle.load(fh)
    self.img_feats = BigFile(feat_path)

    with open(model_path, 'rb') as fh:
        devise_model = cPickle.load(fh)

    # The query side is a sparse matrix, the image side a dense one.
    words_vec = sparse.csr_matrix(dtype=theano.config.floatX)
    img_vec = T.matrix(dtype=theano.config.floatX)

    # compile a predictor function
    self.predict_model = theano.function(
        inputs=[words_vec, img_vec],
        outputs=devise_model.predict_score_one2many(words_vec, img_vec),
        allow_input_downcast=True)
class Synset2Vec:
    """Map a WordNet synset id (wnid) to the word2vec vector of one of its words."""

    def __init__(self, corpus, modelName, wnid2words_file='data/wnid2words.pkl', rootpath=ROOT_PATH):
        """Load the wnid -> words table and open the word2vec store.

        Args:
            corpus: corpus name, used as a path component.
            modelName: name of the word2vec model directory.
            wnid2words_file: pickle file holding the wnid -> words mapping.
            rootpath: root directory under which the word2vec data is stored.
        """
        printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', modelName)
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the handle deterministically.
        with open(wnid2words_file, 'rb') as fh:
            self.wnid2words = pickle.load(fh)
        self.word2vec = BigFile(word2vec_dir)

    def get_feat_dim(self):
        """Return the dimensionality reported by the word2vec store."""
        return self.word2vec.ndims

    def explain(self, wnid):
        """Return the raw words entry stored for ``wnid`` (KeyError if unknown)."""
        return self.wnid2words[wnid]

    def mapping(self, query_wnid):
        """Return the vector of the first synset word that has one, else None.

        The comma-separated words of the synset are lower-cased, stripped, and
        spaces and hyphens are replaced by underscores before the lookup.
        """
        words = self.wnid2words[query_wnid].lower()
        candidates = [w.strip().replace(' ', '_').replace('-', '_')
                      for w in words.split(',')]
        for w in candidates:
            renamed, vectors = self.word2vec.read([w])
            if vectors:
                return vectors[0]
        return None

    def embedding(self, wnid):
        """Public entry point; same result as ``mapping``."""
        return self.mapping(wnid)
class Query2Vec:
    """Embed a textual query as the mean of the word2vec vectors of its words."""

    def __init__(self, corpus, modelName, rootpath=ROOT_PATH):
        """Open the word2vec store at ``<rootpath>/word2vec/<corpus>/<modelName>``."""
        word2vec_dir = os.path.join(rootpath, "word2vec", corpus, modelName)
        self.word2vec = BigFile(word2vec_dir)

    def get_feat_dim(self):
        """Return the dimensionality reported by the word2vec store."""
        return self.word2vec.ndims

    def mapping(self, query):
        """Return the mean vector of the query words, or None if none is known.

        The query is lower-cased and split on ``/``; inside each part
        underscores count as spaces and the part is split on whitespace.
        Words for which the store returns no vector are ignored.
        """
        tokens = []
        for part in query.lower().split('/'):
            tokens += part.strip().replace('_', ' ').split()

        word_vecs = []
        for w in tokens:
            renamed, vectors = self.word2vec.read([w])
            if vectors:
                word_vecs.append(vectors[0])

        if word_vecs:
            return np.array(word_vecs).mean(axis=0)
        return None

    def embedding(self, wnid):
        """Public entry point; same result as ``mapping``."""
        return self.mapping(wnid)
class Query2Vec:
    """Embed a textual query as the mean of the word2vec vectors of its words."""

    def __init__(self, corpus, modelName, rootpath=ROOT_PATH):
        """Open the word2vec store at ``<rootpath>/word2vec/<corpus>/<modelName>``."""
        word2vec_dir = os.path.join(rootpath, "word2vec", corpus, modelName)
        self.word2vec = BigFile(word2vec_dir)

    def get_feat_dim(self):
        """Return the dimensionality reported by the word2vec store."""
        return self.word2vec.ndims

    def mapping(self, query):
        """Return the mean vector of the query words, or None if none is known.

        The query is lower-cased and split on ``/``; inside each part
        underscores count as spaces and the part is split on whitespace.
        Words for which the store returns no vector are ignored.
        """
        tokens = []
        for part in query.lower().split('/'):
            tokens += part.strip().replace('_', ' ').split()

        word_vecs = []
        for w in tokens:
            renamed, vectors = self.word2vec.read([w])
            if vectors:
                word_vecs.append(vectors[0])

        if word_vecs:
            return np.array(word_vecs).mean(axis=0)
        return None

    def embedding(self, wnid):
        """Public entry point; same result as ``mapping``."""
        return self.mapping(wnid)
def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    """Fit the (A, B) parameters of each concept model of this job and save them.

    For every concept assigned to ``options.job`` the stored model is loaded,
    applied to the annotated training examples, and the pair returned by
    ``sigmoid_train`` is written back into the model file. A model whose
    stored A is already non-zero is skipped unless ``options.overwrite``.
    """
    rootpath = options.rootpath
    modelName = options.model

    # Pick the load/save pair that matches the model type.
    if modelName == 'fastlinear':
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    all_concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    # Concepts are dealt round-robin over numjobs jobs; job ids are 1-based.
    concepts = [name for pos, name in enumerate(all_concepts)
                if pos % options.numjobs + 1 == options.job]

    feature_dir = os.path.join(rootpath, trainCollection, "FeatureData", feature)
    feat_file = BigFile(feature_dir)

    for concept in concepts:
        modelfile = os.path.join(rootpath, trainCollection, 'Models',
                                 modelAnnotationName, feature, modelName,
                                 '%s.model' % concept)
        model = load_model(modelfile)
        A0, B0 = model.get_probAB()
        if abs(A0) > 1e-8 and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (A0, B0))
            continue

        names, gt_labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept,
                                               skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, gt_labels))
        results = classify_large_data(model, names, feat_file, prob_output=False)

        # Re-align the ground truth with the order of the classifier output.
        labels = []
        dec_values = []
        for item in results:
            labels.append(name2label[item[0]])
            dec_values.append(item[1])

        nr_pos = sum(1 for x in labels if x == 1)
        nr_neg = sum(1 for x in labels if x == -1)
        printStatus(INFO, "%s +%d -%d" % (concept, nr_pos, nr_neg))

        A, B = sigmoid_train(dec_values, labels)
        model.set_probAB(A, B)
        save_model(modelfile, model)

        A1, B1 = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (A0, A1, B0, B1))
class SemanticEmbedding:
    """Score images against a text query in a shared embedding space."""

    def __init__(self, label_source, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
        """Wire up the image encoder, the query encoder and the feature store.

        The label id file and label vectors are looked up under
        ``data/<label_source>`` relative to the working directory.
        """
        source_dir = os.path.join('data', label_source)
        label_vec_path = os.path.join(source_dir, 'label_vec')
        label_id_file = os.path.join(source_dir, 'label.txt')

        self.im2vec = Image2Vec(label_id_file, label_vec_path)
        self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
        self.img_feats = BigFile(feat_path)

    def do_search(self, query, iid_list, k):
        """Return one similarity score per id in ``iid_list``.

        Returns an empty list when the query cannot be embedded. Raises
        ValueError when an id of ``iid_list`` is missing from the feature
        store's answer.
        """
        # convert query to vector
        qvec = self.qry2vec.embedding(query)
        if qvec is None:
            return []

        renamed, test_X = self.img_feats.read(iid_list)
        # The store may answer in a different order; look each id up by name.
        imgvecs = [self.im2vec.embedding(test_X[renamed.index(iid)], k)
                   for iid in iid_list]
        return calImageSimiByCos(qvec, imgvecs)
class Synset2Vec:
    """Map a WordNet synset id (wnid) to the word2vec vector of one of its words."""

    def __init__(self, corpus, modelName, wnid2words_file='data/wnid2words.pkl', rootpath=ROOT_PATH):
        """Load the wnid -> words table and open the word2vec store.

        Args:
            corpus: corpus name, used as a path component.
            modelName: name of the word2vec model directory.
            wnid2words_file: pickle file holding the wnid -> words mapping.
            rootpath: root directory under which the word2vec data is stored.
        """
        printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', modelName)
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the handle deterministically.
        with open(wnid2words_file, 'rb') as fh:
            self.wnid2words = pickle.load(fh)
        self.word2vec = BigFile(word2vec_dir)

    def get_feat_dim(self):
        """Return the dimensionality reported by the word2vec store."""
        return self.word2vec.ndims

    def explain(self, wnid):
        """Return the raw words entry stored for ``wnid`` (KeyError if unknown)."""
        return self.wnid2words[wnid]

    def mapping(self, query_wnid):
        """Return the vector of the first synset word that has one, else None.

        The comma-separated words of the synset are lower-cased, stripped, and
        spaces and hyphens are replaced by underscores before the lookup.
        """
        words = self.wnid2words[query_wnid].lower()
        candidates = [w.strip().replace(' ', '_').replace('-', '_')
                      for w in words.split(',')]
        for w in candidates:
            renamed, vectors = self.word2vec.read([w])
            if vectors:
                return vectors[0]
        return None

    def embedding(self, wnid):
        """Public entry point; same result as ``mapping``."""
        return self.mapping(wnid)
class ConSE:
    """Score images against a text query in a shared embedding space."""

    def __init__(self, label_source, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
        """Wire up the image encoder, the query encoder and the feature store.

        The label id file and label vectors are looked up under
        ``data/<label_source>`` relative to the working directory.
        """
        source_dir = os.path.join('data', label_source)
        label_vec_path = os.path.join(source_dir, 'label_vec')
        label_id_file = os.path.join(source_dir, 'label.txt')

        self.im2vec = Image2Vec(label_id_file, label_vec_path)
        self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
        self.img_feats = BigFile(feat_path)

    def do_search(self, query, iid_list, k):
        """Return one similarity score per id in ``iid_list``.

        Returns an empty list when the query cannot be embedded. Raises
        ValueError when an id of ``iid_list`` is missing from the feature
        store's answer.
        """
        # convert query to vector
        qvec = self.qry2vec.embedding(query)
        if qvec is None:
            return []

        renamed, test_X = self.img_feats.read(iid_list)
        # The store may answer in a different order; look each id up by name.
        imgvecs = [self.im2vec.embedding(test_X[renamed.index(iid)], k)
                   for iid in iid_list]
        return calImageSimiByCos(qvec, imgvecs)
def __init__(self, input_json_path, vocab, vf_dir, use_att=False, eng_gt_file=None, rootpath=rootpath):
    """Index the images and sentences of a caption dataset JSON file.

    Args:
        input_json_path: JSON file with an ``images`` list; each image entry is
            read for ``imgid``, ``filename``, ``sentids`` and ``sentences``
            (each sentence for ``sentid``, ``tokens`` and ``raw``).
        vocab: stored unchanged as ``self.vocab``.
        vf_dir: visual-feature location, stored as ``self.vf_dir``.
        use_att: when it does not equal True, ``vf_dir`` is additionally
            opened with ``BigFile`` as ``self.vf_reader``.
        eng_gt_file: optional JSON file of the same layout whose tokenised
            sentences are attached to images by file name.
        rootpath: accepted for interface compatibility; not used here.

    Raises:
        AssertionError: if ``eng_gt_file`` is given but does not exist.
    """
    print(input_json_path)
    with open(input_json_path) as f:
        data = json.load(f)

    self.eng_gt_file = eng_gt_file
    self.imgname2enggt = {}
    if self.eng_gt_file is not None:
        assert os.path.exists(self.eng_gt_file), "Eng gt file not exist: %s"%eng_gt_file
        print ('Loading eng gt file')
        # The context manager closes the ground-truth file after parsing.
        with open(self.eng_gt_file) as f:
            eng_data = json.load(f)
        for x in eng_data['images']:
            self.imgname2enggt[x['filename']] = [
                ' '.join(y['tokens']) for y in x['sentences']]

    self.images = data['images']
    self.vocab = vocab

    self.sentences = {}
    self.img2sents = {}
    self.img2enggt = {}
    self.img2filename = {}
    self.sentId2imgId = {}
    self.imgIds = []
    self.sentIds = []

    for img in self.images:
        img_id = img['imgid']
        # file name without everything after the first dot
        self.img2filename[img_id] = img['filename'].split('.')[0]
        self.imgIds.append(img_id)
        self.img2sents[img_id] = img['sentids']
        self.img2enggt[img_id] = self.imgname2enggt.get(img['filename'], [])
        for sent in img['sentences']:
            sent_id = sent['sentid']
            self.sentences[sent_id] = (sent['tokens'], sent['raw'])
            self.sentIds.append(sent_id)
            self.sentId2imgId[sent_id] = img_id

    self.use_att = use_att
    # NOTE(review): the nesting below was reconstructed from whitespace-free
    # source; the BigFile reader is assumed to belong to the non-attention
    # branch only - confirm against the original file.
    if self.use_att == True:
        self.vf_dir = vf_dir
    else:
        self.vf_dir = vf_dir
        self.vf_reader = BigFile(vf_dir)
def __init__(self, Y1=DEFAULT_Y1, label_vec_name=DEFAULT_LABEL_VEC_NAME, rootpath=ROOT_PATH):
    """Load every label of the ``Y1`` label set together with its vector.

    The labels are the names stored in
    ``<rootpath>/synset2vec/<Y1>/<label_vec_name>``. ``label_vectors[i]`` is
    the numpy vector of ``labels[i]``, or None if the store did not return it.
    """
    store = BigFile(os.path.join(rootpath, 'synset2vec', Y1, label_vec_name))
    self.labels = store.names
    self.nr_of_labels = len(self.labels)
    self.feat_dim = store.ndims

    renamed, vectors = store.read(self.labels)
    # position of every returned name inside `vectors`
    position = {}
    for pos, name in enumerate(renamed):
        position[name] = pos

    self.label_vectors = [
        np.array(vectors[position[label]]) if label in position else None
        for label in self.labels
    ]

    nr_of_active_labels = sum(1 for vec in self.label_vectors if vec is not None)
    logger.info('#active_labels=%d, embedding_size=%d',
                nr_of_active_labels, self.feat_dim)
def __init__(self, synset_name='imagenet1k2hop', embedding_name='flickr4m,tagvec500,hierse2', rootpath=ROOT_PATH):
    """Load every label of ``synset_name`` together with its vector.

    The labels are the names stored in
    ``<rootpath>/synset2vec/<synset_name>/<embedding_name>``.
    ``label_vectors[i]`` is the numpy vector of ``labels[i]``, or None if the
    store did not return it.
    """
    store = BigFile(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name))
    self.labels = store.names
    self.nr_of_labels = len(self.labels)
    self.feat_dim = store.ndims

    renamed, vectors = store.read(self.labels)
    # position of every returned name inside `vectors`
    position = {}
    for pos, name in enumerate(renamed):
        position[name] = pos

    self.label_vectors = [
        np.array(vectors[position[label]]) if label in position else None
        for label in self.labels
    ]

    nr_of_active_labels = sum(1 for vec in self.label_vectors if vec is not None)
    printStatus(INFO + '.' + self.__class__.__name__,
                '#active_labels=%d, embedding_size=%d' % (nr_of_active_labels, self.feat_dim))
def __init__(self, datafile, ndims=0, language='en', L1_normalize=0, L2_normalize=0):
    """Initialise the base encoder and load the word vectors for ``language``.

    For ``language == 'en'`` the vectors are opened with ``BigFile``;
    otherwise ``datafile`` is loaded with ``w2v.Word2Vec.load``.

    When ``ndims`` is 0, ``self.ndims`` is taken from the English store, or
    set to 500 for any other language. When ``ndims`` is non-zero it is
    checked against the English store (AssertionError on mismatch) or merely
    printed for other languages.
    """
    Text2Vec.__init__(self, datafile, ndims, language, L1_normalize, L2_normalize)
    is_english = 'en' == language
    self.word2vec = BigFile(datafile) if is_english else w2v.Word2Vec.load(datafile)

    if ndims != 0:
        if is_english:
            assert self.word2vec.ndims == self.ndims, "feat dimension is not match %d != %d" % (
                self.word2vec.ndims, self.ndims)
        else:
            # Single-argument call form prints the same text under the print
            # statement and the print function.
            print('ndims # %s' % (ndims,))
    else:
        self.ndims = self.word2vec.ndims if is_english else 500
def get_en_we_parameter(vocabulary, word2vec_file):
    """Build the initial word-embedding matrix for ``vocabulary``.

    Row 0 is all zeros (reserved for masking via pad_sequences); row ``i + 1``
    belongs to the i-th vocabulary word. A word whose pre-trained vector cannot
    be read gets a vector drawn uniformly from [-1, 1).

    Args:
        vocabulary: iterable of words (surrounding whitespace is stripped).
        word2vec_file: location handed to ``BigFile``.

    Returns:
        numpy array with ``len(vocabulary) + 1`` rows.
    """
    print('getting inital word embedding ...')
    w2v_reader = BigFile(word2vec_file)
    ndims = w2v_reader.ndims
    fail_counter = 0

    # Reserve 0 for masking via pad_sequences
    we = [[0] * ndims]

    for word in vocabulary:
        word = word.strip()
        try:
            vec = w2v_reader.read_one(word)
        except Exception:
            # The failure mode of read_one for an unknown word is not visible
            # here, so any read error is treated as "no pre-trained vector".
            vec = np.random.uniform(-1, 1, ndims)
            fail_counter += 1
        we.append(vec)

    # Report the fallbacks and hand the matrix back to the caller; without
    # the return the whole computation would be discarded.
    print("%d words out of %d words cannot find pre-trained word2vec vector" % (fail_counter, len(vocabulary)))
    return np.array(we)
def __init__(self, Y1=DEFAULT_Y1, label_vec_name=DEFAULT_LABEL_VEC_NAME, rootpath=ROOT_PATH):
    """Load every label of the ``Y1`` label set together with its vector.

    The labels are the names stored in
    ``<rootpath>/synset2vec/<Y1>/<label_vec_name>``. ``label_vectors[i]`` is
    the numpy vector of ``labels[i]``, or None if the store did not return it.
    """
    store = BigFile(os.path.join(rootpath, 'synset2vec', Y1, label_vec_name))
    self.labels = store.names
    self.nr_of_labels = len(self.labels)
    self.feat_dim = store.ndims

    renamed, vectors = store.read(self.labels)
    # position of every returned name inside `vectors`
    position = {}
    for pos, name in enumerate(renamed):
        position[name] = pos

    self.label_vectors = [
        np.array(vectors[position[label]]) if label in position else None
        for label in self.labels
    ]

    nr_of_active_labels = sum(1 for vec in self.label_vectors if vec is not None)
    logger.info('#active_labels=%d, embedding_size=%d',
                nr_of_active_labels, self.feat_dim)
class ImageSimer:
    """Compare a dev-set image against training-set images by cosine similarity."""

    def __init__(self, dev_feat_path, train_feat_path):
        """Open the dev and train feature stores."""
        self.dev_feats = BigFile(dev_feat_path)
        self.train_feats = BigFile(train_feat_path)

    def calsimImage(self, img, imgs):
        """Return ``calImageSimiByCos`` of ``img`` against the images ``imgs``."""
        imgfeat = self.dev_feats.read_one(img)
        renamed, test_X = self.train_feats.read(imgs)

        # Put the returned features back into the order of `imgs`.
        # NOTE(review): the buffer is sized by the number of returned names,
        # so this presumably expects every requested image to be found - confirm.
        resorted_feats = [None] * len(renamed)
        for pos, name in enumerate(renamed):
            resorted_feats[imgs.index(name)] = test_X[pos]

        return calImageSimiByCos(imgfeat, resorted_feats)

    def calsimiImagewithClick(self, img, img_click_list, clickthres):
        """Return the mean of similarity * log(click count) over clicked images.

        Only the pairs of ``img_click_list`` whose click count (second item)
        is at least ``clickthres`` take part.
        """
        imgfeat = self.dev_feats.read_one(img)

        kept = [(pair[0], int(pair[1])) for pair in img_click_list
                if int(pair[1]) >= clickthres]
        img_list = [name for name, _ in kept]
        clc_list = [clicks for _, clicks in kept]
        assert (len(img_list) == len(clc_list))

        renamed, test_X = self.train_feats.read(img_list)

        # re-sort the label list according to the renamed
        resorted_feats = [None] * len(renamed)
        for pos, name in enumerate(renamed):
            resorted_feats[img_list.index(name)] = test_X[pos]

        img_simi = calImageSimiByCos(imgfeat, resorted_feats)
        weighted = np.array(img_simi) * np.log(clc_list)
        return sum(weighted) / len(img_list)
class ImageSimer:
    """Compare a dev-set image against training-set images by cosine similarity."""

    def __init__(self, dev_feat_path, train_feat_path):
        """Open the dev and train feature stores."""
        self.dev_feats = BigFile(dev_feat_path)
        self.train_feats = BigFile(train_feat_path)

    def calsimImage(self, img, imgs):
        """Return ``calImageSimiByCos`` of ``img`` against the images ``imgs``."""
        imgfeat = self.dev_feats.read_one(img)
        renamed, test_X = self.train_feats.read(imgs)

        # Put the returned features back into the order of `imgs`.
        # NOTE(review): the buffer is sized by the number of returned names,
        # so this presumably expects every requested image to be found - confirm.
        resorted_feats = [None] * len(renamed)
        for pos, name in enumerate(renamed):
            resorted_feats[imgs.index(name)] = test_X[pos]

        return calImageSimiByCos(imgfeat, resorted_feats)

    def calsimiImagewithClick(self, img, img_click_list, clickthres):
        """Return the mean of similarity * log(click count) over clicked images.

        Only the pairs of ``img_click_list`` whose click count (second item)
        is at least ``clickthres`` take part.
        """
        imgfeat = self.dev_feats.read_one(img)

        kept = [(pair[0], int(pair[1])) for pair in img_click_list
                if int(pair[1]) >= clickthres]
        img_list = [name for name, _ in kept]
        clc_list = [clicks for _, clicks in kept]
        assert (len(img_list) == len(clc_list))

        renamed, test_X = self.train_feats.read(img_list)

        # re-sort the label list according to the renamed
        resorted_feats = [None] * len(renamed)
        for pos, name in enumerate(renamed):
            resorted_feats[img_list.index(name)] = test_X[pos]

        img_simi = calImageSimiByCos(imgfeat, resorted_feats)
        weighted = np.array(img_simi) * np.log(clc_list)
        return sum(weighted) / len(img_list)
class Devise_pre(object):
    """Score images against a text query with a pickled, pre-trained model."""

    def __init__(self, model_path, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
        """Load the pickled model and compile its scoring function.

        Args:
            model_path: pickle file containing the trained model object.
            corpus: corpus name forwarded to ``Query2Vec``.
            word2vec_model: word2vec model name forwarded to ``Query2Vec``.
            feat_path: location of the image features, handed to ``BigFile``.
            rootpath: root directory forwarded to ``Query2Vec``.
        """
        self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
        self.img_feats = BigFile(feat_path)

        print(model_path)
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the handle deterministically.
        with open(model_path, 'rb') as fh:
            devise_model = cPickle.load(fh)

        words_vec = T.matrix(dtype=theano.config.floatX)
        img_vec = T.matrix(dtype=theano.config.floatX)

        # compile a predictor function
        self.predict_model = theano.function(
            inputs=[words_vec, img_vec],
            outputs=devise_model.predict_score_one2many(words_vec, img_vec),
            allow_input_downcast=True)

    def predict_score(self, query, iid_list, normalization = 'L2'):
        """Return one score per id in ``iid_list`` for ``query``.

        Args:
            query: query text handed to ``Query2Vec.embedding``.
            iid_list: image ids to score.
            normalization: ``'L2'`` divides the query vector by its L2 norm;
                any other value leaves it untouched.

        Returns:
            List of scores in the order of ``iid_list``, or an empty list when
            the query cannot be embedded.

        Raises:
            ValueError: if an id is missing from the feature store's answer.
        """
        qvec = self.qry2vec.embedding(query)
        if qvec is None:
            return []

        # L2 normalization
        if normalization == 'L2':
            qvec = qvec / LA.norm(qvec, 2)

        renamed, test_X = self.img_feats.read(iid_list)
        # The store may answer in a different order; look each id up by name.
        X = [test_X[renamed.index(iid)] for iid in iid_list]

        query_array = np.array([qvec])
        image_array = np.array(X)
        return self.predict_model(query_array, image_array)[0].tolist()
class AveWord2Vec(Text2Vec):
    """Encode a sentence as the average of the word2vec vectors of its words."""

    # datafile: the path of pre-trained word2vec data
    def __init__(self, datafile, ndims=0, L1_normalize=0, L2_normalize=0):
        """Initialise the base encoder and open the word2vec store.

        When ``ndims`` is 0 the dimensionality is taken from the store;
        otherwise the store must agree with ``self.ndims``.
        """
        Text2Vec.__init__(self, datafile, ndims, L1_normalize, L2_normalize)
        self.word2vec = BigFile(datafile)

        if ndims == 0:
            self.ndims = self.word2vec.ndims
            return

        stored_dims = self.word2vec.ndims
        assert stored_dims == self.ndims, "feat dimension is not match %d != %d" % (
            stored_dims, self.ndims)

    def preprocess(self, query, clear):
        """Tokenise ``query``: via ``clean_str`` if ``clear``, else by whitespace."""
        return clean_str(query) if clear else query.strip().split()

    def mapping(self, query, clear=True):
        """Return the (optionally normalised) mean word vector, or None.

        None is returned when none of the words has a vector. L1
        normalisation takes precedence over L2 when both flags are set.
        """
        words = self.preprocess(query, clear)
        renamed, vectors = self.word2vec.read(words)

        # When the store answers with a different number of names than were
        # asked for, rebuild one vector per requested word from the answer.
        if len(renamed) != len(words):
            by_name = dict(zip(renamed, vectors))
            vectors = [by_name[word] for word in words if word in by_name]

        if len(vectors) == 0:
            return None

        vec = np.array(vectors).mean(axis=0)
        if self.L1_normalize:
            return self.do_L1_norm(vec)
        if self.L2_normalize:
            return self.do_L2_norm(vec)
        return vec
class PSI_pre(object):
    """Score images against a bag-of-words query with a pickled model."""

    def __init__(self, model_path, bow_path, feat_path):
        """Load the bag-of-words transformers and model, then compile the scorer.

        Args:
            model_path: pickle file containing the trained model object.
            bow_path: pickle file containing the pair
                ``(count_vect, tf_transformer)``.
            feat_path: location of the image features, handed to ``BigFile``.
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context managers close both handles deterministically.
        with open(bow_path, 'rb') as fh:
            self.count_vect, self.tf_transformer = cPickle.load(fh)
        self.img_feats = BigFile(feat_path)

        with open(model_path, 'rb') as fh:
            devise_model = cPickle.load(fh)

        # The query side is a sparse matrix, the image side a dense one.
        words_vec = sparse.csr_matrix(dtype=theano.config.floatX)
        img_vec = T.matrix(dtype=theano.config.floatX)

        # compile a predictor function
        self.predict_model = theano.function(
            inputs=[words_vec, img_vec],
            outputs=devise_model.predict_score_one2many(words_vec, img_vec),
            allow_input_downcast=True)

    def predict_score(self, query, iid_list):
        """Return one score per id in ``iid_list`` for ``query``.

        The query is vectorised with ``count_vect`` followed by
        ``tf_transformer``; the model output is flattened into a plain list.

        Raises:
            ValueError: if an id is missing from the feature store's answer.
        """
        test_counts = self.count_vect.transform([query])
        query_vec = self.tf_transformer.transform(test_counts)

        renamed, test_X = self.img_feats.read(iid_list)
        # The store may answer in a different order; look each id up by name.
        X = [test_X[renamed.index(iid)] for iid in iid_list]

        image_array = np.array(X)
        temp = self.predict_model(query_vec, image_array)
        return np.reshape(temp, (1, -1))[0].tolist()
def process(options, collection):
    """Rank every query's annotated images by Parzen score and report DCG@25.

    options: object providing ``rootpath``, ``overwrite``, ``feature``,
        ``method`` and ``sigma``.
    collection: name of the collection under ``options.rootpath``.

    Exits the process with status 0 when checkToSkip says a result path
    should be skipped. Ranking results are flushed every 20 queries; the
    ranking path is finally appended to result/individual_result_pathes.txt.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex',
                                       collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(
            collection, 'concepts%s.txt' % collection, qid, False, rootpath)
        renamed, test_X = img_feats.read(iid_list)

        parzen_list = [calParzen(img_feats.read_one(imidx), test_X, sigma)
                       for imidx in iid_list]

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list),
                              key=lambda v: v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            # Flush periodically so the pending dict stays small.
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)

    # Guard the average: with no queries the plain division raised
    # ZeroDivisionError after all results had already been written.
    dcg_values = list(qid2dcg.values())
    mean_dcg = 1.0 * sum(dcg_values) / len(dcg_values) if dcg_values else 0.0
    print("average DCG@25: %f" % mean_dcg)

    result_path_file = "result/individual_result_pathes.txt"
    if not os.path.exists(result_path_file):
        makedirsforfile(result_path_file)
    # Append mode creates the file when it is missing; the with-block closes
    # the handle even if the write fails.
    with open(result_path_file, 'a') as fout:
        fout.write(ranking_result_path + '\n')
def process(options, trainCollection, trainAnnotationName, feature):
    """Train and save one FIK-SVM model per concept.

    options: object providing ``rootpath``, ``overwrite``, ``numjobs``,
        ``job``, ``nr_bins`` and ``best_param_dir``.
    trainCollection, trainAnnotationName, feature: select the annotations
        and the feature directory under ``options.rootpath``.

    When ``options.best_param_dir`` is set, C and the sigmoid parameters A, B
    are parsed from ``<best_param_dir>/<concept>.txt``; otherwise C=1, A=B=0.

    Returns the number of concepts handled by this job (0 if none is left).
    Raises ValueError when a parameter file cannot be parsed.
    """
    import re
    # Parameter files are written with %g, which switches to exponent
    # notation for small/large values (e.g. "a=-1.2e-05"). The character
    # classes therefore accept sign and exponent characters; the previous
    # digits-and-dots pattern failed to match or silently truncated b.
    p = re.compile(r'best_C=(?P<C>[-+.\deE]+),\sa=(?P<a>[-+.\deE]+),\sb=(?P<b>[-+.\deE]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5

    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             trainAnnotationName, feature, modelName)
    todo = [concept for concept in concepts
            if not checkToSkip(os.path.join(resultdir, concept + '.model'), overwrite)]
    # Keep only the share of the work that belongs to this (1-based) job.
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}
    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = [float(v) for v in f.readline().split()]
        params['max_vals'] = [float(v) for v in f.readline().split()]

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            with open(param_file) as param_fh:
                first_line = param_fh.readline().strip()
            m = p.search(first_line)
            if m is None:
                raise ValueError('cannot parse best parameters from %s: %r'
                                 % (param_file, first_line))
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName,
                                            concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]

        # Class weights balance positives and negatives (beta = 0.5).
        # Named n_pos/n_neg so the numpy alias `np` is not shadowed.
        n_pos = len([1 for lab in labels if 1 == lab])
        n_neg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (n_pos + n_neg) / n_pos
        wn = (1.0 - beta) * (n_pos + n_neg) / n_neg

        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
        model = svm_train(y, vectors,
                          svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        fiksvm_load_model(model_file_name)
        assert(abs(newmodel.get_probAB()[0] - A) < 1e-6)
        assert(abs(newmodel.get_probAB()[1] - B) < 1e-6)

    return len(todo)
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName, feature, modelName):
    """Pick the best SVM cost C per concept on a validation set and save it.

    options: object providing ``rootpath``, ``metric``, ``overwrite``,
        ``numjobs``, ``job`` and (for 'fik') ``nr_bins``.
    modelName: 'fik' or 'fastlinear'.

    For every concept, one model per candidate C is trained on the training
    collection and scored on the validation collection with the metric from
    ``options.metric``. The best score, best C and the sigmoid parameters
    returned by sigmoid_train are written to ``<concept>.txt`` in the
    best_params result directory.

    Returns 0 when no concept is left to process for this job.
    """
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  # options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}
    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection, valAnnotationName, rootpath=rootpath)
    # Both annotation sets must list the same concepts in the same order.
    for i in range(len(concepts)):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(
        rootpath, trainCollection, 'Models', trainAnnotationName,
        '%s,best_params' % modelName,
        '%s,%s,%s' % (valCollection, valAnnotationName, feature))
    todo = [concept for concept in concepts
            if not checkToSkip(os.path.join(resultdir, concept + '.txt'), overwrite)]
    # Keep only the share of the work that belongs to this (1-based) job.
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(
        os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName,
                                            concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]

        # Named n_pos/n_neg so the numpy alias `np` is not shadowed.
        n_pos = len([1 for lab in labels if 1 == lab])
        n_neg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (n_pos + n_neg) / n_pos
        wn = (1.0 - beta) * (n_pos + n_neg) / n_neg

        names, labels = readAnnotationsFrom(valCollection, valAnnotationName,
                                            concept, skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None

        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C
            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(name, new_model.predict(vec))
                        for name, vec in zip(val_renamed, val_vectors)]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)

            # Always accept the first candidate: when every C scores 0.0 the
            # strict comparison alone never fired and sigmoid_train was then
            # called with None.
            if best_scores is None or max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        printStatus(
            INFO,
            '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' %
            (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        with open(resultfile, 'w') as fw:
            fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
def process(options, collection):
    """Rank every query's annotated images by Parzen score and report DCG@25.

    options: object providing ``rootpath``, ``overwrite``, ``feature``,
        ``method`` and ``sigma``.
    collection: name of the collection under ``options.rootpath``.

    Exits the process with status 0 when checkToSkip says a result path
    should be skipped. Ranking results are flushed every 20 queries; the
    ranking path is finally appended to result/individual_result_pathes.txt.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex',
                                       collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(
            collection, 'concepts%s.txt' % collection, qid, False, rootpath)
        renamed, test_X = img_feats.read(iid_list)

        parzen_list = [calParzen(img_feats.read_one(imidx), test_X, sigma)
                       for imidx in iid_list]

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list),
                              key=lambda v: v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            # Flush periodically so the pending dict stays small.
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)

    # Guard the average: with no queries the plain division raised
    # ZeroDivisionError after all results had already been written.
    dcg_values = list(qid2dcg.values())
    mean_dcg = 1.0 * sum(dcg_values) / len(dcg_values) if dcg_values else 0.0
    print("average DCG@25: %f" % mean_dcg)

    result_path_file = "result/individual_result_pathes.txt"
    if not os.path.exists(result_path_file):
        makedirsforfile(result_path_file)
    # Append mode creates the file when it is missing; the with-block closes
    # the handle even if the write fails.
    with open(result_path_file, 'a') as fout:
        fout.write(ranking_result_path + '\n')
class sentence:
    """Sentence suggestion for images plus storage of user submissions.

    Suggestions come from a nearest-neighbour search over sentence features;
    submissions are kept in the STATE table of the sqlite database
    ``db_file``. All SQL uses bound parameters, so quotes inside
    user-written sentences can neither break nor alter a statement.
    """

    rootpath = ROOT_PATH
    vis_feat = VIS_FEAT
    sent_collection = SENT_COLLECTION
    img_collection = IMG_COLLECTION
    sent_feat_dir = os.path.join(rootpath, sent_collection, "FeatureData", vis_feat)
    sent_id_file = os.path.join(sent_feat_dir, 'id.txt')
    shape_file = os.path.join(sent_feat_dir, 'shape.txt')
    sent_file = os.path.join(rootpath, sent_collection, 'TextData', '%s.txt' % sent_collection)

    def __init__(self, db_file):
        """Load the sentence pool, the sentence searcher and the image set."""
        with open(self.shape_file) as shape_fh:
            self.nr_of_sents, self.feat_dim = map(int, shape_fh.readline().split())
        with open(self.sent_file) as sent_fh:
            self.sent_pool = [line.strip() for line in sent_fh.readlines()]
        self.sent_searcher = load_model(os.path.join(self.sent_feat_dir, 'feature.bin'),
                                        self.feat_dim, self.nr_of_sents, self.sent_id_file)
        self.sent_searcher.set_distance('cosine')

        feat_dir = os.path.join(self.rootpath, self.img_collection, "FeatureData", self.vis_feat)
        self.vis_feat_file = BigFile(feat_dir)
        image_set_path = os.path.join(self.rootpath, self.img_collection, "ImageSets",
                                      "%s.txt" % self.img_collection)
        with open(image_set_path, 'r') as imageSetFile:
            self.imageSet = imageSetFile.readlines()
        self.db_file = db_file

    def getSentence(self, imageID):
        """Return up to five suggested sentences for the image at index ``imageID``."""
        image = [self.imageSet[imageID].replace("\n", "")]
        renamed, vectors = self.vis_feat_file.read(image)
        result = []
        for name, vector in zip(renamed, vectors):
            sent_list = self.sent_searcher.search_knn(vector, max_hits=10)
            # A %s placeholder is required; passing the name as a bare extra
            # argument made the logging call fail to format its message.
            logger.info('query img %s', name)
            for sent_id, distance in sent_list[:5]:
                # The sentence index is what follows the first 4 characters
                # of the id.
                text = self.sent_pool[int(sent_id[4:])].decode('utf-8')
                logger.info(text)
                result.append(text)
            print('')
        return result

    def save_sentence(self, user_id, image_id, submit_time, suggested_sentence, rank, submitted_sentence, labels, real_id):
        """Insert the submission for (user_id, image_id), or update it if one exists."""
        suggested = suggested_sentence.decode('gbk')
        submitted = submitted_sentence.decode('gbk')
        conn = sqlite3.connect(self.db_file)
        try:
            conn.text_factory = str
            cursor = conn.execute(
                "SELECT user_id FROM STATE WHERE user_id = ? AND image_id = ?",
                (int(user_id), int(image_id)))
            judge = -1
            for row in cursor:
                judge = row[0]
            if judge == -1:
                conn.execute(
                    "INSERT INTO STATE (USER_ID, IMAGE_ID, SUBMIT_TIME, SUGGESTED_SENTENCE, "
                    "RANK, SUBMITTED_SENTENCE, SUBMITTED_LABEL, REAL_IMAGE_ID) "
                    "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                    (int(user_id), int(image_id), float(submit_time), suggested,
                     int(rank), submitted, labels, int(real_id)))
            else:
                conn.execute(
                    "UPDATE STATE SET submit_time = ?, suggested_sentence = ?, rank = ?, "
                    "submitted_sentence = ?, submitted_label = ? "
                    "WHERE user_id = ? AND image_id = ?",
                    (float(submit_time), suggested, int(rank), submitted, labels,
                     int(user_id), int(image_id)))
            conn.commit()
        finally:
            conn.close()

    def get_sentence(self, user_id, page):
        """Return ``(ok, records, page_count)`` for one page of a user's submissions.

        Returns ``(False, None, None)`` when ``page`` lies beyond the data.
        """
        import math
        import userControl as u

        data = []
        img = image.image(self.db_file)
        offset = (page - 1) * PAGE_LIMIT
        conn = sqlite3.connect(self.db_file)
        try:
            conn.text_factory = str
            count = conn.execute(
                "SELECT count(image_id) FROM STATE WHERE user_id = ?",
                (int(user_id),)).fetchone()[0]
            if offset > count:
                return False, None, None

            cursor = conn.execute(
                "SELECT user_id, image_id, submit_time, suggested_sentence, rank, "
                "submitted_sentence, submitted_label FROM STATE "
                "WHERE user_id = ? ORDER BY submit_time DESC LIMIT ? OFFSET ?",
                (int(user_id), int(PAGE_LIMIT), int(offset)))
            user_control = u.user(self.db_file)
            for row in cursor:
                j, iid, image_id = user_control.getimageid(user_id, row[1])
                data.append({
                    'image_id': image_id,
                    'url': IMAGE_ROOT + img.getimagename(iid),
                    'submit_time': row[2],
                    'suggested_sentence': row[3].encode('gbk'),
                    'rank': row[4],
                    'submitted_sentence': row[5].encode('gbk'),
                    'submitted_label': row[6],
                })
        finally:
            # Also runs on the early return above, which used to leak the
            # connection.
            conn.close()
        return True, data, math.ceil(float(count) / PAGE_LIMIT)

    def get_sentence_by_imageid(self, user_id, image_id):
        """Return ``(rank, sentence, labels)`` of a submission, or ``(0, '', '')`` if none."""
        conn = sqlite3.connect(self.db_file)
        try:
            conn.text_factory = str
            cursor = conn.execute(
                "SELECT user_id, image_id, submit_time, suggested_sentence, rank, "
                "submitted_sentence, submitted_label FROM STATE "
                "WHERE user_id = ? AND image_id = ?",
                (int(user_id), int(image_id)))
            count = 0
            # If several rows match, the last one wins.
            for row in cursor:
                submitted_label = row[6]
                submitted_sentence = row[5]
                rank = row[4]
                count += 1
        finally:
            conn.close()
        if count == 0:
            return 0, '', ''
        logger.info(submitted_label)
        # Drop empty labels before joining them back together.
        labels = [label for label in submitted_label.split(', ') if label]
        return rank, submitted_sentence.encode('gbk'), ', '.join(labels)

    def getNumber(self, user_id):
        """Return how many STATE rows belong to ``user_id``."""
        conn = sqlite3.connect(self.db_file)
        try:
            conn.text_factory = str
            # Let the database count instead of fetching every row.
            return conn.execute(
                "SELECT count(*) FROM STATE WHERE user_id = ?",
                (int(user_id),)).fetchone()[0]
        finally:
            conn.close()

    def getAll(self, start, end):
        """Return ``[user_id, count, username]`` per user with submissions in (start, end)."""
        user_control = userControl.user(self.db_file)
        conn = sqlite3.connect(self.db_file)
        try:
            conn.text_factory = str
            cursor = conn.execute(
                "SELECT user_id, count(image_id) FROM STATE "
                "WHERE SUBMIT_TIME < ? AND SUBMIT_TIME > ? GROUP BY user_id",
                (float(end), float(start)))
            logger.info("SELECT user_id, count(image_id) FROM STATE WHERE SUBMIT_TIME < %f AND SUBMIT_TIME > %f GROUP BY user_id" % (end, start))
            return [[row[0], row[1], user_control.getusername(row[0])] for row in cursor]
        finally:
            conn.close()

    def get_image_info(self, image_id):
        """Return ``[username, sentence, label]`` for every submission on a real image id."""
        user_control = userControl.user(self.db_file)
        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.execute(
                "SELECT user_id, submitted_sentence, submitted_label FROM STATE "
                "WHERE real_image_id = ?",
                (int(image_id),))
            return [[user_control.getusername(row[0]), row[1], row[2]] for row in cursor]
        finally:
            conn.close()
def __init__(self, corpus, modelName, rootpath=ROOT_PATH):
    """Open the word2vec BigFile stored at <rootpath>/word2vec/<corpus>/<modelName>."""
    path_parts = (rootpath, "word2vec", corpus, modelName)
    self.word2vec = BigFile(os.path.join(*path_parts))
# Script entry point: sets up a zero-shot tagger and an image-to-vector
# embedder, then walks the test image set in blocks of `blocksize` images.
if __name__ == '__main__':
    rootpath = ROOT_PATH
    embedding_model = 'hierse2'
    embedding_name = 'flickr4m,tagvec500,%s' % embedding_model
    tagger = ZeroshotTagger(embedding_name = embedding_name)
    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', 'imagenet1k', embedding_name)
    from im2vec import Image2Vec
    i2v = Image2Vec(label_file, label2vec_dir)
    from basic.util import readImageSet
    testCollection = 'imagenet2hop'
    imset = readImageSet(testCollection, 'random100k', rootpath)
    feature = 'dascaffeprob'
    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
    blocksize = 1000
    start = 0
    from eval import HitScorer
    # One scorer per cut-off value passed to HitScorer.
    scorers = [HitScorer(n) for n in [1, 2, 5, 10]]
    # Running totals, one slot per scorer.
    overall_perf = [0.0] * len(scorers)
    nr_of_images = 0
    while start < len(imset):
        end = min(len(imset), start + blocksize)
        renamed, vectors = feat_file.read(imset[start:end])
        # NOTE(review): the snippet ends at this loop header; its body is not
        # visible here.
        for _id,_vec in zip(renamed, vectors):
from basic.annotationtable import readAnnotationsFrom
from simpleknn.bigfile import BigFile

# Experiment configuration: collections, annotation files and the feature.
ROOT_PATH = '/home/root123/xirong/VisualSearch'
rootpath = ROOT_PATH
trainCollection = 'flickr81train'
trainAnnotationName = 'concepts81train.random50.0.random50.0.txt'
testCollection = "flickr81test"
testAnnotationName = 'conceptsflickr81test.txt'
feature = "dascaffeprob"
feat_dim = 1000
scorer = getScorer("AP")
# The concept to work on is the first command-line argument.
targetConcept = sys.argv[1] #"aeroplane"

train_feat_file = BigFile(os.path.join(ROOT_PATH, trainCollection, "FeatureData", feature), feat_dim)
test_feat_file = BigFile(os.path.join(ROOT_PATH, testCollection, "FeatureData", feature), feat_dim)
testImageSet = test_feat_file.names #random.sample(test_feat_file.names, 10000)

# minmax.txt: the first line is read as min_vals, the second as max_vals.
minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
with open(minmax_file, 'r') as f:
    min_vals = map(float, str.split(f.readline()))
    max_vals = map(float, str.split(f.readline()))

# Training annotations, with labels re-ordered to follow the names returned
# by the feature file.
[names,labels] = readAnnotationsFrom(collection=trainCollection, annotationName=trainAnnotationName, concept=targetConcept, rootpath=rootpath)
name2label = dict(zip(names,labels))
(renamed, vectors) = train_feat_file.read(names)
relabeled = [name2label[x] for x in renamed] #label is either 1 or -1

# Test annotations; note that this rebinds `names` and `labels`.
[names,labels] = readAnnotationsFrom(collection=testCollection, annotationName=testAnnotationName, concept=targetConcept, rootpath=rootpath)
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName, feature, modelName):
    """Pick the best SVM cost C per concept on a validation set and save it.

    options: object providing ``rootpath``, ``metric``, ``overwrite``,
        ``numjobs``, ``job`` and (for 'fik') ``nr_bins``.
    modelName: 'fik' or 'fastlinear'.

    For every concept, one model per candidate C is trained on the training
    collection and scored on the validation collection with the metric from
    ``options.metric``. The best score, best C and the sigmoid parameters
    returned by sigmoid_train are written to ``<concept>.txt`` in the
    best_params result directory.

    Returns 0 when no concept is left to process for this job.
    """
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  # options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}
    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection, valAnnotationName, rootpath=rootpath)
    # Both annotation sets must list the same concepts in the same order.
    for i in range(len(concepts)):
        assert(concepts[i] == valConcepts[i])

    resultdir = os.path.join(
        rootpath, trainCollection, 'Models', trainAnnotationName,
        '%s,best_params' % modelName,
        '%s,%s,%s' % (valCollection, valAnnotationName, feature))
    todo = [concept for concept in concepts
            if not checkToSkip(os.path.join(resultdir, concept + '.txt'), overwrite)]
    # Keep only the share of the work that belongs to this (1-based) job.
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert(feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName,
                                            concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]

        # Named n_pos/n_neg so the numpy alias `np` is not shadowed.
        n_pos = len([1 for lab in labels if 1 == lab])
        n_neg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (n_pos + n_neg) / n_pos
        wn = (1.0 - beta) * (n_pos + n_neg) / n_neg

        names, labels = readAnnotationsFrom(valCollection, valAnnotationName,
                                            concept, skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None

        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C
            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(name, new_model.predict(vec))
                        for name, vec in zip(val_renamed, val_vectors)]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)

            # Always accept the first candidate: when every C scores 0.0 the
            # strict comparison alone never fired and sigmoid_train was then
            # called with None.
            if best_scores is None or max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        printStatus(INFO, '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' % (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        with open(resultfile, 'w') as fw:
            fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
class ImageSimer:
    """Image similarity helpers backed by two BigFile feature stores.

    ``dev_feats`` holds the query-side images and ``train_feats`` the
    reference images. The feature store may return names in a different order
    than requested, so every method re-aligns features or weights by name.
    """

    def __init__(self, dev_feat_path, train_feat_path):
        self.dev_feats = BigFile(dev_feat_path)
        self.train_feats = BigFile(train_feat_path)

    def _weights_in_read_order(self, requested, weightes, renamed):
        """Return the weight of each name in ``renamed``.

        The weight is taken from the first entry of ``requested`` carrying
        that name, which is what ``requested.index(name)`` used to pick.
        """
        first_weight = {}
        for name, weight in zip(requested, weightes):
            first_weight.setdefault(name, weight)
        return [first_weight[name] for name in renamed]

    def calsimImage(self, img, imgs):
        """Return the similarity between ``img`` and each image of ``imgs``, in ``imgs`` order.

        Names of ``imgs`` that have no stored feature are skipped.
        """
        imgfeat = self.dev_feats.read_one(img)
        renamed, test_X = self.train_feats.read(imgs)
        # A name -> feature map replaces the fixed-size buffer indexed by
        # imgs.index(...), which overflowed or kept None holes whenever a
        # name was missing or repeated.
        feat_of = dict(zip(renamed, test_X))
        resorted_feats = [feat_of[name] for name in imgs if name in feat_of]
        return calImageSimiByCos(imgfeat, resorted_feats)

    def calsimiImagewithClick(self, img, img_click_list, clickthres):
        """Return the mean of similarity * log(clicks) over images with enough clicks.

        img_click_list: sequence of ``(image, clicks)`` pairs. Pairs with
        fewer than ``clickthres`` clicks, or without a stored feature, are
        ignored. Returns 0.0 when nothing is left to score.
        """
        imgfeat = self.dev_feats.read_one(img)
        clicked = [(x[0], int(x[1])) for x in img_click_list if int(x[1]) >= clickthres]
        if not clicked:
            # Previously this case ended in a division by zero.
            return 0.0

        renamed, test_X = self.train_feats.read([name for name, _ in clicked])
        feat_of = dict(zip(renamed, test_X))
        scored = [(feat_of[name], clicks) for name, clicks in clicked if name in feat_of]
        if not scored:
            return 0.0

        img_simi = calImageSimiByCos(imgfeat, [feat for feat, _ in scored])
        clc_list = [clicks for _, clicks in scored]
        return sum(np.array(img_simi) * np.log(clc_list)) / len(scored)

    def clasimiImgwithWeightImgs(self, img, imgs, weightes):
        """Return the weighted average similarity between ``img`` and ``imgs``.

        Weights are normalized to sum to 1 over the images that were found.
        """
        assert(len(imgs) == len(weightes))
        imgfeat = self.dev_feats.read_one(img)
        renamed, feats = self.train_feats.read(imgs)

        # Re-order the weights to follow the order of the returned features.
        resorted_weight = self._weights_in_read_order(imgs, weightes, renamed)

        simi_list = calImageSimiByCos(imgfeat, feats)
        # Force float division: integer weights would otherwise floor to 0
        # under Python 2.
        normal_weight = np.array(resorted_weight, dtype=float) / sum(resorted_weight)
        return np.dot(normal_weight, np.array(simi_list))

    def simiImgs_WeightImgs(self, t_img_list, s_img_list, weightes):
        """Return, for each image of ``t_img_list``, its weighted cosine similarity to ``s_img_list``.

        Weights are normalized to sum to 1 over the source images that were
        found. Raises KeyError when a target image has no stored feature.
        """
        assert(len(s_img_list) == len(weightes))
        t_renamed, t_feats = self.dev_feats.read(t_img_list)
        s_renamed, s_feats = self.train_feats.read(s_img_list)

        # Re-order the weights to follow the order of the returned features.
        resorted_weight = self._weights_in_read_order(s_img_list, weightes, s_renamed)
        normal_weight = np.array(resorted_weight, dtype=float) / sum(resorted_weight)

        # cdist returns cosine distance; similarity = 1 - distance.
        cosineSimi = -(distance.cdist(t_feats, s_feats, 'cosine') - 1)
        weightSimi = np.dot(cosineSimi, normal_weight)
        renamed2sim = dict(zip(t_renamed, list(weightSimi)))
        return [renamed2sim[key] for key in t_img_list]
# Evaluation settings. modelName is assigned twice; the second value
# ('fik50') is the one in effect.
testAnnotationName = 'conceptsvoc2008val.txt'
feature = 'dsift'
modelName = 'fastlinear'
modelName = 'fik50'
metric = 'AP'
scorer = getScorer(metric)

# Pick the model loader that matches the model family.
if modelName.startswith('fik'):
    from fiksvm.fiksvm import fiksvm_load_model as load_model
else:
    from fastlinear.fastlinear import fastlinear_load_model as load_model

# Read the features of the whole test image set once, up front.
test_imset = readImageSet(testCollection, testCollection, rootpath=rootpath)
test_feat_file = BigFile(os.path.join(rootpath,testCollection,'FeatureData',feature))
test_renamed, test_vectors = test_feat_file.read(test_imset)

concepts = readConcepts(testCollection, testAnnotationName, rootpath=rootpath)

print ('### %s' % os.path.join(trainCollection, 'Models', trainAnnotationName, feature, modelName))
results = []

for concept in concepts:
    model_file_name = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName, '%s.model' % concept)
    model = load_model(model_file_name)
    # Score every test image with this concept's model, highest score first.
    ranklist = [(test_renamed[i], model.predict(test_vectors[i])) for i in range(len(test_renamed))]
    ranklist.sort(key=lambda v:v[1], reverse=True)
    names,labels = readAnnotationsFrom(testCollection, testAnnotationName, concept, skip_0=True, rootpath=rootpath)
def __init__(self, dev_feat_path, train_feat_path):
    """Open one BigFile per feature directory as ``dev_feats`` and ``train_feats``."""
    stores = (('dev_feats', dev_feat_path), ('train_feats', train_feat_path))
    for attr_name, feat_path in stores:
        setattr(self, attr_name, BigFile(feat_path))
def test_tagging(self):
    """Build synset vectors for each embedding model, then tag the test images.

    For every embedding model, build_synset_vec.py is run for both synset
    sets and the resulting shape.txt must exist. The images are then tagged
    with the imagenet1k label vectors and the averaged hit scores printed.
    """
    corpus = 'flickr4m'
    word2vec_model = 'tagvec500'
    testCollection = 'imagenet2hop-random2k'
    image_ids = readImageSet(testCollection, testCollection, rootpath)
    feature = 'dascaffeprob'
    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
    blocksize = 1000
    scorers = [HitScorer(n) for n in [1, 2, 5, 10]]
    overwrite = 1

    # Synset set name -> label file fed to build_synset_vec.py.
    synset_label_files = (
        ('imagenet1k', 'data/ilsvrc12/synsets.txt'),
        ('imagenet1k2hop', 'data/ilsvrc12/synsets2hop.txt'),
    )

    for embedding_model in ('conse', 'conse2', 'hierse', 'hierse2'):
        embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)

        for synset_name, label_file in synset_label_files:
            params = '%s %s --embedding %s --word2vec %s --corpus %s --overwrite %d' % (
                label_file, synset_name, embedding_model, word2vec_model, corpus, overwrite)
            os.system('python build_synset_vec.py %s' % params)
            shape_file = os.path.join(rootpath, 'synset2vec', synset_name,
                                      embedding_name, 'shape.txt')
            self.assertTrue(os.path.exists(shape_file), msg="%s is not ready" % synset_name)

        # Tagging always uses the imagenet1k label vectors.
        label2vec_dir = os.path.join(rootpath, 'synset2vec', 'imagenet1k', embedding_name)
        i2v = Image2Vec('data/ilsvrc12/synsets.txt', label2vec_dir)
        tagger = ZeroshotTagger(embedding_name = embedding_name)

        printStatus(INFO, 'tagging %d images' % len(image_ids))
        totals = [0.0] * len(scorers)
        nr_of_images = 0
        total_ids = len(image_ids)
        begin = 0
        while begin < total_ids:
            stop = min(total_ids, begin + blocksize)
            renamed, vectors = feat_file.read(image_ids[begin:stop])
            for _id, _vec in zip(renamed, vectors):
                # The ground-truth label is the part of the id before the
                # first underscore.
                truth = set([_id.split('_')[0]])
                pred = tagger.predict(i2v.embedding(_vec))
                hits = [int(item[0] in truth) for item in pred]
                totals = [acc + one.score(hits) for acc, one in zip(totals, scorers)]
                nr_of_images += 1
            begin = stop

        res = [total / nr_of_images for total in totals]
        separator = '_' * 100
        print(separator)
        print(embedding_name)
        print(' '.join([one.name() for one in scorers]))
        print(' '.join(['%.3f' % value for value in res]))
        print(separator)
# Evaluation settings; the test image set is named after the test collection.
testset = testCollection
testAnnotationName = 'conceptsvoc2008val.txt'
modelName = 'fik50'
#modelName = 'fastlinear'

# Pick the model loader that matches the model family.
if 'fastlinear' == modelName:
    from fastlinear.fastlinear import fastlinear_load_model as load_model
else:
    from fiksvm.fiksvm import fiksvm_load_model as load_model

scorer = getScorer(metric)

imset = readImageSet(testCollection,testset,rootpath=rootpath)
concepts = readConcepts(testCollection,testAnnotationName,rootpath=rootpath)
feat_dir = os.path.join(rootpath, testCollection, "FeatureData", feature)
feat_file = BigFile(feat_dir)

# Read the features of the whole image set once, up front.
_renamed, _vectors = feat_file.read(imset)

nr_of_images = len(_renamed)
nr_of_concepts = len(concepts)

mAP = 0.0
# One slot per concept.
models = [None] * len(concepts)

# A StreamFile is opened on the same feature directory as feat_file.
stream = StreamFile(feat_dir)

for i,concept in enumerate(concepts):
    model_file_name = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName, '%s.model'%concept)
    model = load_model(model_file_name)
    #print model.get_probAB()