import numpy as np
import torch
from bigfile import BigFile


def get_we(vocab, w2v_dir):
    """Build a word-embedding matrix for `vocab` from a BigFile word2vec dir.

    Words missing from the word2vec vocabulary keep their random init.
    """
    w2v = BigFile(w2v_dir)
    ndims = w2v.ndims
    nr_words = len(vocab)
    words = [vocab[i] for i in range(nr_words)]
    # Random init so out-of-vocabulary words still get a (trainable) vector.
    we = np.random.uniform(low=-1.0, high=1.0, size=(nr_words, ndims))
    renamed, vecs = w2v.read(words)
    for i, word in enumerate(renamed):
        idx = vocab.find(word)
        we[idx] = vecs[i]
    return torch.Tensor(we)
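# Usage sketch for get_we, assuming a vocabulary object that supports len(),
# integer indexing, and find(word) -> index. TinyVocab below is a hypothetical
# stand-in for the project's real vocabulary class, and the word2vec path is
# an assumption.
class TinyVocab(object):
    def __init__(self, words):
        self.words = list(words)
        self.word2idx = {w: i for i, w in enumerate(self.words)}

    def __len__(self):
        return len(self.words)

    def __getitem__(self, i):
        return self.words[i]

    def find(self, word):
        return self.word2idx[word]


vocab = TinyVocab(['cat', 'dog', 'car'])
we = get_we(vocab, '/path/to/word2vec/FeatureData')  # assumed path
print(we.shape)  # torch.Size([3, ndims])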
import numpy as np
import torch
from bigfile import BigFile


class Text2W2VEncoder(object):
    """Encode a word list as the mean of its word2vec vectors (torch.Tensor)."""

    def __init__(self, data_path):
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        print("Text2W2VEncoder", "vocab_size", vocab_size, "dim", self.ndims)

    def encode(self, words):
        renamed, vectors = self.w2v.read(words)
        if len(vectors) > 0:
            # Mean-pool the vectors of the words found in the vocabulary.
            vec = np.array(vectors).mean(axis=0)
        else:
            # No word matched: fall back to an all-zero vector.
            vec = np.zeros([self.ndims])
        return torch.Tensor(vec)
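# Usage sketch for Text2W2VEncoder; the data path is an assumption (any
# BigFile-formatted embedding directory works):
encoder = Text2W2VEncoder('/path/to/word2vec/FeatureData')
sent_vec = encoder.encode('a dog runs on the grass'.split())
print(sent_vec.shape)  # torch.Size([ndims]); all zeros if no word is known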
import numpy as np
from bigfile import BigFile
# Txt2Vec and logger come from the surrounding project.


class W2Vec(Txt2Vec):
    """Mean-pooled word2vec encoder; unlike Text2W2VEncoder it returns a
    numpy array and plugs into the Txt2Vec norm/clean pipeline."""

    def __init__(self, data_path, norm=0, clean=True):
        super(W2Vec, self).__init__(data_path, norm, clean)
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        logger.info('vocab size: %d, vec dim: %d' % (vocab_size, self.ndims))

    def _encoding(self, words):
        renamed, vectors = self.w2v.read(words)
        if len(vectors) > 0:
            vec = np.array(vectors).mean(axis=0)
        else:
            vec = np.zeros(self.ndims)
        return vec
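# Sketch: W2Vec is normally exercised through whatever public entry point
# Txt2Vec defines (not shown above), so this calls the _encoding hook
# directly for illustration only; the data path is an assumption.
t2v = W2Vec('/path/to/word2vec/FeatureData')
vec = t2v._encoding(['dog', 'grass'])
print(vec.shape)  # (ndims,) numpy array, vs. the torch.Tensor of Text2W2VEncoder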
import os
import random

import numpy as np

from bigfile import BigFile
# LabelSet, ROOT_PATH and logger come from the surrounding project.


class DataBatchIterator(object):
    """Iterate (ids, features, labels, aux_labels) batches over a collection.

    The first collection provides features and main labels; an optional
    second collection provides auxiliary labels.
    """

    def __init__(self, collections, concept_files, feature, batch_size=100,
                 rootpath=ROOT_PATH):
        assert len(collections) == len(concept_files)
        self.batch_size = batch_size
        self.feat_file = BigFile(
            os.path.join(rootpath, collections[0], 'FeatureData', feature))
        self.label_set = LabelSet(collections[0], concept_files[0], rootpath)
        self.aux_label_set = None
        if len(collections) > 1:
            self.aux_label_set = LabelSet(collections[1], concept_files[1],
                                          rootpath)
        self.img_ids = sorted(self.label_set.im2labels.keys())
        self.num_labels = self.label_set.num_labels
        self.aux_num_labels = (self.aux_label_set.num_labels
                               if self.aux_label_set else 0)
        self.update()

    def update(self):
        self.num_samples = len(self.img_ids)
        self.num_batches = int(
            np.ceil(self.num_samples / float(self.batch_size)))
        self.feat_dim = self.feat_file.ndims

    def shuffle(self):
        logger.info('dataset shuffle')
        random.shuffle(self.img_ids)

    def __iter__(self):
        n_samples = self.num_samples
        bs = self.batch_size
        for i in range((n_samples + bs - 1) // bs):
            start = i * bs
            end = min(n_samples, start + bs)
            # BigFile.read may reorder ids, so use the returned (renamed)
            # ids to build the matching label matrices.
            renamed, feats = self.feat_file.read(self.img_ids[start:end])
            Y = (self.label_set.get_label_matrix(renamed)
                 if self.label_set else None)
            YE = (self.aux_label_set.get_label_matrix(renamed)
                  if self.aux_label_set else None)
            yield (renamed, np.asarray(feats), Y, YE)
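# Usage sketch for DataBatchIterator; the collection, concept file and
# feature names are assumptions (any collection laid out under ROOT_PATH
# with FeatureData and a LabelSet works):
iterator = DataBatchIterator(['train10k'], ['concepts.txt'], 'color64',
                             batch_size=128)
iterator.shuffle()  # reshuffle ids before each epoch
for names, feats, Y, YE in iterator:
    # names: batch image ids; feats: (batch, feat_dim) array;
    # Y: label matrix; YE is None when only one collection is given.
    break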
import os
import sys

import numpy as np

from bigfile import BigFile
# checkToSkip, makedirsforfile, printStatus and INFO come from the
# surrounding project.


def process(options, feat_dir, imsetfile, result_dir):
    """Extract the features of the images listed in imsetfile from feat_dir
    into a new BigFile directory (feature.bin + id.txt + shape.txt)."""
    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    with open(imsetfile) as reader:
        imset = [line.strip() for line in reader]
    print("requested", len(imset))

    feat_file = BigFile(feat_dir)
    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0
    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))
        toread = imset[start:end]
        if len(toread) == 0:
            break
        renamed, vectors = feat_file.read(toread)
        for vec in vectors:
            np.array(vec, dtype=np.float32).tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert len(done) == len(set(done))
    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(done))
    with open(os.path.join(result_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(done), feat_file.ndims))
    print('%d requested, %d obtained' % (len(imset), len(done)))
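# Sketch: the result_dir that process() writes is itself a valid BigFile
# directory, so the extraction can be verified by reading it back (the path
# below is an assumption):
subset = BigFile('subset/FeatureData/color64')
names, vecs = subset.read(subset.names[:2])
for name, vec in zip(names, vecs):
    print(name, len(vec))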
import os
import random

import simpleknn
from bigfile import BigFile

rootpath = '/Users/xirong/VisualSearch'
collection = 'train10k'
nr_of_images = 10000
feature = 'color64'
dim = 64

feature_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
feature_file = BigFile(feature_dir, dim)

# Sample 10 random image ids from the collection's image-set file.
with open(os.path.join(rootpath, collection, 'ImageSets',
                       '%s.txt' % collection)) as reader:
    imset = [line.strip() for line in reader]
imset = random.sample(imset, 10)

searcher = simpleknn.load_model(os.path.join(feature_dir, 'feature.bin'),
                                dim, nr_of_images,
                                os.path.join(feature_dir, 'id.txt'))
searcher.set_distance('l1')

renamed, vectors = feature_file.read(imset)
for name, vec in zip(renamed, vectors):
    visualNeighbors = searcher.search_knn(vec, max_hits=100)
    print(name, visualNeighbors[:3])
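# Sketch: search_knn presumably returns a ranked list of (image_id, distance)
# pairs; switching the metric and re-querying makes the effect of the
# distance choice visible ('l2' support is an assumption based on
# set_distance taking a metric name):
searcher.set_distance('l2')
for name, vec in zip(renamed, vectors):
    print(name, searcher.search_knn(vec, max_hits=3))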
import os

from bigfile import BigFile

# Merge three copies of toydata with toydata2 into a new collection, then
# read every merged vector back.
os.system('python merge_feat.py f3d toydata,toydata,toydata,toydata2 newdata '
          '--rootpath ./ --overwrite 1')

feat_file = BigFile('newdata/FeatureData/f3d')
renamed, vectors = feat_file.read(feat_file.names)
for _id, _vec in zip(renamed, vectors):
    print(_id, _vec)
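# Sketch: a quick sanity check on the merged file; shape() reports
# (num_vectors, ndims), and if merge_feat deduplicates repeated ids (an
# assumption, not verified here), the count equals the unique-id count:
num, ndims = feat_file.shape()
print('merged %d vectors of dim %d' % (num, ndims))
print('unique ids:', len(set(feat_file.names)))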