示例#1
0
文件: model.py 项目: leJson/w2vvpp
def get_we(vocab, w2v_dir):
    """Build a word-embedding matrix for *vocab* from a BigFile word2vec store.

    Every row starts as uniform noise in [-1, 1); rows whose word is found
    in the store are overwritten with the pretrained vector, so unknown
    words keep their random initialization.

    Returns a torch.Tensor of shape (len(vocab), ndims).
    """
    store = BigFile(w2v_dir)
    vocab_size = len(vocab)
    weights = np.random.uniform(low=-1.0, high=1.0,
                                size=(vocab_size, store.ndims))

    requested = [vocab[i] for i in range(vocab_size)]
    found_words, found_vecs = store.read(requested)
    for word, vec in zip(found_words, found_vecs):
        weights[vocab.find(word)] = vec

    return torch.Tensor(weights)
示例#2
0
class Text2W2VEncoder:
    """Encode a list of words as the mean of their word2vec vectors."""

    def __init__(self, data_path):
        # Pretrained word2vec store; shape() yields (vocab size, vector dim).
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        print("Text2W2VEncoder", "vocab_size", vocab_size, "dim", self.ndims)

    def encode(self, words):
        """Return a torch.Tensor: the mean of the vectors the store yields
        for *words*, or the zero vector of length ``self.ndims`` when the
        store yields none."""
        _, hits = self.w2v.read(words)
        if not hits:
            return torch.Tensor(np.zeros([self.ndims]))
        return torch.Tensor(np.array(hits).mean(axis=0))
示例#3
0
class W2Vec(Txt2Vec):
    """Txt2Vec backed by a BigFile word2vec store: a sentence is encoded
    as the mean of its word vectors."""

    def __init__(self, data_path, norm=0, clean=True):
        super(W2Vec, self).__init__(data_path, norm, clean)
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        logger.info('vob size: %d, vec dim: %d' % (vocab_size, self.ndims))

    def _encoding(self, words):
        """Mean vector of the words found in the store; all-zero vector of
        length ``self.ndims`` when none are found."""
        _, hits = self.w2v.read(words)
        if not hits:
            return np.zeros(self.ndims)
        return np.array(hits).mean(axis=0)
示例#4
0
class DataBatchIterator(object):
    """Yield (image_ids, features, labels, aux_labels) mini-batches.

    The first collection supplies the features and the primary label set;
    an optional second collection supplies an auxiliary label set.  Batches
    follow the order of ``self.img_ids`` (call :meth:`shuffle` to permute).
    """

    def __init__(self,
                 collections,
                 concept_files,
                 feature,
                 batch_size=100,
                 rootpath=ROOT_PATH):
        assert (len(collections) == len(concept_files))
        self.batch_size = batch_size
        self.feat_file = BigFile(
            os.path.join(rootpath, collections[0], 'FeatureData', feature))
        self.label_set = LabelSet(collections[0], concept_files[0], rootpath)
        # Auxiliary labels exist only when a second collection is given.
        self.aux_label_set = (LabelSet(collections[1], concept_files[1],
                                       rootpath)
                              if len(collections) > 1 else None)

        self.img_ids = sorted(self.label_set.im2labels.keys())
        self.num_labels = self.label_set.num_labels
        self.aux_num_labels = (self.aux_label_set.num_labels
                               if self.aux_label_set else 0)
        self.update()

    def update(self):
        """Recompute sample/batch counts and feature dim from current state."""
        self.num_samples = len(self.img_ids)
        self.num_batches = int(
            np.ceil(self.num_samples / float(self.batch_size)))
        self.feat_dim = self.feat_file.ndims

    def shuffle(self):
        """Permute the iteration order in place."""
        logger.info('dataset shuffle')
        random.shuffle(self.img_ids)

    def __iter__(self):
        total = self.num_samples
        step = self.batch_size

        for lo in range(0, total, step):
            hi = min(total, lo + step)
            batch_ids, feats = self.feat_file.read(self.img_ids[lo:hi])
            # Label matrices are aligned to batch_ids, the order the
            # feature file actually returned.
            labels = (self.label_set.get_label_matrix(batch_ids)
                      if self.label_set else None)
            aux_labels = (self.aux_label_set.get_label_matrix(batch_ids)
                          if self.aux_label_set else None)
            yield (batch_ids, np.asarray(feats), labels, aux_labels)
示例#5
0
def process(options, feat_dir, imsetfile, result_dir):

    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    feat_file = BigFile(feat_dir)
    
    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0
  
    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
        toread = imset[start:end]
        if len(toread) == 0:
            break
        renamed, vectors = feat_file.read(toread)
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert(len(done) == len(set(done)))
    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(done))
        fw.close()
    
    with open(os.path.join(result_dir,'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(done), feat_file.ndims))
        fw.close()
    print '%d requested, %d obtained' % (len(imset), len(done))
示例#6
0
# Demo (Python 2): sample 10 images from the train10k collection and print,
# for each, its id with its 3 nearest visual neighbours under L1 distance
# over the 64-d 'color64' feature.
import os, random

import simpleknn
from bigfile import BigFile

rootpath = '/Users/xirong/VisualSearch'
collection = 'train10k'
nr_of_images = 10000
feature = 'color64'
dim = 64

# Feature store of the collection, plus its full image-id list.
feature_dir = os.path.join(rootpath,collection,'FeatureData',feature)
feature_file = BigFile(feature_dir, dim)
imset = map(str.strip, open(os.path.join(rootpath,collection,'ImageSets','%s.txt'%collection)).readlines())
imset = random.sample(imset, 10)  # 10 random query images

# k-NN searcher over the raw feature.bin / id.txt pair of the same store.
searcher = simpleknn.load_model(os.path.join(feature_dir, "feature.bin"), dim, nr_of_images, os.path.join(feature_dir, "id.txt"))
searcher.set_distance('l1')
renamed,vectors = feature_file.read(imset)

for name,vec in zip(renamed,vectors):
    visualNeighbors = searcher.search_knn(vec, max_hits=100)
    print name, visualNeighbors[:3]  # show only the top-3 of up to 100 hits
示例#7
0
# Demo (Python 2): sample 10 images from the train10k collection and print,
# for each, its id with its 3 nearest visual neighbours under L1 distance
# over the 64-d 'color64' feature.
import os, random

import simpleknn
from bigfile import BigFile

rootpath = '/Users/xirong/VisualSearch'
collection = 'train10k'
nr_of_images = 10000
feature = 'color64'
dim = 64

# Feature store of the collection, plus its full image-id list.
feature_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
feature_file = BigFile(feature_dir, dim)
imset = map(
    str.strip,
    open(os.path.join(rootpath, collection, 'ImageSets',
                      '%s.txt' % collection)).readlines())
imset = random.sample(imset, 10)  # 10 random query images

# k-NN searcher over the raw feature.bin / id.txt pair of the same store.
searcher = simpleknn.load_model(os.path.join(feature_dir, "feature.bin"), dim,
                                nr_of_images,
                                os.path.join(feature_dir, "id.txt"))
searcher.set_distance('l1')
renamed, vectors = feature_file.read(imset)

for name, vec in zip(renamed, vectors):
    visualNeighbors = searcher.search_knn(vec, max_hits=100)
    print name, visualNeighbors[:3]  # show only the top-3 of up to 100 hits
示例#8
0
# Demo (Python 2): run merge_feat.py over several toy collections, then dump
# every (id, vector) pair of the resulting feature store.
import sys
import os

from bigfile import BigFile

# Presumably merges the listed collections' f3d features into
# newdata/FeatureData/f3d — verify against merge_feat.py.
os.system("python merge_feat.py f3d toydata,toydata,toydata,toydata2 newdata --rootpath ./ --overwrite 1")

feat_file = BigFile("newdata/FeatureData/f3d")
renamed, vectors = feat_file.read(feat_file.names)  # read every stored id

for _id, _vec in zip(renamed, vectors):
    print _id, _vec
示例#9
0
# Demo (Python 2): run merge_feat.py over several toy collections, then dump
# every (id, vector) pair of the resulting feature store.
import sys
import os

from bigfile import BigFile

# Presumably merges the listed collections' f3d features into
# newdata/FeatureData/f3d — verify against merge_feat.py.
os.system(
    'python merge_feat.py f3d toydata,toydata,toydata,toydata2 newdata --rootpath ./ --overwrite 1'
)

feat_file = BigFile('newdata/FeatureData/f3d')
renamed, vectors = feat_file.read(feat_file.names)  # read every stored id

for _id, _vec in zip(renamed, vectors):
    print _id, _vec