Example #1
def process(options, source_dir, feat_dim, imsetfile, result_dir):

    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    featurefile = BigFile(source_dir, feat_dim)
    
    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0
  
    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
        renamed, vectors = featurefile.read(imset[start:end])
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert(len(done) == len(set(done)))
    resultfile = os.path.join(result_dir, 'id.txt')
    fw = open(resultfile, 'w')
    fw.write(' '.join(done))
    fw.close()

    print '%d requested, %d obtained' % (len(imset), len(done))
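Every snippet on this page touches BigFile through only a handful of members: read, read_one, ndims, shape() and names. The sketch below is an illustrative stand-in for such a reader, assuming the on-disk layout that the code above writes (row-major float32 vectors in feature.bin, ids in id.txt) plus a shape.txt header like the one parsed in Example #44; it is a reference sketch, not the project's actual BigFile implementation.

import os
import numpy as np


class MiniBigFile(object):
    # Illustrative BigFile-compatible reader; assumes a directory holding
    # feature.bin (float32 rows), id.txt (ids in row order) and shape.txt ("<rows> <dims>").

    def __init__(self, datadir):
        nr, ndims = map(int, open(os.path.join(datadir, 'shape.txt')).readline().split())
        self.ndims = ndims
        self.names = open(os.path.join(datadir, 'id.txt')).read().split()
        self.name2index = dict(zip(self.names, range(len(self.names))))
        self.data = np.fromfile(os.path.join(datadir, 'feature.bin'),
                                dtype=np.float32).reshape(nr, ndims)

    def shape(self):
        # (number_of_vectors, feature_dimension), indexable like the examples expect
        return self.data.shape

    def read_one(self, name):
        # Single vector as a list, or None when the id is unknown
        idx = self.name2index.get(name)
        return None if idx is None else self.data[idx].tolist()

    def read(self, requested):
        # (renamed, vectors): only the requested ids that exist, with their vectors
        renamed = [name for name in requested if name in self.name2index]
        vectors = [self.data[self.name2index[name]].tolist() for name in renamed]
        return renamed, vectors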
Example #2
class Synset2Vec:
    def __init__(self,
                 corpus=DEFAULT_W2V_CORPUS,
                 w2v_name=DEFAULT_W2V,
                 wnid2words_file=DEFAULT_WNID2WORDS_FILE,
                 rootpath=ROOT_PATH):
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', w2v_name)
        self.word2vec = BigFile(word2vec_dir)
        self.wnid2words = pickle.load(open(wnid2words_file, 'rb'))
        logger.info('w2v(%s): %d words, %d dims', corpus,
                    self.word2vec.shape()[0], self.get_feat_dim())

    def get_feat_dim(self):
        return self.word2vec.ndims

    def explain(self, wnid):
        return self.wnid2words[wnid]

    def _mapping(self, query_wnid):
        words = self.wnid2words[query_wnid].lower()
        words = [w.strip().replace(' ', '_') for w in words.split(',')]
        words = [w.replace('-', '_') for w in words]
        for w in words:
            vec = self.word2vec.read_one(w)
            if vec:
                return vec
        return None

    def embedding(self, wnid):
        return self._mapping(wnid)
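For orientation, a brief usage sketch of the Synset2Vec class above; the WordNet id and the presence of the default corpus, model and wnid2words files are illustrative assumptions.

# Hypothetical usage; relies on the module's default corpus/model configuration.
s2v = Synset2Vec()
print(s2v.explain('n02084071'))     # comma-separated synonym string for the synset (assumed id)
vec = s2v.embedding('n02084071')    # word2vec vector of the first synonym found, or None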
Example #3
    def __init__(self,
                 Y0=DEFAULT_Y0,
                 label_vec_name=DEFAULT_LABEL_VEC_NAME,
                 rootpath=ROOT_PATH):
        label_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  'data/synsets_%s.txt' % Y0)
        label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0,
                                     label_vec_name)

        self.labels = map(str.strip, open(label_file).readlines())
        self.nr_of_labels = len(self.labels)

        feat_file = BigFile(label2vec_dir)
        renamed, vectors = feat_file.read(self.labels)
        name2index = dict(zip(renamed, range(len(renamed))))
        self.label_vectors = [None] * self.nr_of_labels
        self.feat_dim = feat_file.ndims

        for i in xrange(self.nr_of_labels):
            idx = name2index.get(self.labels[i], -1)
            self.label_vectors[i] = np.array(
                vectors[idx]) if idx >= 0 else None

        nr_of_inactive_labels = len(
            [x for x in self.label_vectors if x is None])
        logger.info('#active_labels=%d, embedding_size=%d',
                    self.nr_of_labels - nr_of_inactive_labels, self.feat_dim)
def process(options, testCollection):
    overwrite = options.overwrite
    rootpath = options.rootpath

    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r

    batch_size = 2000

    label_vec_name = '%s,%s,%s' % (options.w2v_corpus, options.w2v,
                                   options.embedding)
    for synset_name in [Y0, Y1]:
        assert (os.path.exists(
            os.path.join(rootpath, 'synset2vec', synset_name, label_vec_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging',
                           testCollection, pY0, label_vec_name,
                           'id.tagvotes.txt')

    if os.path.exists(resfile) and not overwrite:
        logger.info('%s exists. quit', resfile)
        return 0

    i2v = Image2Vec(Y0=Y0, label_vec_name=label_vec_name)
    tagger = ZeroshotTagger(Y1=Y1,
                            label_vec_name=label_vec_name,
                            rootpath=rootpath)

    imset = utility.readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)

    logger.info('tagging %d images', len(imset))
    utility.makedirsforfile(resfile)
    logger.info('save results to %s', resfile)

    fw = open(resfile, 'w')

    start = 0
    while start < len(imset):
        end = min(len(imset), start + batch_size)
        logger.info('processing images from %d to %d', start, end)
        todo = imset[start:end]
        if not todo:
            break

        renamed, vectors = feat_file.read(todo)
        output = []
        for _id, _vec in zip(renamed, vectors):
            im_vec = i2v.embedding(_vec)
            pred = tagger.predict(im_vec, topk=options.r)
            output.append(
                '%s %s\n' %
                (_id, ' '.join(['%s %s' % (x[0], x[1]) for x in pred])))
        start = end
        fw.write(''.join(output))

    fw.close()
Example #5
def get_we_parameter(vocabulary, word2vec_file):
    print('getting initial word embedding ...')
    w2v_reader = BigFile(word2vec_file)
    ndims = w2v_reader.ndims    
    #print("word embedding dim ", ndims)
    #print(vocabulary)
    #sys.exit()
    we = []
    # Reserve 0 for masking via pad_sequences
    we.append(np.array([0]*ndims))
    fail_counter = 0
    for word in vocabulary:
        word = word.strip()
        try:
            vec = w2v_reader.read_one(word)
            vec = np.array(vec)
            assert vec.shape == (500,)
            we.append(vec)
        except Exception as e:
            # print word
            vec = np.random.uniform(-1,1,ndims)
            #print(vec.shape)
            we.append(vec)
            fail_counter +=1 
    print("%d words out of %d words cannot find pre-trained word2vec vector" % (fail_counter, len(vocabulary)))
    return np.array(we)
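The zero row appended first lines up with masking via Keras pad_sequences, as the comment notes. A hedged sketch of how the returned matrix might seed an embedding layer follows; the layer arguments and trainability are assumptions, not taken from the source.

# Hypothetical wiring of the matrix returned by get_we_parameter.
from keras.layers import Embedding

we = get_we_parameter(vocabulary, word2vec_file)      # shape: (len(vocabulary) + 1, ndims)
embedding_layer = Embedding(input_dim=we.shape[0],
                            output_dim=we.shape[1],
                            weights=[we],             # start from the pre-trained vectors
                            mask_zero=True,           # row 0 is the padding row reserved above
                            trainable=True)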
Example #6
class Synset2Vec:

    def __init__(self, corpus=DEFAULT_W2V_CORPUS, w2v_name=DEFAULT_W2V, wnid2words_file=DEFAULT_WNID2WORDS_FILE, rootpath=ROOT_PATH):
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', w2v_name)
        self.word2vec = BigFile(word2vec_dir)
        self.wnid2words = pickle.load(open(wnid2words_file, 'rb'))
        logger.info('w2v(%s): %d words, %d dims', corpus, self.word2vec.shape()[0], self.get_feat_dim())

    def get_feat_dim(self):
        return self.word2vec.ndims

        
    def explain(self, wnid):
        return self.wnid2words[wnid]

    def _mapping(self, query_wnid):
        words = self.wnid2words[query_wnid].lower()
        words = [w.strip().replace(' ','_') for w in words.split(',')]
        words = [w.replace('-', '_') for w in words]
        for w in words:
            vec = self.word2vec.read_one(w)
            if vec:
                return vec
        return None


    def embedding(self, wnid):
        return self._mapping(wnid)
Example #7
    def __init__(self, datafile, ndims=0, L1_normalize=0, L2_normalize=0):
        Text2Vec.__init__(self, datafile, ndims, L1_normalize, L2_normalize)
        self.word2vec = BigFile(datafile)
        if ndims != 0:
            assert self.word2vec.ndims == self.ndims, "feat dimension is not match %d != %d" % (
                self.word2vec.ndims, self.ndims)
        else:
            self.ndims = self.word2vec.ndims
Example #8
    def __init__(self,
                 corpus,
                 modelName,
                 wnid2words_file='data/wnid2words.pkl',
                 rootpath=ROOT_PATH):
        printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', modelName)
        self.wnid2words = pickle.load(open(wnid2words_file, 'rb'))
        self.word2vec = BigFile(word2vec_dir)
Example #9
def process(options, image_collection, pY0):
    rootpath = options.rootpath
    overwrite = options.overwrite
    k = options.k
    batch_size = options.batch_size
    subset = options.subset if options.subset else image_collection
    Y0 = options.Y0
    label_vec_name = options.label_vec_name
    new_feature = '%s,%s,%s' % (Y0, label_vec_name, pY0)

    resfile = os.path.join(rootpath, image_collection, 'FeatureData',
                           new_feature, 'id.feature.txt')
    if os.path.exists(resfile) and not overwrite:
        logger.info('%s exists. quit', resfile)
        return 0

    imsetfile = os.path.join(rootpath, image_collection, 'ImageSets',
                             '%s.txt' % subset)
    imset = map(str.strip, open(imsetfile).readlines())
    logger.info('%d images to do', len(imset))

    feat_file = BigFile(
        os.path.join(rootpath, image_collection, 'FeatureData', pY0))

    im2vec = Image2Vec(Y0, label_vec_name, rootpath)

    utility.makedirsforfile(resfile)
    fw = open(resfile, 'w')

    read_time = 0
    run_time = 0
    start = 0
    done = 0

    while start < len(imset):
        end = min(len(imset), start + batch_size)
        logger.info('processing images from %d to %d', start, end - 1)

        s_time = time.time()
        renamed, test_X = feat_file.read(imset[start:end])
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            vec = im2vec.embedding(test_X[i], k)
            output[i] = '%s %s\n' % (renamed[i], " ".join(map(str, vec)))
        run_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        done += len(output)

    # done
    fw.close()
    logger.info("%d done. read time %g seconds, run_time %g seconds", done,
                read_time, run_time)
    return done
Example #10
    def __init__(self,
                 corpus=DEFAULT_W2V_CORPUS,
                 w2v_name=DEFAULT_W2V,
                 wnid2words_file=DEFAULT_WNID2WORDS_FILE,
                 rootpath=ROOT_PATH):
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', w2v_name)
        self.word2vec = BigFile(word2vec_dir)
        self.wnid2words = pickle.load(open(wnid2words_file, 'rb'))
        logger.info('w2v(%s): %d words, %d dims', corpus,
                    self.word2vec.shape()[0], self.get_feat_dim())
Example #11
    def __init__(self,
                 label_source,
                 corpus,
                 word2vec_model,
                 feat_path,
                 rootpath=ROOT_PATH):
        label_vec_path = os.path.join('data', label_source, 'label_vec')
        label_id_file = os.path.join('data', label_source, 'label.txt')
        self.im2vec = Image2Vec(label_id_file, label_vec_path)
        self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
        self.img_feats = BigFile(feat_path)
Example #12
    def __init__(self, db_file):
        self.nr_of_sents, self.feat_dim = map(int, open(self.shape_file).readline().split())
        self.sent_pool = map(str.strip, open(self.sent_file).readlines())
        self.sent_searcher = load_model(os.path.join(self.sent_feat_dir, 'feature.bin'), self.feat_dim,
                                        self.nr_of_sents, self.sent_id_file)
        self.sent_searcher.set_distance('cosine')
        feat_dir = os.path.join(self.rootpath, self.img_collection, "FeatureData", self.vis_feat)
        self.vis_feat_file = BigFile(feat_dir)
        imageSetFile = open(os.path.join(self.rootpath, self.img_collection, "ImageSets", "%s.txt"%self.img_collection), 'r')
        self.imageSet = imageSetFile.readlines()
        self.db_file = db_file
Example #13
def process(options, testCollection):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, embedding_name, pY0, 'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name, rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)
    

    printStatus(INFO, 'tagging %d images' % len(imset))
    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    start = 0
    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end))
        todo = imset[start:end]
        if not todo:
            break

        renamed, vectors = feat_file.read(todo)
        output = []
        for _id,_vec in zip(renamed, vectors):
            im_vec = i2v.embedding(_vec)
            pred = tagger.predict(im_vec, topk=options.r)
            output.append('%s %s\n' % (_id, ' '.join(['%s %s'%(x[0],x[1]) for x in pred])))
        start = end
        fw.write(''.join(output))

    fw.close()
Example #14
def process(options, testCollection):
    overwrite = options.overwrite
    rootpath = options.rootpath
    
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r

    batch_size = 2000

    label_vec_name = '%s,%s,%s' % (options.w2v_corpus, options.w2v, options.embedding)
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, label_vec_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, pY0, label_vec_name, 'id.tagvotes.txt')

    if os.path.exists(resfile) and not overwrite:
        logger.info('%s exists. quit', resfile)
        return 0

    i2v = Image2Vec(Y0=Y0, label_vec_name=label_vec_name)
    tagger = ZeroshotTagger(Y1=Y1, label_vec_name=label_vec_name, rootpath=rootpath)

    imset = utility.readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)
    

    logger.info('tagging %d images', len(imset))
    utility.makedirsforfile(resfile)
    logger.info('save results to %s', resfile)

    fw = open(resfile, 'w')

    start = 0
    while start < len(imset):
        end = min(len(imset), start + batch_size)
        logger.info('processing images from %d to %d', start, end)
        todo = imset[start:end]
        if not todo:
            break

        renamed, vectors = feat_file.read(todo)
        output = []
        for _id,_vec in zip(renamed, vectors):
            im_vec = i2v.embedding(_vec)
            pred = tagger.predict(im_vec, topk=options.r)
            output.append('%s %s\n' % (_id, ' '.join(['%s %s'%(x[0],x[1]) for x in pred])))
        start = end
        fw.write(''.join(output))

    fw.close()
Example #15
File: devise.py Project: li-xirong/cmrf
    def __init__(self, model_path, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
        self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
        self.img_feats = BigFile(feat_path)

        print model_path
        devise_model = cPickle.load(open(model_path, 'rb'))
        words_vec = T.matrix(dtype=theano.config.floatX)
        img_vec = T.matrix(dtype=theano.config.floatX)
        # compile a predictor function
        self.predict_model = theano.function(
            inputs=[words_vec, img_vec],
            outputs=devise_model.predict_score_one2many(words_vec, img_vec),
            allow_input_downcast=True)
Example #16
def process(options, label_file, label2vec_dir, testCollection, feature, new_feature):
    rootpath = options.rootpath
    overwrite = options.overwrite
    k = options.k
    blocksize = options.blocksize
    subset = options.subset if options.subset else testCollection

    resfile = os.path.join(rootpath, testCollection, 'FeatureData', new_feature, 'id.feature.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    imsetfile = os.path.join(rootpath, testCollection, 'ImageSets', '%s.txt' % subset)
    imset = map(str.strip, open(imsetfile).readlines())
    printStatus(INFO, '%d images to do' % len(imset))

    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))

    im2vec = Image2Vec(label_file, label2vec_dir)


    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    read_time = 0
    run_time = 0
    start = 0
    done = 0

    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

        s_time = time.time()
        renamed, test_X = feat_file.read(imset[start:end])
        read_time += time.time() - s_time
        
        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            vec = im2vec.embedding(test_X[i], k)
            output[i] = '%s %s\n' % (renamed[i], " ".join([niceNumber(x,6) for x in vec]))
        run_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        done += len(output)

    # done    
    printStatus(INFO, "%d done. read time %g seconds, run_time %g seconds" % (done, read_time, run_time))
    fw.close()
    return done
Example #17
File: im2vec.py Project: danieljf24/cmrf
    def __init__(self, label_file, label2vec_dir):
        self.labels = map(str.strip, open(label_file).readlines())
        self.nr_of_labels = len(self.labels)
        feat_file = BigFile(label2vec_dir)
        renamed, vectors = feat_file.read(self.labels)
        name2index = dict(zip(renamed, range(len(renamed))))
        self.label_vectors = [None] * self.nr_of_labels
        self.feat_dim = feat_file.ndims

        for i in xrange(self.nr_of_labels):
            idx = name2index.get(self.labels[i], -1)
            self.label_vectors[i] = np.array(vectors[idx]) if idx >= 0 else None

        nr_of_inactive_labels = len([x for x in self.label_vectors if x is None])    
        printStatus(INFO, '#active_labels=%d, embedding_size=%d' % (self.nr_of_labels - nr_of_inactive_labels, self.feat_dim))
Example #18
    def __init__(self, model_path, bow_path, feat_path):
        # voabulary_file = os.path.join('result', 'msr2013train_voabulary_query_bow.pkl')
        self.count_vect, self.tf_transformer = cPickle.load(open(bow_path, 'rb'))
        self.img_feats = BigFile(feat_path)

        # print model_path
        devise_model = cPickle.load(open(model_path, 'rb'))
        # words_vec = T.matrix(dtype=theano.config.floatX)
        words_vec = sparse.csr_matrix(dtype=theano.config.floatX)
        img_vec = T.matrix(dtype=theano.config.floatX)
        # compile a predictor function
        self.predict_model = theano.function(
            inputs=[words_vec, img_vec],
            outputs=devise_model.predict_score_one2many(words_vec, img_vec),
            allow_input_downcast=True)
Example #19
    def __init__(self, label_file, label2vec_dir):
        self.labels = map(str.strip, open(label_file).readlines())
        self.nr_of_labels = len(self.labels)
        feat_file = BigFile(label2vec_dir)
        renamed, vectors = feat_file.read(self.labels)
        name2index = dict(zip(renamed, range(len(renamed))))
        self.label_vectors = [None] * self.nr_of_labels
        self.feat_dim = feat_file.ndims

        for i in xrange(self.nr_of_labels):
            idx = name2index.get(self.labels[i], -1)
            self.label_vectors[i] = np.array(vectors[idx]) if idx >= 0 else None

        nr_of_inactive_labels = len([x for x in self.label_vectors if x is None])    
        printStatus(INFO, '#active_labels=%d, embedding_size=%d' % (self.nr_of_labels - nr_of_inactive_labels, self.feat_dim))
Example #20
class Synset2Vec:

    def __init__(self, corpus, modelName, wnid2words_file='data/wnid2words.pkl', rootpath=ROOT_PATH):
        printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', modelName)
        self.wnid2words = pickle.load(open(wnid2words_file, 'rb'))
        self.word2vec = BigFile(word2vec_dir)
    
  

    def get_feat_dim(self):
        return self.word2vec.ndims

        
    def explain(self, wnid):
        return self.wnid2words[wnid]

    def mapping(self, query_wnid):
        words = self.wnid2words[query_wnid].lower()
        words = [w.strip().replace(' ','_') for w in words.split(',')]
        words = [w.replace('-', '_') for w in words]
        for w in words:
            renamed, vectors = self.word2vec.read([w])
            if vectors:
                return vectors[0]
        return None


    def embedding(self, wnid):
        return self.mapping(wnid)
Example #21
class Query2Vec:

    def __init__(self, corpus, modelName, rootpath=ROOT_PATH):
        word2vec_dir = os.path.join(rootpath, "word2vec", corpus, modelName)
        self.word2vec = BigFile(word2vec_dir)

    def get_feat_dim(self):
        return self.word2vec.ndims

    def mapping(self, query):
        #words = query.lower().split(',')
        words = query.lower().split('/')
        res = []
        for word in words:
            res += word.strip().replace('_', ' ').split()

        word_vecs = []
        newname = []
        for w in res:
            renamed, vectors = self.word2vec.read([w])
            if vectors:
                word_vecs.append(vectors[0])
                newname.append(renamed[0])

        #print wnid, res, len(word_vecs)
        if len(word_vecs)>0:
            return np.array(word_vecs).mean(axis=0)
        else:
            return None
                
    def embedding(self, wnid):
        return self.mapping(wnid)
Example #22
class Query2Vec:
    def __init__(self, corpus, modelName, rootpath=ROOT_PATH):
        word2vec_dir = os.path.join(rootpath, "word2vec", corpus, modelName)
        self.word2vec = BigFile(word2vec_dir)

    def get_feat_dim(self):
        return self.word2vec.ndims

    def mapping(self, query):
        #words = query.lower().split(',')
        words = query.lower().split('/')
        res = []
        for word in words:
            res += word.strip().replace('_', ' ').split()

        word_vecs = []
        newname = []
        for w in res:
            renamed, vectors = self.word2vec.read([w])
            if vectors:
                word_vecs.append(vectors[0])
                newname.append(renamed[0])

        #print wnid, res, len(word_vecs)
        if len(word_vecs) > 0:
            return np.array(word_vecs).mean(axis=0)
        else:
            return None

    def embedding(self, wnid):
        return self.mapping(wnid)
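For reference, a short usage sketch of Query2Vec.mapping above; the corpus and model names are borrowed from the embedding name used in Example #29 and are assumptions here.

# Hypothetical usage; queries are '/'-separated phrases, underscores are treated as spaces.
q2v = Query2Vec('flickr4m', 'tagvec500')
qvec = q2v.embedding('black dog/running')
if qvec is not None:
    print(qvec.shape)     # (ndims,) mean of the word vectors that were found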
Example #23
def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    rootpath = options.rootpath
    modelName = options.model

    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model


    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    concepts = [concepts[i] for i in range(len(concepts)) if (i%options.numjobs + 1) == options.job]

    feat_file = BigFile(os.path.join(rootpath, trainCollection, "FeatureData", feature))

    for concept in concepts:
        modelfile = os.path.join(rootpath, trainCollection, 'Models', modelAnnotationName, feature, modelName, '%s.model' % concept)
        model = load_model(modelfile)
        (A0, B0) = model.get_probAB()
        if abs(A0) > 1e-8 and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (A0, B0))
            continue
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        results = classify_large_data(model, names, feat_file, prob_output=False)
        labels = [name2label[x[0]] for x in results]
        dec_values = [x[1] for x in results]
        printStatus(INFO, "%s +%d -%d" % (concept, len([x for x in labels if x==1]), len([x for x in labels if x==-1])))
        [A,B] = sigmoid_train(dec_values, labels)
        model.set_probAB(A, B)
        save_model(modelfile, model)
        (A1, B1) = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (A0, A1, B0, B1))
Example #24
class SemanticEmbedding:
    def __init__(self, label_source, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
        label_vec_path = os.path.join('data', label_source, 'label_vec')
        label_id_file = os.path.join('data', label_source, 'label.txt')
        self.im2vec = Image2Vec(label_id_file, label_vec_path)
        self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
        self.img_feats = BigFile(feat_path)

    def do_search(self, query, iid_list, k):
        # convert query to vector
        qvec = self.qry2vec.embedding(query)
        if qvec is not None:

            renamed, test_X = self.img_feats.read(iid_list)

            imgvecs = []
            for iid in iid_list:
                img_label = test_X[renamed.index(iid)]
                imgvecs.append(self.im2vec.embedding(img_label, k))

            scorelist = calImageSimiByCos(qvec, imgvecs)

        else:
            scorelist = []

        return scorelist
Example #25
class Synset2Vec:
    def __init__(self,
                 corpus,
                 modelName,
                 wnid2words_file='data/wnid2words.pkl',
                 rootpath=ROOT_PATH):
        printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', modelName)
        self.wnid2words = pickle.load(open(wnid2words_file, 'rb'))
        self.word2vec = BigFile(word2vec_dir)

    def get_feat_dim(self):
        return self.word2vec.ndims

    def explain(self, wnid):
        return self.wnid2words[wnid]

    def mapping(self, query_wnid):
        words = self.wnid2words[query_wnid].lower()
        words = [w.strip().replace(' ', '_') for w in words.split(',')]
        words = [w.replace('-', '_') for w in words]
        for w in words:
            renamed, vectors = self.word2vec.read([w])
            if vectors:
                return vectors[0]
        return None

    def embedding(self, wnid):
        return self.mapping(wnid)
Example #26
class ConSE:
    def __init__(self, label_source, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
        label_vec_path = os.path.join('data', label_source, 'label_vec')
        label_id_file = os.path.join('data', label_source, 'label.txt')
        self.im2vec = Image2Vec(label_id_file, label_vec_path)
        self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
        self.img_feats = BigFile(feat_path)

    def do_search(self, query, iid_list, k):
        # convert query to vector
        qvec = self.qry2vec.embedding(query)
        if qvec is not None:

            renamed, test_X = self.img_feats.read(iid_list)

            imgvecs = []
            for iid in iid_list:
                img_label = test_X[renamed.index(iid)]
                imgvecs.append(self.im2vec.embedding(img_label, k))

            scorelist = calImageSimiByCos(qvec, imgvecs)

        else:
            scorelist = []

        return scorelist
Example #27
    def __init__(self, input_json_path, vocab, vf_dir, use_att=False, eng_gt_file=None, rootpath=rootpath):

        print input_json_path
        with open(input_json_path) as f:
            data = json.load(f)

        self.eng_gt_file = eng_gt_file
        self.imgname2enggt = {}
        if self.eng_gt_file is not None:
            assert os.path.exists(self.eng_gt_file), "Eng gt file not exist: %s"%eng_gt_file    
            print ('Loading eng gt file')
            eng_data = json.load(open(self.eng_gt_file))
            for x in eng_data['images']:
                img_filename = x['filename']
                sents=[]
                for y in x['sentences']:
                    sents.append(' '.join(y['tokens']))
                self.imgname2enggt[img_filename] = sents

        self.images = data['images']
        self.vocab = vocab
        self.sentences = {}
        self.img2sents = {}
        self.img2enggt = {}
        self.img2filename = {}
        self.sentId2imgId = {}
        self.imgIds = []
        self.sentIds = []
        for img in self.images:
            img_id = img['imgid']
            self.img2filename[img_id] = img['filename'].split('.')[0]
            self.imgIds.append(img_id)
            self.img2sents[img_id] = img['sentids']
            self.img2enggt[img_id] = self.imgname2enggt.get(img['filename'], [])
            for i, sent in enumerate(img['sentences']):
                self.sentences[sent['sentid']] = (sent['tokens'], sent['raw'])
                self.sentIds.append(sent['sentid'])
                self.sentId2imgId[sent['sentid']] = img_id
                sid = img['filename'].split('.')[0]+'#'+str(i)

        self.use_att = use_att
        if self.use_att == True:
            self.vf_dir = vf_dir
        else:
            self.vf_dir = vf_dir
            self.vf_reader = BigFile(vf_dir)
Example #28
File: tagger.py Project: li-xirong/hierse
    def __init__(self, Y1=DEFAULT_Y1, label_vec_name=DEFAULT_LABEL_VEC_NAME, rootpath=ROOT_PATH):
        feat_dir = os.path.join(rootpath, 'synset2vec', Y1, label_vec_name)
        feat_file = BigFile(feat_dir)
        self.labels = feat_file.names
        self.nr_of_labels = len(self.labels)
        self.feat_dim = feat_file.ndims

        renamed, vectors = feat_file.read(self.labels)
        name2index = dict(zip(renamed, range(len(renamed))))
        self.label_vectors = [None] * self.nr_of_labels
        
        for i in xrange(self.nr_of_labels):
            idx = name2index.get(self.labels[i], -1)
            self.label_vectors[i] = np.array(vectors[idx]) if idx >= 0 else None

        nr_of_inactive_labels = len([x for x in self.label_vectors if x is None])    
        logger.info('#active_labels=%d, embedding_size=%d', self.nr_of_labels - nr_of_inactive_labels, self.feat_dim)
Example #29
    def __init__(self, synset_name='imagenet1k2hop', embedding_name='flickr4m,tagvec500,hierse2', rootpath=ROOT_PATH):
        feat_dir = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)
        feat_file = BigFile(feat_dir)
        self.labels = feat_file.names
        self.nr_of_labels = len(self.labels)
        self.feat_dim = feat_file.ndims

        renamed, vectors = feat_file.read(self.labels)
        name2index = dict(zip(renamed, range(len(renamed))))
        self.label_vectors = [None] * self.nr_of_labels
        
        for i in xrange(self.nr_of_labels):
            idx = name2index.get(self.labels[i], -1)
            self.label_vectors[i] = np.array(vectors[idx]) if idx >= 0 else None

        nr_of_inactive_labels = len([x for x in self.label_vectors if x is None])    
        printStatus(INFO + '.' + self.__class__.__name__, '#active_labels=%d, embedding_size=%d' % (self.nr_of_labels - nr_of_inactive_labels, self.feat_dim))
Example #30
    def __init__(self,
                 datafile,
                 ndims=0,
                 language='en',
                 L1_normalize=0,
                 L2_normalize=0):
        Text2Vec.__init__(self, datafile, ndims, language, L1_normalize,
                          L2_normalize)
        self.word2vec = BigFile(
            datafile) if language == 'en' else w2v.Word2Vec.load(datafile)
        if ndims != 0:
            if 'en' == language:
                assert self.word2vec.ndims == self.ndims, "feat dimension is not match %d != %d" % (
                    self.word2vec.ndims, self.ndims)
            else:
                print 'ndims #', ndims
        else:
            self.ndims = self.word2vec.ndims if 'en' == language else 500
Example #31
File: text.py Project: yelinyun123/coco-cn
def get_en_we_parameter(vocabulary, word2vec_file):
    print 'getting initial word embedding ...'
    w2v_reader = BigFile(word2vec_file)
    ndims = w2v_reader.ndims
    fail_counter = 0
    we = []
    # Reserve 0 for masking via pad_sequences
    we.append([0]*ndims)
    for word in vocabulary:
        word = word.strip()
        try:
            vec = w2v_reader.read_one(word)
            # print vec
            we.append(vec)
        except Exception, e:
            vec = np.random.uniform(-1,1,ndims)
            we.append(vec)
            fail_counter +=1
Example #32
    def __init__(self,
                 Y1=DEFAULT_Y1,
                 label_vec_name=DEFAULT_LABEL_VEC_NAME,
                 rootpath=ROOT_PATH):
        feat_dir = os.path.join(rootpath, 'synset2vec', Y1, label_vec_name)
        feat_file = BigFile(feat_dir)
        self.labels = feat_file.names
        self.nr_of_labels = len(self.labels)
        self.feat_dim = feat_file.ndims

        renamed, vectors = feat_file.read(self.labels)
        name2index = dict(zip(renamed, range(len(renamed))))
        self.label_vectors = [None] * self.nr_of_labels

        for i in xrange(self.nr_of_labels):
            idx = name2index.get(self.labels[i], -1)
            self.label_vectors[i] = np.array(
                vectors[idx]) if idx >= 0 else None

        nr_of_inactive_labels = len(
            [x for x in self.label_vectors if x is None])
        logger.info('#active_labels=%d, embedding_size=%d',
                    self.nr_of_labels - nr_of_inactive_labels, self.feat_dim)
Example #33
File: irc_image.py Project: danieljf/cmrf
class ImageSimer:
    def __init__(self, dev_feat_path, train_feat_path):
        self.dev_feats = BigFile(dev_feat_path)
        self.train_feats =  BigFile(train_feat_path)


    def calsimImage(self, img, imgs):
        imgfeat = self.dev_feats.read_one(img)

        renamed, test_X = self.train_feats.read(imgs)
        resorted_feats = [None] * len(renamed)
        for i in xrange(len(renamed)):
            idx = imgs.index(renamed[i])
            resorted_feats[idx] = test_X[i]

        return calImageSimiByCos( imgfeat, resorted_feats)


    def calsimiImagewithClick(self, img, img_click_list, clickthres):
        
        imgfeat = self.dev_feats.read_one(img)

        img_list = [x[0] for x in img_click_list if int(x[1]) >= clickthres]
        clc_list = [int(x[1]) for x in img_click_list if int(x[1]) >= clickthres]
        assert (len(img_list) == len(clc_list))

        renamed, test_X = self.train_feats.read(img_list)

        # re-sort the label list according to the renamed
        resorted_feats = [None] * len(renamed)
        for i in xrange(len(renamed)):
            idx = img_list.index(renamed[i])
            resorted_feats[idx] = test_X[i]

        img_simi = calImageSimiByCos( imgfeat, resorted_feats)

        return sum(np.array(img_simi) * np.log(clc_list)) / len(img_list)
Example #34
class ImageSimer:
    def __init__(self, dev_feat_path, train_feat_path):
        self.dev_feats = BigFile(dev_feat_path)
        self.train_feats = BigFile(train_feat_path)

    def calsimImage(self, img, imgs):
        imgfeat = self.dev_feats.read_one(img)

        renamed, test_X = self.train_feats.read(imgs)
        resorted_feats = [None] * len(renamed)
        for i in xrange(len(renamed)):
            idx = imgs.index(renamed[i])
            resorted_feats[idx] = test_X[i]

        return calImageSimiByCos(imgfeat, resorted_feats)

    def calsimiImagewithClick(self, img, img_click_list, clickthres):

        imgfeat = self.dev_feats.read_one(img)

        img_list = [x[0] for x in img_click_list if int(x[1]) >= clickthres]
        clc_list = [
            int(x[1]) for x in img_click_list if int(x[1]) >= clickthres
        ]
        assert (len(img_list) == len(clc_list))

        renamed, test_X = self.train_feats.read(img_list)

        # re-sort the label list according to the renamed
        resorted_feats = [None] * len(renamed)
        for i in xrange(len(renamed)):
            idx = img_list.index(renamed[i])
            resorted_feats[idx] = test_X[i]

        img_simi = calImageSimiByCos(imgfeat, resorted_feats)

        return sum(np.array(img_simi) * np.log(clc_list)) / len(img_list)
Example #35
File: devise.py Project: li-xirong/cmrf
class Devise_pre(object):
    def __init__(self, model_path, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
        self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
        self.img_feats = BigFile(feat_path)

        print model_path
        devise_model = cPickle.load(open(model_path, 'rb'))
        words_vec = T.matrix(dtype=theano.config.floatX)
        img_vec = T.matrix(dtype=theano.config.floatX)
        # compile a predictor function
        self.predict_model = theano.function(
            inputs=[words_vec, img_vec],
            outputs=devise_model.predict_score_one2many(words_vec, img_vec),
            allow_input_downcast=True)

    def predict_score(self, query, iid_list, normalization = 'L2'):

        qvec = self.qry2vec.embedding(query)
        
        
        if qvec is not None:

            # L2 normalization
            if normalization == 'L2':
                qvec = qvec / LA.norm(qvec,2)
            
            renamed, test_X = self.img_feats.read(iid_list)

            X = []
            for iid in iid_list:
                img_label = test_X[renamed.index(iid)]
                X.append(img_label)

            query_array = np.array([qvec])
            image_array = np.array(X)

            scorelist = self.predict_model(query_array, image_array)[0].tolist()
            # scorelist =  np.reshape(temp,(1,-1))[0].tolist()

        else:
            scorelist = []

        return scorelist
Example #36
class AveWord2Vec(Text2Vec):

    # datafile: the path of pre-trained word2vec data
    def __init__(self, datafile, ndims=0, L1_normalize=0, L2_normalize=0):
        Text2Vec.__init__(self, datafile, ndims, L1_normalize, L2_normalize)
        self.word2vec = BigFile(datafile)
        if ndims != 0:
            assert self.word2vec.ndims == self.ndims, "feat dimension is not match %d != %d" % (
                self.word2vec.ndims, self.ndims)
        else:
            self.ndims = self.word2vec.ndims

    def preprocess(self, query, clear):
        if clear:
            words = clean_str(query)
        else:
            words = query.strip().split()
        return words

    def mapping(self, query, clear=True):
        words = self.preprocess(query, clear)

        #print query, '->', words
        renamed, vectors = self.word2vec.read(words)
        renamed2vec = dict(zip(renamed, vectors))

        if len(renamed) != len(words):
            vectors = []
            for word in words:
                if word in renamed2vec:
                    vectors.append(renamed2vec[word])

        if len(vectors) > 0:
            vec = np.array(vectors).mean(axis=0)

            if self.L1_normalize:
                return self.do_L1_norm(vec)
            if self.L2_normalize:
                return self.do_L2_norm(vec)
            return vec
        else:
            return None
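A brief, hedged usage sketch of AveWord2Vec; the word2vec directory name is an assumption, and the class expects a BigFile-format directory as in the other examples on this page.

# Hypothetical usage of the class above.
t2v = AveWord2Vec('word2vec/flickr/vec500flickr30m', L2_normalize=1)
vec = t2v.mapping('a man riding a horse')
print(None if vec is None else vec.shape)    # averaged (and optionally L2-normalized) word vector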
Example #37
class PSI_pre(object):
    def __init__(self, model_path, bow_path, feat_path):
        # voabulary_file = os.path.join('result', 'msr2013train_voabulary_query_bow.pkl')
        self.count_vect, self.tf_transformer = cPickle.load(open(bow_path, 'rb'))
        self.img_feats = BigFile(feat_path)

        # print model_path
        devise_model = cPickle.load(open(model_path, 'rb'))
        # words_vec = T.matrix(dtype=theano.config.floatX)
        words_vec = sparse.csr_matrix(dtype=theano.config.floatX)
        img_vec = T.matrix(dtype=theano.config.floatX)
        # compile a predictor function
        self.predict_model = theano.function(
            inputs=[words_vec, img_vec],
            outputs=devise_model.predict_score_one2many(words_vec, img_vec),
            allow_input_downcast=True)

    def predict_score(self, query, iid_list):

        test_counts = self.count_vect.transform([query])
        query_vec = self.tf_transformer.transform(test_counts)
        # print query_vec
        # print query_vec.shape
        
        # if qvec is not None:
        renamed, test_X = self.img_feats.read(iid_list)

        X = []
        for iid in iid_list:
            img_label = test_X[renamed.index(iid)]
            X.append(img_label)

        image_array = np.array(X)


        temp = self.predict_model(query_vec, image_array)
        scorelist =  np.reshape(temp,(1,-1))[0].tolist()

        return scorelist
Example #38
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex',
                                       collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method,
                                   feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations',
                                  'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(
            collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(
                calParzen(img_feats.read_one(imidx), test_X, sigma))

        # parzen_list_suffle = calParzen_fast(test_X, len(renamed), sigma)
        # parzen_list = []
        # for imidx in iid_list:
        #     parzen_list.append(parzen_list_suffle[renamed.index(imidx)])

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list),
                              key=lambda v: v[2],
                              reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) /
                                  len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
Example #39
    def __init__(self, corpus=DEFAULT_W2V_CORPUS, w2v_name=DEFAULT_W2V, wnid2words_file=DEFAULT_WNID2WORDS_FILE, rootpath=ROOT_PATH):
        word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', w2v_name)
        self.word2vec = BigFile(word2vec_dir)
        self.wnid2words = pickle.load(open(wnid2words_file, 'rb'))
        logger.info('w2v(%s): %d words, %d dims', corpus, self.word2vec.shape()[0], self.get_feat_dim())
Example #40
def process(options, trainCollection, trainAnnotationName, feature):
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5
    
    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'
    
    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0
    
    feat_dir = os.path.join(rootpath,trainCollection,'FeatureData',feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}

    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = map(float, str.split(f.readline()))
        params['max_vals'] = map(float, str.split(f.readline()))
        
    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))
        
        model_file_name = os.path.join(resultdir, concept + '.model')
        
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if  1 == lab])
        nn = len([1 for lab in labels if  -1== lab])
        wp = float(beta) * (np+nn) / np
        wn = (1.0-beta) * (np+nn) /nn
    
        svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C) 
        model = svm_train(y, vectors, svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s'%model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        fiksvm_load_model(model_file_name)
        assert(abs(newmodel.get_probAB()[0]-A)<1e-6)
        assert(abs(newmodel.get_probAB()[1]-B)<1e-6)

    return len(todo)
Example #41
    def __init__(self, label_source, corpus, word2vec_model, feat_path, rootpath=ROOT_PATH):
        label_vec_path = os.path.join('data', label_source, 'label_vec')
        label_id_file = os.path.join('data', label_source, 'label.txt')
        self.im2vec = Image2Vec(label_id_file, label_vec_path)
        self.qry2vec = Query2Vec(corpus, word2vec_model, rootpath)
        self.img_feats = BigFile(feat_path)
Example #42
def process(options, trainCollection, trainAnnotationName, valCollection,
            valAnnotationName, feature, modelName):
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    valConcepts = readConcepts(valCollection,
                               valAnnotationName,
                               rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(
        rootpath, trainCollection, 'Models', trainAnnotationName,
        '%s,best_params' % modelName,
        '%s,%s,%s' % (valCollection, valAnnotationName, feature))
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(
        os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        names, labels = readAnnotationsFrom(valCollection,
                                            valAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
            #print modelName, '>'*20, svm_params
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i]))
                        for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(
            INFO,
            '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' %
            (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        fw = open(resultfile, 'w')
        fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
        fw.close()
Example #43
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma =options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex', collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)
    
    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query =  dict(zip(qid_list, query_list))
    
    # input of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")


    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(calParzen(img_feats.read_one(imidx), test_X , sigma))

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list), key=lambda v:v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
             writeRankingResult(ranking_result_path, qid2iid_label_score)
             qid2iid_label_score = {}


    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0*sum(qid2dcg.values())/ len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file,'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
Example #44
class sentence:
    rootpath = ROOT_PATH
    vis_feat = VIS_FEAT
    sent_collection = SENT_COLLECTION
    img_collection = IMG_COLLECTION

    sent_feat_dir = os.path.join(rootpath, sent_collection, "FeatureData", vis_feat)
    sent_id_file = os.path.join(sent_feat_dir, 'id.txt')
    shape_file = os.path.join(sent_feat_dir, 'shape.txt')
    sent_file = os.path.join(rootpath, sent_collection, 'TextData', '%s.txt' % sent_collection)

    def __init__(self, db_file):
        self.nr_of_sents, self.feat_dim = map(int, open(self.shape_file).readline().split())
        self.sent_pool = map(str.strip, open(self.sent_file).readlines())
        self.sent_searcher = load_model(os.path.join(self.sent_feat_dir, 'feature.bin'), self.feat_dim,
                                        self.nr_of_sents, self.sent_id_file)
        self.sent_searcher.set_distance('cosine')
        feat_dir = os.path.join(self.rootpath, self.img_collection, "FeatureData", self.vis_feat)
        self.vis_feat_file = BigFile(feat_dir)
        imageSetFile = open(os.path.join(self.rootpath, self.img_collection, "ImageSets", "%s.txt"%self.img_collection), 'r')
        self.imageSet = imageSetFile.readlines()
        self.db_file = db_file

    def getSentence(self, imageID):
        image = [self.imageSet[imageID].replace("\n", "")]
        renamed, vectors = self.vis_feat_file.read(image)

        result = []
        for i in range(len(renamed)):
            sent_list = self.sent_searcher.search_knn(vectors[i], max_hits=10)
            logger.info('query img: %s', renamed[i])
            for sent_id, distance in sent_list[:5]:
                sent = self.sent_pool[int(sent_id[4:])].decode('utf-8')
                logger.info(sent)
                result.append(sent)
            print ('')
        return result

    def save_sentence(self, user_id, image_id, submit_time, suggested_sentence, rank, 
                        submitted_sentence, labels, real_id):
        conn = sqlite3.connect(self.db_file)
        conn.text_factory = str
        cursor = conn.execute("SELECT user_id FROM STATE \
                WHERE user_id = %d AND image_id = %d" % (int(user_id), image_id))
        judge = -1
        for row in cursor:
            judge = row[0]
        if judge == -1:
            conn.execute("INSERT INTO STATE (USER_ID, IMAGE_ID, SUBMIT_TIME, SUGGESTED_SENTENCE, RANK, SUBMITTED_SENTENCE, SUBMITTED_LABEL, REAL_IMAGE_ID) \
                VALUES (%d, %d, %f, '%s', %d, '%s', '%s', %d)" % (
                user_id, image_id, submit_time, suggested_sentence.decode('gbk'), rank,
                submitted_sentence.decode('gbk'), labels, real_id))
        else:
            conn.execute("UPDATE STATE SET submit_time=%f, suggested_sentence='%s', rank=%d, submitted_sentence='%s', submitted_label='%s' \
                WHERE user_id = %d AND image_id = %d" % (
                submit_time, suggested_sentence.decode('gbk'), rank, submitted_sentence.decode('gbk'), labels,
                int(user_id),
                image_id))
        conn.commit()
        conn.close()

    def get_sentence(self, user_id, page):
        data = []
        img = image.image(self.db_file)
        conn = sqlite3.connect(self.db_file)
        conn.text_factory = str
        cursor = conn.execute("SELECT count(image_id) FROM STATE WHERE user_id=%d" % user_id)
        for row in cursor:
            count = row[0]
        if (page - 1) * PAGE_LIMIT > count:
            return False, None, None

        cursor = conn.execute("SELECT user_id, image_id, submit_time, suggested_sentence, rank, submitted_sentence, submitted_label FROM STATE \
            WHERE user_id = %d ORDER BY submit_time DESC LIMIT %d OFFSET %d" % (
            user_id, PAGE_LIMIT, (page - 1) * PAGE_LIMIT))

        import userControl as u
        user_control = u.user(self.db_file)

        for row in cursor:
            image_id = row[1]

            j, iid, image_id = user_control.getimageid(user_id, image_id)
            url = IMAGE_ROOT + img.getimagename(iid)
            submit_time = row[2]
            suggested_sentence = row[3]
            rank = row[4]
            submitted_sentence = row[5]
            submitted_label = row[6]

            entry = {'image_id': image_id, 'url': url, 'submit_time': submit_time,
                     'suggested_sentence': suggested_sentence.encode('gbk'), 'rank': rank,
                     'submitted_sentence': submitted_sentence.encode('gbk'),
                     'submitted_label': submitted_label}

            data = data + [entry]
        conn.close()
        import math
        return True, data, math.ceil(float(count) / PAGE_LIMIT)

    def get_sentence_by_imageid(self, user_id, image_id):
        conn = sqlite3.connect(self.db_file)
        conn.text_factory = str
        cursor = conn.execute("SELECT user_id, image_id, submit_time, suggested_sentence, rank, submitted_sentence, submitted_label FROM STATE \
                WHERE user_id = %d AND image_id = %d" % (int(user_id), image_id))
        count = 0
        for row in cursor:
            submitted_label = row[6]
            submitted_sentence = row[5]
            rank = row[4]
            count += 1

        conn.close()

        if count == 0:
            return 0, '', ''
        logger.info(submitted_label)
        return rank, submitted_sentence.encode('gbk'), ', '.join(
            filter(lambda x: x, submitted_label.split(', ')))

    def getNumber(self, user_id):
        conn = sqlite3.connect(self.db_file)
        conn.text_factory = str
        cursor = conn.execute("SELECT user_id, image_id, submit_time, suggested_sentence, rank, submitted_sentence FROM STATE \
            WHERE user_id = %d" % int(user_id))
        number = 0
        for row in cursor:
            number += 1
        conn.close()
        return number

    def getAll(self, start, end):
        user_control = userControl.user(self.db_file)

        conn = sqlite3.connect(self.db_file)
        conn.text_factory = str
        cursor = conn.execute("SELECT user_id, count(image_id) FROM STATE WHERE SUBMIT_TIME < %f AND SUBMIT_TIME > %f \
                              GROUP BY user_id" % (end, start))
        logger.info("SELECT user_id, count(image_id) FROM STATE WHERE SUBMIT_TIME < %f AND SUBMIT_TIME > %f GROUP BY user_id" % (end, start))

        data = []
        for row in cursor:
            user_id = row[0]
            count = row[1]
            entry = [user_id, count, user_control.getusername(user_id)]
            data = data + [entry]
        conn.close()
        return data

    def get_image_info(self, image_id):
        user_control = userControl.user(self.db_file)

        conn = sqlite3.connect(self.db_file)
        cursor = conn.execute(
            "SELECT user_id, submitted_sentence, submitted_label FROM STATE WHERE real_image_id=%d" % image_id)
        data = []
        for row in cursor:
            user_id = row[0]
            submitted_sentence = row[1]
            submitted_label = row[2]
            entry = [user_control.getusername(user_id), submitted_sentence, submitted_label]
            data = data + [entry]
        conn.close()

        return data
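The SQL in the save_sentence method above is built by string interpolation, which breaks as soon as a sentence contains a quote character. A minimal sketch of the same INSERT using sqlite3 parameter binding; save_sentence_safe is a hypothetical helper, not part of the original class.

import sqlite3

def save_sentence_safe(db_file, user_id, image_id, submit_time,
                       suggested_sentence, rank, submitted_sentence, labels, real_id):
    # Same columns as the INSERT above, but values are passed as bound
    # parameters so sqlite3 handles quoting and escaping.
    conn = sqlite3.connect(db_file)
    conn.text_factory = str
    conn.execute(
        "INSERT INTO STATE (USER_ID, IMAGE_ID, SUBMIT_TIME, SUGGESTED_SENTENCE, "
        "RANK, SUBMITTED_SENTENCE, SUBMITTED_LABEL, REAL_IMAGE_ID) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
        (int(user_id), image_id, submit_time, suggested_sentence, rank,
         submitted_sentence, labels, real_id))
    conn.commit()
    conn.close()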
Example #45
0
def __init__(self, corpus, modelName, rootpath=ROOT_PATH):
    word2vec_dir = os.path.join(rootpath, "word2vec", corpus, modelName)
    self.word2vec = BigFile(word2vec_dir)
Example #46
0
File: tagger.py Project: silasxue/hierse
if __name__ == '__main__':
    rootpath = ROOT_PATH

    embedding_model = 'hierse2'
    embedding_name = 'flickr4m,tagvec500,%s' % embedding_model
    tagger = ZeroshotTagger(embedding_name = embedding_name)
    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', 'imagenet1k', embedding_name)
    from im2vec import Image2Vec
    i2v = Image2Vec(label_file, label2vec_dir)

    from basic.util import readImageSet
    testCollection = 'imagenet2hop'
    imset = readImageSet(testCollection, 'random100k', rootpath)
    feature = 'dascaffeprob'
    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))

    blocksize = 1000
    start = 0

    from eval import HitScorer

    scorers = [HitScorer(n) for n in [1, 2, 5, 10]]
    overall_perf = [0.0] * len(scorers)
    nr_of_images = 0

    while start < len(imset):
        end = min(len(imset), start + blocksize)
        renamed, vectors = feat_file.read(imset[start:end])

        for _id,_vec in zip(renamed, vectors):
Example #47
0
File: hiksvm.py Project: Peratham/jingwei
    from basic.annotationtable import readAnnotationsFrom
    from simpleknn.bigfile import BigFile
    
    ROOT_PATH = '/home/root123/xirong/VisualSearch'
    rootpath = ROOT_PATH
    trainCollection = 'flickr81train'
    trainAnnotationName = 'concepts81train.random50.0.random50.0.txt'
    testCollection = "flickr81test"
    testAnnotationName = 'conceptsflickr81test.txt'
    feature = "dascaffeprob"
    feat_dim = 1000
    scorer = getScorer("AP")
    
    targetConcept = sys.argv[1] #"aeroplane"

    train_feat_file = BigFile(os.path.join(ROOT_PATH, trainCollection, "FeatureData", feature), feat_dim)
    test_feat_file = BigFile(os.path.join(ROOT_PATH, testCollection, "FeatureData", feature), feat_dim)
    testImageSet = test_feat_file.names #random.sample(test_feat_file.names, 10000)
    
    minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
    with open(minmax_file, 'r') as f:
        min_vals = map(float, str.split(f.readline()))
        max_vals = map(float, str.split(f.readline()))


    [names,labels] = readAnnotationsFrom(collection=trainCollection, annotationName=trainAnnotationName, concept=targetConcept, rootpath=rootpath)
    name2label = dict(zip(names,labels))
    (renamed, vectors) = train_feat_file.read(names)
    relabeled = [name2label[x] for x in renamed] #label is either 1 or -1
    
    [names,labels] = readAnnotationsFrom(collection=testCollection, annotationName=testAnnotationName, concept=targetConcept, rootpath=rootpath)
Example #48
0
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName, feature, modelName):
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}
    
    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))    
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'


    
    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection,valAnnotationName, rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert(concepts[i] == valConcepts[i])
    
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, '%s,best_params'%modelName, '%s,%s,%s' % (valCollection,valAnnotationName,feature))
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    val_feat_file = BigFile(os.path.join(rootpath,valCollection,'FeatureData',feature))
    feat_dim = train_feat_file.ndims
    assert(feat_dim == val_feat_file.ndims)

    
    for concept in todo:
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        npos = len([1 for lab in labels if 1 == lab])
        nneg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (npos + nneg) / npos
        wn = (1.0 - beta) * (npos + nneg) / nneg
    
        names,labels = readAnnotationsFrom(valCollection, valAnnotationName, concept, skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names,labels))
        val_renamed, val_vectors = val_feat_file.read(names)
        
        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C) 
            else:
                svm_params = '-c %g' % C
            
            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
            #print modelName, '>'*20, svm_params
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i])) for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v:v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C
                
        [A,B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        
        printStatus(INFO, '%s -> worstAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' % (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        fw = open(resultfile, 'w')
        fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
        fw.close()
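sigmoid_train comes from the project's SVM utilities and fits a Platt-style sigmoid to the validation scores. Assuming the usual LIBSVM convention, where the fitted pair (A, B) maps a raw decision value f to P(y=1|f) = 1 / (1 + exp(A*f + B)), a hedged sketch of the corresponding prediction step looks like this:

import math

def sigmoid_predict(score, A, B):
    # Numerically stable evaluation of 1 / (1 + exp(A*score + B)).
    fApB = A * score + B
    if fApB >= 0:
        return math.exp(-fApB) / (1.0 + math.exp(-fApB))
    return 1.0 / (1.0 + math.exp(fApB))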
Example #49
0
class ImageSimer:
    def __init__(self, dev_feat_path, train_feat_path):
        self.dev_feats = BigFile(dev_feat_path)
        self.train_feats =  BigFile(train_feat_path)


    def calsimImage(self, img, imgs):
        imgfeat = self.dev_feats.read_one(img)

        renamed, test_X = self.train_feats.read(imgs)
        resorted_feats = [None] * len(renamed)
        for i in xrange(len(renamed)):
            idx = imgs.index(renamed[i])
            resorted_feats[idx] = test_X[i]

        return calImageSimiByCos( imgfeat, resorted_feats)


    def calsimiImagewithClick(self, img, img_click_list, clickthres):
        
        imgfeat = self.dev_feats.read_one(img)

        img_list = [x[0] for x in img_click_list if int(x[1]) >= clickthres]
        clc_list = [int(x[1]) for x in img_click_list if int(x[1]) >= clickthres]
        assert (len(img_list) == len(clc_list))

        renamed, test_X = self.train_feats.read(img_list)

        # re-sort the label list according to the renamed
        resorted_feats = [None] * len(renamed)
        for i in xrange(len(renamed)):
            idx = img_list.index(renamed[i])
            resorted_feats[idx] = test_X[i]

        img_simi = calImageSimiByCos( imgfeat, resorted_feats)

        return sum(np.array(img_simi) * np.log(clc_list)) / len(img_list)


    def clasimiImgwithWeightImgs(self, img, imgs, weightes):

        assert(len(imgs) == len(weightes))
        imgfeat = self.dev_feats.read_one(img)
        renamed, feats = self.train_feats.read(imgs)
        
        # re-sort the label list according to the renamed
        resorted_weight = [None] * len(weightes)
        for i in xrange(len(renamed)):
            idx = imgs.index(renamed[i])
            resorted_weight[i] = weightes[idx]

        simi_list = calImageSimiByCos(imgfeat, feats)

        normal_weight = np.array(resorted_weight) / sum(resorted_weight)

        score = np.dot(normal_weight, np.array(simi_list) )

        return score


    def simiImgs_WeightImgs(self, t_img_list, s_img_list, weightes):
        assert(len(s_img_list) == len(weightes))

        t_renamed, t_feats = self.dev_feats.read(t_img_list)
        s_renamed, s_feats = self.train_feats.read(s_img_list)

        # re-sort the label list according to the renamed
        resorted_weight = [None] * len(weightes)
        for i in xrange(len(s_renamed)):
            idx = s_img_list.index(s_renamed[i])
            resorted_weight[i] = weightes[idx]
        normal_weight = np.array(resorted_weight) / sum(resorted_weight)

        cosineSimi = -(distance.cdist(t_feats, s_feats, 'cosine')-1)
        weightSimi = np.dot(cosineSimi, normal_weight)

        renamed2sim = dict(zip(t_renamed, list(weightSimi)))
        final_score = []
        for key in t_img_list:
            final_score.append(renamed2sim[key])
        return final_score
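calImageSimiByCos is defined elsewhere in the project. Below is a minimal stand-in that matches how it is called above and mirrors the 1 - cosine-distance computation in simiImgs_WeightImgs; the real implementation may differ.

import numpy as np
from scipy.spatial import distance

def calImageSimiByCos(imgfeat, feats):
    # Cosine similarity between one query feature and each row of feats.
    imgfeat = np.asarray(imgfeat, dtype=np.float32).reshape(1, -1)
    feats = np.asarray(feats, dtype=np.float32)
    return list(1.0 - distance.cdist(imgfeat, feats, 'cosine')[0])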
Example #50
0
    testAnnotationName = 'conceptsvoc2008val.txt'

    feature = 'dsift'
    modelName = 'fastlinear'
    modelName = 'fik50'
    metric = 'AP'
    scorer = getScorer(metric)


    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    test_imset = readImageSet(testCollection, testCollection, rootpath=rootpath)
    test_feat_file = BigFile(os.path.join(rootpath,testCollection,'FeatureData',feature))
    test_renamed, test_vectors = test_feat_file.read(test_imset)

    concepts = readConcepts(testCollection, testAnnotationName, rootpath=rootpath)

    print ('### %s' % os.path.join(trainCollection, 'Models', trainAnnotationName, feature, modelName))
    results = []

    for concept in concepts:
        model_file_name = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName, '%s.model' % concept)
        model = load_model(model_file_name)

        ranklist = [(test_renamed[i], model.predict(test_vectors[i])) for i in range(len(test_renamed))]
        ranklist.sort(key=lambda v:v[1], reverse=True)

        names,labels = readAnnotationsFrom(testCollection, testAnnotationName, concept, skip_0=True, rootpath=rootpath)
Example #51
0
File: irc_image.py Project: danieljf/cmrf
def __init__(self, dev_feat_path, train_feat_path):
    self.dev_feats = BigFile(dev_feat_path)
    self.train_feats = BigFile(train_feat_path)
Example #52
0
File: test_all.py Project: silasxue/hierse
    def test_tagging(self):
        corpus = 'flickr4m'
        word2vec_model = 'tagvec500'
        testCollection = 'imagenet2hop-random2k'
        imset = readImageSet(testCollection, testCollection, rootpath)
        feature = 'dascaffeprob'

        feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
        blocksize = 1000
        scorers = [HitScorer(n) for n in [1, 2, 5, 10]]

        overwrite = 1

        for embedding_model in str.split('conse conse2 hierse hierse2'):
            embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)

            for synset_name in str.split('imagenet1k imagenet1k2hop'):
                if 'imagenet1k' == synset_name:
                    label_file = 'data/ilsvrc12/synsets.txt'
                else:
                    label_file = 'data/ilsvrc12/synsets2hop.txt'

                params = '%s %s --embedding %s --word2vec %s --corpus %s --overwrite %d' % (label_file, synset_name, embedding_model, word2vec_model, corpus, overwrite)
                os.system('python build_synset_vec.py %s' % params)
                shape_file = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name, 'shape.txt')
                self.assertTrue(os.path.exists(shape_file), msg="%s is not ready" % synset_name)

    
            synset_name = 'imagenet1k'
            label_file = 'data/ilsvrc12/synsets.txt'
            label2vec_dir = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)
            i2v = Image2Vec(label_file, label2vec_dir)

            tagger = ZeroshotTagger(embedding_name = embedding_name)
            printStatus(INFO, 'tagging %d images' % len(imset))

            start = 0

    
            overall_perf = [0.0] * len(scorers)
            nr_of_images = 0

            while start < len(imset):
                end = min(len(imset), start + blocksize)
                renamed, vectors = feat_file.read(imset[start:end])

                for _id,_vec in zip(renamed, vectors):
                    truth = set([_id.split('_')[0]])
                    im_vec = i2v.embedding(_vec)
                    pred = tagger.predict(im_vec)
                    sorted_labels = [int(x[0] in truth) for x in pred]
                    perf = [scorer.score(sorted_labels) for scorer in scorers]
                    overall_perf = [overall_perf[i] + perf[i] for i in range(len(scorers))]
                    nr_of_images += 1

                start = end
    
            res = [x/nr_of_images for x in overall_perf]
            print '_'*100
            print embedding_name
            print ' '.join([x.name() for x in scorers])
            print ' '.join(['%.3f' % x for x in res])
            print '_'*100
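HitScorer is imported from the project's eval module. A minimal sketch consistent with how it is used here, where score(sorted_labels) should be 1.0 if a relevant item appears among the top n ranked labels and 0.0 otherwise; the real implementation may differ.

class HitScorer:
    def __init__(self, n):
        self.n = n

    def name(self):
        return 'Hit@%d' % self.n

    def score(self, sorted_labels):
        # sorted_labels is a ranked list of 0/1 relevance labels.
        return 1.0 if any(sorted_labels[:self.n]) else 0.0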
Example #53
0
    testset = testCollection
    testAnnotationName = 'conceptsvoc2008val.txt'

    modelName = 'fik50' 
    #modelName = 'fastlinear'
    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model

    scorer = getScorer(metric)
    
    imset = readImageSet(testCollection,testset,rootpath=rootpath)
    concepts = readConcepts(testCollection,testAnnotationName,rootpath=rootpath)
    feat_dir = os.path.join(rootpath, testCollection, "FeatureData", feature)
    feat_file = BigFile(feat_dir)

    _renamed, _vectors = feat_file.read(imset)

    nr_of_images = len(_renamed)
    nr_of_concepts = len(concepts)
    
    mAP = 0.0
    models = [None] * len(concepts)

    stream = StreamFile(feat_dir)

    for i,concept in enumerate(concepts):
        model_file_name = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName, '%s.model'%concept)
        model = load_model(model_file_name)
        #print model.get_probAB()
Example #54
0
def __init__(self, corpus, modelName, wnid2words_file='data/wnid2words.pkl', rootpath=ROOT_PATH):
    printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
    word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', modelName)
    self.wnid2words = pickle.load(open(wnid2words_file, 'rb'))
    self.word2vec = BigFile(word2vec_dir)