Example No. 1
def process(options, collection, annotationName, pos_num):
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    
    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)

    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath,collection,'Annotations',newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts,resultfile)

    first,second,last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath,collection,'annotationfiles',first + '0-%d'%(pos_bag_num-1) + second + '0-%d'%(neg_bag_num-1) + last)

    makedirsforfile(scriptfile)
    fout = open(scriptfile,'w')
    fout.write('\n'.join(newAnnotationNames) + '\n')
    fout.close()

    if len(newAnnotationNames) == skip:
        return 0
        
    for concept in concepts:
        names,labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [x[0] for x in zip(names,labels) if x[1]>0]
        negativePool = [x[0] for x in zip(names,labels) if x[1]<0]
        
        for idxp in range(pos_bag_num):
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool
            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath,collection,'Annotations','Image',newAnnotationName,'%s.txt'%concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue
                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)
                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept,anno_idx,len(positiveBag),len(negativeBag),resultfile))
                writeAnnotations(positiveBag + negativeBag, [1]*len(positiveBag) + [-1]*len(negativeBag), resultfile)
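All of the examples in this listing lean on the same small set of I/O helpers (checkToSkip, makedirsforfile, printStatus, readConcepts, ...). Their real implementations are not part of the listing; a minimal sketch of the three generic ones, consistent with how they are called above, could look like this (hypothetical, for illustration only):

import os

INFO = 'INFO'  # assumption: the INFO tag passed to printStatus is just a label

def checkToSkip(filename, overwrite):
    # Return 1 (skip) when the result already exists and overwrite is not requested.
    if os.path.exists(filename) and not overwrite:
        print('%s exists. skip' % filename)
        return 1
    return 0

def makedirsforfile(filename):
    # Create the parent directory of filename if it does not exist yet.
    dirname = os.path.dirname(filename)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)

def printStatus(tag, message):
    # Lightweight status logging used throughout the examples.
    print('[%s] %s' % (tag, message))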
Example No. 2
def process(options, collection, conceptfile):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    concepts = [x.strip() for x in open(conceptfile).readlines() if x.strip() and not x.strip().startswith('#')]
    resultdir = os.path.join(rootpath, collection, 'tagged,%s'%tpp)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, '%s.txt'%concept)
        if checkToSkip(resultfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        printStatus(INFO, 'nothing to do')
        return 0

    try:
        holdoutfile = os.path.join(rootpath,collection,'ImageSets','holdout.txt')
        holdoutSet = set(map(str.strip,open(holdoutfile).readlines()))
    except:
        holdoutSet = set()

    hitlists = buildHitlists(collection, todo, tpp, rootpath)
    min_hit = 1e6
    max_hit = 0

    for concept in todo:
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resultfile,overwrite):
            continue
        subconcepts = concept.split('-')
        labeledSet = set(hitlists[subconcepts[0]])
        for i in range(1,len(subconcepts)):
            labeledSet = labeledSet.intersection(hitlists[subconcepts[i]])
        labeledSet = labeledSet.difference(holdoutSet)
        if len(labeledSet) == 0:
            printStatus(INFO, '%s has ZERO hit' % concept)
        else:
            printStatus(INFO, '%s, %d hits -> %s' %(concept, len(labeledSet), resultfile))
            makedirsforfile(resultfile)
            fw = open(resultfile, 'w')
            fw.write('\n'.join(labeledSet) + '\n')
            fw.close()
        if len(labeledSet) > max_hit:
            max_hit = len(labeledSet)
        if len(labeledSet) < min_hit:
            min_hit = len(labeledSet)
            
    printStatus(INFO, 'max hits: %d, min hits: %d' % (max_hit, min_hit))
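The process(options, ...) functions shown here are normally driven by a small optparse main. A hypothetical driver for this example (option names mirror the attributes accessed above: rootpath, tpp, overwrite):

import sys
from optparse import OptionParser

def main(argv=None):
    parser = OptionParser(usage='%prog [options] collection conceptfile')
    parser.add_option('--rootpath', default='./data', help='root path of all collections (assumed default)')
    parser.add_option('--tpp', default='lemm', help='tag preprocessing: lemm or stem')
    parser.add_option('--overwrite', type='int', default=0, help='overwrite existing result files')
    options, args = parser.parse_args(argv)
    if len(args) != 2:
        parser.print_help()
        return 1
    return process(options, args[0], args[1])

if __name__ == '__main__':
    sys.exit(main())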
Example No. 3
def process(options, collection, conceptfile):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    concepts = [x.strip() for x in open(conceptfile).readlines() if x.strip() and not x.strip().startswith('#')]
    resultdir = os.path.join(rootpath, collection, 'tagged,%s'%tpp)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, '%s.txt'%concept)
        if checkToSkip(resultfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        print ('nothing to do')
        return 0

    try:
        holdoutfile = os.path.join(rootpath,collection,'ImageSets','holdout.txt')
        holdoutSet = set(map(str.strip,open(holdoutfile).readlines()))
    except:
        holdoutSet = set()

    hitlists = buildHitlists(collection, todo, tpp, rootpath)
    min_hit = 1e6
    max_hit = 0

    for concept in todo:
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resultfile,overwrite):
            continue
        subconcepts = concept.split('-')
        labeledSet = set(hitlists[subconcepts[0]])
        for i in range(1,len(subconcepts)):
            labeledSet = labeledSet.intersection(hitlists[subconcepts[i]])
        labeledSet = labeledSet.difference(holdoutSet)
        if len(labeledSet) == 0:
            printStatus(INFO, '%s has ZERO hit' % concept)
        else:
            printStatus(INFO, '%s, %d hits -> %s' %(concept, len(labeledSet), resultfile))
            makedirsforfile(resultfile)
            fw = open(resultfile, 'w')
            fw.write('\n'.join(labeledSet) + '\n')
            fw.close()
        if len(labeledSet) > max_hit:
            max_hit = len(labeledSet)
        if len(labeledSet) < min_hit:
            min_hit = len(labeledSet)
            
    printStatus(INFO, 'max hits: %d, min hits: %d' % (max_hit, min_hit))
Example No. 4
def process(options):

    overwrite = options.overwrite
    inputeFile = options.inputeFile
    weightFile = options.weightFile

    weightFile = os.path.join('result', weightFile)
    if checkToSkip(weightFile, overwrite):
        sys.exit(0)
    makedirsforfile(weightFile)

    test()
    print '-' * 70
    best_perf = -10
    best_alpha = None

    sigma = 0.001
    data = load_data(os.path.join('result', inputeFile))
    for i in range(1):
        perf, alpha = coordinate_ascent(data, sigma)
        if perf > best_perf:
            best_perf = perf
            best_alpha = alpha
        print '*' * 70
    print 'optimized weights:', ' '.join(['%g' % x for x in best_alpha])
    print 'best tuned performance:', best_perf

    open(weightFile, 'w').write(' '.join(map(str, best_alpha)))
    print 'optimized weight parameters have been written to %s' % weightFile
Example No. 5
def process(options):

    overwrite = options.overwrite
    inputeFile = options.inputeFile
    weightFile = options.weightFile


    weightFile = os.path.join('result', weightFile)
    if checkToSkip(weightFile, overwrite):
        sys.exit(0)
    makedirsforfile(weightFile)

    test()
    print '-'*70
    best_perf = -10
    best_alpha = None
    
    sigma = 0.001
    data = load_data(os.path.join('result', inputeFile))
    for i in range(1):
        perf, alpha = coordinate_ascent(data, sigma)
        if perf > best_perf:
            best_perf = perf
            best_alpha = alpha
        print '*'*70
    print 'optimized weights:', ' '.join(['%g'%x for x in best_alpha])
    print 'best tuned performance:', best_perf

    open(weightFile, 'w').write(' '.join(map(str,best_alpha)))
    print 'optimized weight parameters have been written to %s' % weightFile
Example No. 6
def process(options, tagfile, tpp):
    if "stem" == tpp:
        worker = nltk.PorterStemmer()
        func = stemming
    else:
        worker = nltk.WordNetLemmatizer()
        func = lemmatize

    resultfile = os.path.join(os.path.split(tagfile)[0], 'id.userid.%stags.txt' % tpp)
    if checkToSkip(resultfile, options.overwrite):
        return 0

    makedirsforfile(resultfile)

    fw = codecs.open(resultfile, "w", encoding='utf8')
    parsed = 0
    obtained = 0
    for line in open(tagfile):
        elems = line.strip().split()
        parsed += 1
        if len(elems) > 2:
            newtags = []
            for tag in elems[2:]:
                try:
                    newtag = func(worker,tag.lower())
                except:
                    newtag = tag
                
                newtags.append(newtag.decode('utf-8'))

            newline = "\t".join([elems[0], elems[1], " ".join(newtags)])
            fw.write('%s\n' % newline)
            obtained += 1
    fw.close()
    print ('%d lines parsed, %d records obtained' % (parsed, obtained) )
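The func(worker, tag) calls above assume two small adapters around the NLTK objects. Those helpers are not shown in this listing; a plausible sketch:

def stemming(worker, word):
    # worker is an nltk.PorterStemmer instance
    return worker.stem(word)

def lemmatize(worker, word):
    # worker is an nltk.WordNetLemmatizer instance
    return worker.lemmatize(word)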
Example No. 7
def process(options, conceptfile, tagvotesfile, resultfile):
    if checkToSkip(resultfile, options.overwrite):
        return 0
    
    concepts = map(str.strip, open(conceptfile).readlines())
    concept2index = dict(zip(concepts,range(len(concepts))))
    
    data = open(tagvotesfile).readlines()
    print ('%d instances to dump' % len(data))
    
    concept_num = len(concepts)
    image_num = len(data)
    scores = np.zeros((image_num, concept_num)) - 1e4
    id_images = [None] * image_num
    
    for i in xrange(image_num):
        elems = str.split(data[i])
        id_images[i] = int(elems[0])
        del elems[0]
        for k in range(0, len(elems), 2):
            tag = elems[k]
            score = float(elems[k+1])
            j = concept2index.get(tag, -1)
            if j >= 0:
                scores[i,j] = score
    
    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'concepts':concepts, 'id_images':id_images, 'scores':scores}, output, -1)
    output.close()
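The pickle written above holds three parallel objects: the concept list, the image ids, and an image-by-concept score matrix. Reading it back and looking up a single score could be done roughly like this (sketch; 'scores.pkl' is a hypothetical file name):

import pickle

with open('scores.pkl', 'rb') as f:
    data = pickle.load(f)

concepts = data['concepts']      # list of concept names
id_images = data['id_images']    # list of image ids, one per row
scores = data['scores']          # numpy array of shape (len(id_images), len(concepts))

# score of the first image for the concept 'dog', if that concept exists
if 'dog' in concepts:
    print(scores[0, concepts.index('dog')])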
Example No. 8
def process(opt):

    rootpath = opt.rootpath
    collection = opt.collection
    feature = opt.feature
    overwrite = opt.overwrite

    feat_path = os.path.join(rootpath, collection, "FeatureData", feature)
    result_file = os.path.join(feat_path, "video2frames.txt")
    if checkToSkip(result_file, overwrite):
        sys.exit(0)
    makedirsforfile(result_file)

    feat_data = BigFile(feat_path)
    video2fmnos = {}
    int2str = {}
    for frame_id in feat_data.names:
        data = frame_id.strip().split("_")
        #print data
        video_id = '_'.join(data[:-1])
        fm_no = data[-1]
        video2fmnos.setdefault(video_id, []).append(int(fm_no))
        if int(fm_no) not in int2str:
            int2str[int(fm_no)] = fm_no

    video2frames = {}
    for video_id, fmnos in video2fmnos.iteritems():
        for fm_no in sorted(fmnos):
            video2frames.setdefault(video_id,
                                    []).append(video_id + "_" + int2str[fm_no])

    write_dict(result_file, video2frames)
    print "write out into: ", result_file
Example No. 9
def process(options, collection, annotationName, simdir, resultfile):
    rootpath = options.rootpath

    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)

    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print('%d instances, %d concepts to dump -> %s' %
          (image_num, concept_num, resultfile))

    scores = np.zeros((image_num, concept_num)) - 1e4

    for c_id, concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        ranklist = readRankingResults(simfile)
        for im, score in ranklist:
            idx = im2index[im]
            scores[idx, c_id] = score

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump(
        {
            'concepts': concepts,
            'id_images': map(int, id_images),
            'scores': scores
        }, output, -1)
    output.close()
Example No. 10
def process(options, conceptfile, tagvotesfile, resultfile):
    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = map(str.strip, open(conceptfile).readlines())
    concept2index = dict(zip(concepts, range(len(concepts))))

    data = open(tagvotesfile).readlines()
    print('%d instances to dump' % len(data))

    concept_num = len(concepts)
    image_num = len(data)
    scores = np.zeros((image_num, concept_num)) - 1e4
    id_images = [None] * image_num

    for i in xrange(image_num):
        elems = str.split(data[i])
        id_images[i] = elems[0]  #int(elems[0])
        del elems[0]
        for k in range(0, len(elems), 2):
            tag = elems[k]
            score = float(elems[k + 1])
            j = concept2index.get(tag, -1)
            if j >= 0:
                scores[i, j] = score

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump(
        {
            'concepts': concepts,
            'id_images': id_images,
            'scores': scores
        }, output, -1)
    output.close()
Example No. 11
def process(options, source_dir, feat_dim, imsetfile, result_dir):

    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    featurefile = BigFile(source_dir, feat_dim)
    
    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0
  
    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
        renamed, vectors = featurefile.read(imset[start:end])
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert(len(done) == len(set(done)))
    resultfile = os.path.join(result_dir, 'id.txt')
    fw = open(resultfile, 'w')
    fw.write(' '.join(done))
    fw.close()

    print '%d requested, %d obtained' % (len(imset), len(done))
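The feature.bin produced above is simply the float32 vectors written back to back, in the order stored in id.txt. A sketch of reading it back with numpy (feat_dim is an assumption here; in practice it comes from the shape.txt of the source feature):

import numpy as np

feat_dim = 1024  # assumed dimensionality of the feature

with open('id.txt') as f:
    ids = f.read().strip().split()

vectors = np.fromfile('feature.bin', dtype=np.float32).reshape(len(ids), feat_dim)
print('%d vectors of dimension %d' % vectors.shape)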
Example No. 12
def process(options):

    overwrite = options.overwrite
    inputeFile = options.inputeFile
    weightFile = options.weightFile
    resultFile = options.resultFile

        
    weightFile = os.path.join('result', weightFile)
    weight = open(weightFile).readline().strip().split()
    weight = np.array(weight, dtype=np.float)

    resultFile = os.path.join('result', resultFile)
    if checkToSkip(resultFile, overwrite):
        sys.exit(0)
    fout = open(resultFile, 'w')

    done = 0
    for line in open(os.path.join('result', inputeFile)):
        elems = line.strip().split()
        vecs = map(float, elems[3:])
        vecs = np.array(vecs, dtype=np.float)
        assert(len(weight) == len(vecs))

        fout.write(" ".join(elems[:2]) + " " + str(np.dot(weight, vecs)) + '\n')

        done += 1
        if done % 10000 == 0:
            print done, 'Done'

    fout.close()
    print "final score result after relevance fusion have written in %s" % resultFile
Example No. 13
def process(options, collection, annotationName, simdir, resultfile):
    rootpath = options.rootpath
    
    if checkToSkip(resultfile, options.overwrite):
        return 0
    
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)

    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print ('%d instances, %d concepts to dump -> %s' % (image_num, concept_num, resultfile))
    
    scores = np.zeros((image_num, concept_num)) - 1e4
    
    for c_id,concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        ranklist = readRankingResults(simfile)
        for im,score in ranklist:
            idx = im2index[im]
            scores[idx,c_id] = score

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'concepts':concepts, 'id_images':map(int,id_images), 'scores':scores}, output, -1)
    output.close()
Example No. 14
def main(option):
    rootpath = option.rootpath
    collection = option.collection
    threshold = option.threshold
    text_style = option.text_style

    vocab_file = os.path.join(rootpath, collection, 'TextData', 'vocabulary',
                              text_style, 'word_vocab_%d.pkl' % threshold)
    counter_file = os.path.join(os.path.dirname(vocab_file),
                                'word_vocab_counter_%s.txt' % threshold)

    if checkToSkip(vocab_file, option.overwrite):
        sys.exit(0)
    makedirsforfile(vocab_file)

    vocab, word_counter = build_vocab(collection,
                                      text_style,
                                      threshold=threshold,
                                      rootpath=rootpath)
    with open(vocab_file, 'wb') as writer:
        pickle.dump(vocab, writer, pickle.HIGHEST_PROTOCOL)
    logger.info("Saved vocabulary file to %s", vocab_file)

    word_counter = [(word, cnt) for word, cnt in word_counter.items()
                    if cnt >= threshold]
    word_counter.sort(key=lambda x: x[1], reverse=True)
    with open(counter_file, 'w') as writer:
        writer.write('\n'.join(map(lambda x: x[0] + ' %d' % x[1],
                                   word_counter)))
    logger.info("Saved vocabulary counter file to %s", counter_file)
Example No. 15
def process(options, collection, annotationName):
    rootpath = options.rootpath
    overwrite = options.overwrite
    neg_filter = options.neg_filter
    
    concepts = readConcepts(collection, annotationName, rootpath)
    newAnnotationName = annotationName[:-4] + 'social.txt'
    ne = STRING_TO_NEGATIVE_ENGINE[neg_filter](collection, rootpath)

    newConcepts = []
    for concept in concepts:
        resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image', newAnnotationName, '%s.txt'%concept)
        if checkToSkip(resultfile, overwrite):
            newConcepts.append(concept)
            continue

        try:
            pos_set = readLabeledImageSet(collection, concept, tpp='lemm', rootpath=rootpath)
        except:
            pos_set = None 
        if not pos_set:
            printStatus(INFO, '*** %s has no labeled examples, will be ignored ***' % concept)
            continue
        neg_set = ne.sample(concept, int(1e8))
        assert(len(set(pos_set).intersection(set(neg_set))) == 0)
        newlabels = [1] * len(pos_set) + [-1] * len(neg_set)
        newnames = pos_set + neg_set
        printStatus(INFO, "anno(%s) %d pos %d neg -> %s" % (concept,len(pos_set),len(neg_set),resultfile))
        writeAnnotations(newnames, newlabels, resultfile)
        newConcepts.append(concept)

    writeConceptsTo(newConcepts, collection, newAnnotationName, rootpath)
Example No. 16
def process(options, testCollection, trainCollection, tagsimMethod):
    rootpath = options.rootpath
    overwrite = options.overwrite
    testsetName = options.testset if options.testset else testCollection 
    tpp = options.tpp
    numjobs = options.numjobs
    job = options.job
    useWnVob = 1

    outputName = (tagsimMethod + '-wn') if useWnVob else tagsimMethod

    if tagsimMethod == 'wns':
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, outputName,'id.tagvotes.txt')
    else:    
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, trainCollection, outputName,'id.tagvotes.txt')
    if numjobs>1:
        resultfile = resultfile.replace("id.tagvotes.txt", "id.tagvotes.%d.%d.txt" % (numjobs,job))

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    makedirsforfile(resultfile)

    try:
        doneset = set([str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()
        
    printStatus(INFO, "done set: %d" % len(doneset))

 
    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [testImageSet[i] for i in range(len(testImageSet)) if (i%numjobs+1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs,job,len(testImageSet),resultfile) )
    
    testreader = TagReader(testCollection, rootpath=rootpath)    

    if tagsimMethod == "wns":
        tagrel = SIM_TO_TAGREL["wns"](trainCollection, useWnVob, "wup", rootpath)
    else:
        tagrel = SIM_TO_TAGREL[tagsimMethod](trainCollection, useWnVob, rootpath)

 
    done = 0
    fw = open(resultfile, "w")
    
    for qry_id in testImageSet:
        qry_tags = testreader.get(qry_id)    
        tagvotes = tagrel.estimate(qry_tags)
        newline = qry_id + " " + " ".join(["%s %s" % (tag, niceNumber(vote,8)) for (tag,vote) in tagvotes])
        fw.write(newline+"\n")
        done += 1
        if done%1000 == 0:
            printStatus(INFO, "%d done" % done)
    # done    
    fw.close()
    printStatus(INFO, "%d done" % done)
Example No. 17
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    assert(modelName.startswith('fastlinear'))
    
    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk
    
    outputName = '%s,%s' % (feature,modelName)
    
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs>1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i%numjobs+1 == job]
    test_imset = set(test_imset)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    ma = ModelArray(trainCollection, trainAnnotationName, feature, modelName, rootpath=rootpath)
        
    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0

    feat_file.open()
    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue
       
        res = ma.predict([_vec],prob=0)
        tagvotes = res[0]
        if topk>0:
            tagvotes = tagvotes[:topk]
        newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes]))
        fw.write(newline)
        done += 1
        if done % 1e4  == 0:
            printStatus(INFO, "%d done" % done)

    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % (done))
    return done
Example No. 18
def process(options, testCollection):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, embedding_name, pY0, 'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name, rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)
    

    printStatus(INFO, 'tagging %d images' % len(imset))
    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    start = 0
    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end))
        todo = imset[start:end]
        if not todo:
            break

        renamed, vectors = feat_file.read(todo)
        output = []
        for _id,_vec in zip(renamed, vectors):
            im_vec = i2v.embedding(_vec)
            pred = tagger.predict(im_vec, topk=options.r)
            output.append('%s %s\n' % (_id, ' '.join(['%s %s'%(x[0],x[1]) for x in pred])))
        start = end
        fw.write(''.join(output))

    fw.close()
Example No. 19
def process(options, trainCollection, baseAnnotationName, startAnnotationName, feature, modelName):
    global train_model, compress_model, save_model
    assert(modelName in ['fik', 'fastlinear'])
    if 'fik' == modelName:
        from model_based.svms.fiksvm.svmutil import svm_train as train_model
        from model_based.svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from model_based.svms.fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from model_based.svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from model_based.svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from model_based.svms.fastlinear.fastlinear import fastlinear_save_model as save_model


    rootpath = options.rootpath
    overwrite = options.overwrite
    params = {'rootpath': rootpath, 'trainCollection': trainCollection, 'baseAnnotationName': baseAnnotationName,
              'startAnnotationName': startAnnotationName, 'feature': feature, 'model': modelName, 'strategy': options.strategy,
              'iterations': options.iterations, 'npr': options.npr, 'nr_bins': options.nr_bins}

    concepts = readConcepts(trainCollection, startAnnotationName, rootpath)
    newAnnotationName = get_new_annotation_name(params)
    newModelName = get_model_name(params)
    modeldir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, newModelName)
    todo = [concept for concept in concepts if overwrite or os.path.exists(os.path.join(modeldir,'%s.txt'%concept)) is False]
    activeConcepts = [todo[i] for i in range(len(todo)) if (i%options.numjobs+1) == options.job]

    params['feat_file'] = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    if 'fik' == modelName:
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))    

        
    s_time = time.time()

    for concept in activeConcepts:
        printStatus(INFO, 'processing %s' % concept)
        modelfile = os.path.join(modeldir, '%s.model'%concept)
        if checkToSkip(modelfile, overwrite):
            continue
        new_model = NegativeBootstrap.learn(concept, params)
        makedirsforfile(modelfile)
        printStatus(INFO, 'save model to %s' % modelfile)
        save_model(modelfile, new_model)
        printStatus(INFO, '%s done' % concept)
        
    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(activeConcepts), ' '.join(activeConcepts)))
    printStatus(INFO, 'models stored at %s' % modeldir)
    printStatus(INFO, '%g seconds in total' % timecost)
Example No. 20
def process(options, trainCollection, annotationName, testCollection):
    rootpath = options.rootpath
    m = options.m
    k_r = options.kr
    k_d = options.kd
    k_s = options.ks
    k_c = options.kc
    feature = options.feature
    add_bonus = options.bonus
    overwrite = options.overwrite
    
    #outputName = 'cotag,m%d,kr%d,kd%d,ks%d,kc%d,bonus%d'%(m,k_r,k_d,k_s,k_c,add_bonus)
    outputName = 'cotag' # simplify the outputName to reduce the length of the result filename
    outputName = os.path.join(outputName, feature) if (k_c>1e-6) else outputName
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, annotationName, outputName, 'id.tagvotes.txt')

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)
     
    testImageSet = readImageSet(testCollection, testCollection, rootpath=rootpath)
    test_tag_reader = TagReader(testCollection, rootpath=rootpath)
    
    if k_c < 1e-6:
        tagger = TagCooccurTagger(testCollection, trainCollection, annotationName, rootpath=rootpath)
    else:
        tagger = TagCooccurPlusTagger(testCollection, trainCollection, annotationName, feature=feature, rootpath=rootpath)
    tagger.m = m
    tagger.k_r = k_r
    tagger.k_d = k_d
    tagger.k_s = k_s
    tagger.k_c = k_c
    tagger.add_bonus = add_bonus
    
    makedirsforfile(resultfile)
    
    fw = open(resultfile, 'w')
    
    output = []
    done = 0
    for im in testImageSet:
        user_tags = test_tag_reader.get(im)
        tagvotes = tagger.predict(content=im, context=user_tags)
        newline = '%s %s' % (im, ' '.join(['%s %s'%(x[0], niceNumber(x[1],6)) for x in tagvotes]))
        output.append(newline)
        done += 1
        if len(output) % 1e4 == 0:
            fw.write('\n'.join(output) + '\n')
            output=[]
            printStatus(INFO, '%d done' % done)
    if output:
        fw.write('\n'.join(output) + '\n')
    fw.close()
    printStatus(INFO, '%d done' % done)
Example No. 21
def process(options, collection):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, collection, "tagrel", collection,
                              'tagpos,%s' % tpp, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    imset = readImageSet(collection, collection, rootpath)
    printStatus(INFO,
                'working on %d test images -> %s' % (len(imset), resultfile))

    reader = TagReader(collection, tpp=tpp, rootpath=rootpath)

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")
    output = []
    done = 0

    for im in imset:
        tags = reader.get(im)
        tagSet = set()
        tagSeq = []
        for tag in str.split(tags):
            if tag not in tagSet:
                tagSeq.append(tag)
                tagSet.add(tag)
        assert (len(tagSeq) == len(tagSet))

        nr_tags = len(tagSeq)
        tagvotes = [(tagSeq[i], 1.0 - float(i) / nr_tags)
                    for i in range(nr_tags)]
        newline = "%s %s" % (im, " ".join(
            ["%s %g" % (x[0], x[1]) for x in tagvotes]))
        output.append(newline + "\n")
        done += 1

        if len(output) % 1e4 == 0:
            printStatus(
                INFO, '%d %s %s' % (done, im, ' '.join(
                    ['%s:%g' % (x[0], x[1]) for x in tagvotes[:3]])))
            fw.write("".join(output))
            fw.flush()
            output = []

    if output:
        fw.write("".join(output))
    fw.close()
    printStatus(INFO, 'done')
Example No. 22
def submit(searchers, collection,annotationName, rootpath=ROOT_PATH, overwrite=0):
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_runs = len(searchers)

    for concept in concepts:
        for j in range(nr_of_runs):
            resultfile = os.path.join(searchers[j].getOutputdir(), concept + ".txt")
            if checkToSkip(resultfile, overwrite):
                continue
            searchresults = searchers[j].scoreCollection(concept)
            print ("%s: %s %d -> %s" % (searchers[j].name, concept, len(searchresults), resultfile))
            writeRankingResults(searchresults, resultfile)

    printStatus('%s.submit'%os.path.basename(__file__), "done")
Example No. 23
def process(options, testCollection, annotationName, tagvotefile):
    rootpath = options.rootpath
    tpp = options.tpp
    tagged = options.tagged
    overwrite = options.overwrite

    resultdir = generate_result_dir(options, testCollection, tagvotefile)
    
    concepts = readConcepts(testCollection, annotationName, rootpath)
    todo = []
    for concept in concepts:
        resfile = os.path.join(resultdir, '%s.txt'%concept)
        if checkToSkip(resfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        print ('nothing to do')
        return 0

    nr_of_concepts = len(todo)
    labeled_set = [None] * nr_of_concepts
    if tagged:
        for i in range(nr_of_concepts):
            labeled_set[i] = set(readLabeledImageSet(testCollection, todo[i], tpp, rootpath))
        
    concept2index = dict(zip(todo, range(nr_of_concepts)))
    ranklists = [[] for i in range(nr_of_concepts)]

    for line in open(tagvotefile):
        elems = line.strip().split()
        imageid = elems[0]
        del elems[0]
        assert(len(elems)%2==0)

        for i in range(0, len(elems), 2):
            tag = elems[i]
            c = concept2index.get(tag, -1)
            if c >= 0:
                if tagged and imageid not in labeled_set[c]:
                    continue
                score = float(elems[i+1])
                ranklists[c].append((imageid,score))

    for i in range(nr_of_concepts):
        concept = todo[i]
        resfile = os.path.join(resultdir, '%s.txt'%concept)
        ranklist = sorted(ranklists[i], key=lambda v:v[1], reverse=True)
        print ('%s %d -> %s' % (concept, len(ranklist), resfile))
        writeRankingResults(ranklist, resfile)
Example No. 24
def process(options, label_file, label2vec_dir, testCollection, feature, new_feature):
    rootpath = options.rootpath
    overwrite = options.overwrite
    k = options.k
    blocksize = options.blocksize
    subset = options.subset if options.subset else testCollection

    resfile = os.path.join(rootpath, testCollection, 'FeatureData', new_feature, 'id.feature.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    imsetfile = os.path.join(rootpath, testCollection, 'ImageSets', '%s.txt' % subset)
    imset = map(str.strip, open(imsetfile).readlines())
    printStatus(INFO, '%d images to do' % len(imset))

    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))

    im2vec = Image2Vec(label_file, label2vec_dir)


    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    read_time = 0
    run_time = 0
    start = 0
    done = 0

    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

        s_time = time.time()
        renamed, test_X = feat_file.read(imset[start:end])
        read_time += time.time() - s_time
        
        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            vec = im2vec.embedding(test_X[i], k)
            output[i] = '%s %s\n' % (renamed[i], " ".join([niceNumber(x,6) for x in vec]))
        run_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        done += len(output)

    # done    
    printStatus(INFO, "%d done. read time %g seconds, run_time %g seconds" % (done, read_time, run_time))
    fw.close()
    return done
Example No. 25
def process(options, pklfile, hdf5file):
    if checkToSkip(hdf5file, options.overwrite):
        return 0

    printStatus(INFO, 'Loading pkl file %s' % pklfile)
    with open(pklfile, 'r') as f:
        data = pkl.load(f)
    printStatus(INFO, 'Found %d elements.' % len(data))

    printStatus(INFO, 'Saving hdf5 file %s' % hdf5file)
    with h5py.File(hdf5file, 'w') as f:
        for k, v in data.items():
            printStatus(INFO, 'Dumping %s' % k)
            f[k] = v

    printStatus(INFO, 'Done.')
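Reading the resulting HDF5 file back with h5py is symmetric to the dump above (sketch; 'scores.h5' is a hypothetical file name):

import h5py

with h5py.File('scores.h5', 'r') as f:
    for key in f.keys():
        print(key, f[key].shape)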
Example No. 26
def process(options, pklfile, hdf5file):
    if checkToSkip(hdf5file, options.overwrite):
        return 0

    printStatus(INFO, 'Loading pkl file %s' % pklfile)
    with open(pklfile, 'r') as f:
        data = pkl.load(f)
    printStatus(INFO, 'Found %d elements.' % len(data))

    printStatus(INFO, 'Saving hdf5 file %s' % hdf5file)
    with h5py.File(hdf5file,'w') as f:
        for k,v in data.items():
            printStatus(INFO, 'Dumping %s' % k)
            f[k] = v

    printStatus(INFO, 'Done.')
Example No. 27
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite
    random = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict(zip(concepts, xrange(len(concepts))))
    with open(
            os.path.join(rootpath, workingCollection, 'TextData',
                         'id.userid.lemmtags.txt')) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            tags = tags.split()
            if len(tags) > 0:
                tags = [(tag2idx.get(x, -1), y)
                        for x, y in zip(tags, xrange(len(tags)))]
                # keep only tags that map to a known concept; an index of -1
                # would otherwise overwrite the last column of tagmatrix
                tags = [t for t in tags if t[0] >= 0]
                if tags:
                    idx = np.array([x[0] for x in tags])
                    vals = 1. / (1. + np.array([x[1] for x in tags]))
                    tagmatrix[cnt, idx] = vals

            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if random:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(
            tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': map(int, id_images),
                'scores': tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
Example No. 28
def process(options, collection, text_style, threshold):
    logger.info("processing %s ...", collection)
    rootpath = options.rootpath
    overwrite = options.overwrite
    threshold = int(threshold)
    lang = which_language(collection)

    input_file = os.path.join(rootpath, collection, 'TextData',
                              '%s.caption.txt' % collection)
    output_vocab_file = os.path.join(rootpath, collection,
                                     'TextData/vocabulary', text_style,
                                     'word_vocab_%d.txt' % threshold)
    output_vocab_counter_file = os.path.join(
        rootpath, collection, 'TextData/vocabulary', text_style,
        'word_vocab_counter_%d.txt' % threshold)

    if checkToSkip(output_vocab_file, overwrite):
        sys.exit(0)
    makedirsforfile(output_vocab_file)

    word2counter = {}
    for index, line in enumerate(open(input_file)):
        sid, sent = line.strip().split(" ", 1)
        if text_style == "bow":
            sent = clean_str(sent, lang)
        elif text_style == "bow_filterstop":
            sent = clean_str_filter_stop(sent, lang)
        if index == 0:
            logger.info(line.strip())
            logger.info('After processing: %s %s', sid, ' '.join(sent))
        for word in sent:
            word2counter[word] = word2counter.get(word, 0) + 1

    sorted_wordCounter = sorted(word2counter.iteritems(),
                                key=lambda a: a[1],
                                reverse=True)

    output_line_vocab = [x[0] for x in sorted_wordCounter if x[1] >= threshold]
    output_line_vocab_counter = [
        x[0] + ' ' + str(x[1]) for x in sorted_wordCounter if x[1] >= threshold
    ]

    open(output_vocab_file, 'w').write('\n'.join(output_line_vocab))
    open(output_vocab_counter_file,
         'w').write('\n'.join(output_line_vocab_counter))
    logger.info('A vocabulary of %d words has been built for %s',
                len(output_line_vocab), collection)
Example No. 29
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)    
    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':map(int, id_images), 'scores':tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example No. 30
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    inputfile = options.inputfile
    resultname = options.resultname

    result_file = os.path.join('result', resultname)
    if checkToSkip(result_file, overwrite):
        sys.exit(0)
    makedirsforfile(result_file)

    # input file of queries
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)

    num2file = {}
    num2file[0] = os.path.join(rootpath, collection, 'Annotations', 'Image', 'concepts%s.txt' % collection)
    method_count = 1
    for line in open(os.path.join('result',inputfile)).readlines():
        num2file[method_count] = line.strip()
        method_count +=1

    fout = open(result_file, "w")
    
    for qid in qid_list:
        name2feature = {}
        for fnum in xrange(method_count):
            data_file = os.path.join( num2file[fnum], '%s.txt' % qid)
            data = readAnnotations(data_file)
            data.sort(key=lambda v:v[0], reverse=True)
            names = [x[0] for x in data]
            labels = [x[1] for x in data]
            # print 'fnum %d' % fnum
            if fnum == 0:
                key_names = names
                for i in xrange(len(names)):
                    name2feature[names[i]] = [labels[i]]
            else:
                assert(checkSameList(key_names, names))
                for i in xrange(len(names)):
                    name2feature[names[i]].append(labels[i])
        for img in key_names:
            fout.write('%s ' % qid + img + ' ' + ' '.join(name2feature[img]) + '\n')  
    fout.close()

    print 'Combined results of the different methods have been written to %s' % result_file
Example No. 31
def process(options, feat_dir):
    newname = ''
    if options.ssr:
        newname = 'ssr'
    newname += 'l%d' % options.p
    resfile = os.path.join(feat_dir.rstrip('/\\') + newname, 'feature.bin')
    if checkToSkip(resfile, options.overwrite):
        return 0

    with open(os.path.join(feat_dir, 'shape.txt')) as fr:
        nr_of_images, feat_dim = map(int, fr.readline().strip().split())
        fr.close()
        
    offset = np.float32(1).nbytes * feat_dim
    res = array.array('f')
    
    fr = open(os.path.join(feat_dir,'feature.bin'), 'rb')
    makedirsforfile(resfile)
    fw = open(resfile, 'wb')
    print ('>>> writing results to %s' % resfile)
    

    for i in xrange(nr_of_images):
        res.fromfile(fr, feat_dim)
        vec = res
        if options.ssr:
            vec = [np.sign(x) * np.sqrt(abs(x)) for x in vec]
        if options.p == 1:
            Z = sum(abs(x) for x in vec) + 1e-9
        else:
            Z = np.sqrt(sum([x**2 for x in vec])) + 1e-9
        if i % 1e4 == 0:
            print ('image_%d, norm_%d=%g' % (i, options.p, Z))
        vec = [x/Z for x in vec]
        del res[:]
        vec = np.array(vec, dtype=np.float32)
        vec.tofile(fw)
    fr.close()
    fw.close()
    print ('>>> %d lines parsed' % nr_of_images)
    shutil.copyfile(os.path.join(feat_dir,'id.txt'), os.path.join(os.path.split(resfile)[0], 'id.txt'))
    
    shapefile = os.path.join(os.path.split(resfile)[0], 'shape.txt')
    with open(shapefile, 'w') as fw:
        fw.write('%d %d' % (nr_of_images, feat_dim))
        fw.close()
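The per-vector transform above (an optional signed square root followed by L1 or L2 normalisation) can be expressed more compactly with numpy; an equivalent sketch for a single vector:

import numpy as np

def normalize(vec, p=2, ssr=False, eps=1e-9):
    vec = np.asarray(vec, dtype=np.float32)
    if ssr:
        vec = np.sign(vec) * np.sqrt(np.abs(vec))   # signed square root
    if p == 1:
        Z = np.abs(vec).sum() + eps                 # L1 norm
    else:
        Z = np.sqrt((vec ** 2).sum()) + eps         # L2 norm
    return vec / Z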
Example No. 32
def process(options, collection):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    
    resultfile = os.path.join(rootpath, collection, "tagrel", collection, 'tagpos,%s'%tpp, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)    

    imset = readImageSet(collection, collection, rootpath)
    printStatus(INFO, 'working on %d test images -> %s' % (len(imset),resultfile))
    
    reader = TagReader(collection,tpp=tpp,rootpath=rootpath)   
    
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")
    output = []
    done = 0
    
    for im in imset:
        tags = reader.get(im)
        tagSet = set()
        tagSeq = []
        for tag in str.split(tags):
            if tag not in tagSet:
                tagSeq.append(tag)
                tagSet.add(tag)
        assert(len(tagSeq) == len(tagSet))
        
        nr_tags = len(tagSeq)
        tagvotes = [(tagSeq[i], 1.0-float(i)/nr_tags) for i in range(nr_tags)]
        newline = "%s %s" % (im, " ".join(["%s %g" % (x[0],x[1]) for x in tagvotes]))
        output.append(newline + "\n")
        done += 1
        
        if len(output)%1e4 == 0:
            printStatus(INFO, '%d %s %s' % (done,im,' '.join(['%s:%g' % (x[0],x[1]) for x in tagvotes[:3]] )))
            fw.write("".join(output))
            fw.flush()
            output = []
        
    if output:
        fw.write("".join(output))
    fw.close()
    printStatus(INFO, 'done')
Example No. 33
def process(options, collection, annotationName, runfile, newRunName):
    rootpath = options.rootpath
    overwrite = options.overwrite

    dataset = options.testset if options.testset else collection

    concepts = readConcepts(collection, annotationName, rootpath)
    simdir = os.path.join(rootpath, collection, "SimilarityIndex", dataset)

    data = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith("#")]
    models = []
    for line in data:
        weight, run = str.split(line)
        models.append((run, float(weight), 1))

    for concept in concepts:
        resultfile = os.path.join(simdir, newRunName, concept + ".txt")
        if checkToSkip(resultfile, overwrite):
            continue

        scorefile = os.path.join(simdir, models[0][0], concept + ".txt")
        if not os.path.exists(scorefile):
            print("%s does not exist. skip" % scorefile)
            continue

        ranklist = readRankingResults(scorefile)
        names = sorted([x[0] for x in ranklist])

        nr_of_images = len(names)
        name2index = dict(zip(names, range(nr_of_images)))

        print("%s %d" % (concept, nr_of_images))

        scoreTable = readImageScoreTable(concept, name2index, simdir, models, torank=options.torank)
        assert scoreTable.shape[1] == nr_of_images

        weights = [model[1] for model in models]

        scores = np.matrix(weights) * scoreTable
        scores = [float(scores[0, k]) for k in range(nr_of_images)]

        newranklist = [(names[i], scores[i]) for i in range(nr_of_images)]
        newranklist.sort(key=lambda v: (v[1], v[0]), reverse=True)

        writeRankingResults(newranklist, resultfile)
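The fusion step above is just a weighted sum of the per-run score tables. The same combination on a toy score table (sketch):

import numpy as np

weights = [0.7, 0.3]                        # one weight per run
scoreTable = np.array([[0.9, 0.1, 0.4],     # run 1: scores for 3 images
                       [0.2, 0.8, 0.5]])    # run 2: scores for the same 3 images
fused = np.dot(weights, scoreTable)         # one fused score per image
print(fused)                                # [0.69 0.31 0.43]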
Example No. 34
def process(options, collection, annotationName, runfile, newRunName):
    rootpath = options.rootpath
    overwrite = options.overwrite

    dataset = options.testset if options.testset else collection
    
    concepts = readConcepts(collection, annotationName, rootpath)
    simdir = os.path.join(rootpath, collection, "SimilarityIndex", dataset)

    data = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith("#")]
    models = []
    for line in data:
        weight,run = str.split(line)
        models.append((run, float(weight), 1))
    
    for concept in concepts:
        resultfile = os.path.join(simdir, newRunName, concept + ".txt")
        if checkToSkip(resultfile, overwrite):
            continue

        scorefile = os.path.join(simdir, models[0][0], concept + ".txt")
        if not os.path.exists(scorefile):
            print ("%s does not exist. skip" % scorefile)
            continue

        ranklist = readRankingResults(scorefile)
        names = sorted([x[0] for x in ranklist])

        nr_of_images = len(names)
        name2index = dict(zip(names, range(nr_of_images)))
   
        print ('%s %d' % (concept, nr_of_images))
        
        scoreTable = readImageScoreTable(concept, name2index, simdir, models, torank=options.torank)
        assert(scoreTable.shape[1] == nr_of_images)

        weights = [model[1] for model in models]

        scores = np.matrix(weights) * scoreTable
        scores = [float(scores[0,k]) for k in range(nr_of_images)]
  
        newranklist = [(names[i], scores[i]) for i in range(nr_of_images)]
        newranklist.sort(key=lambda v:(v[1],v[0]), reverse=True)
     
        writeRankingResults(newranklist, resultfile)
Example No. 35
def process(options, trainCollection, annotationName):
    rootpath = options.rootpath
    overwrite = options.overwrite
    
    resultfile = os.path.join(rootpath, trainCollection, 'TextData', 'tag.concept-rank.%s.pkl' % annotationName)
    if checkToSkip(resultfile, overwrite):
        return 0
        
    concepts = readConcepts(trainCollection, annotationName, rootpath)
    concept_num = len(concepts)
    concept2index = dict(zip(concepts, range(concept_num)))
    tcb = TagCooccurBase(trainCollection, rootpath=rootpath)
    tag_num = tcb.tag_num()
    DEFAULT_RANK = tag_num
    rank_matrix = np.zeros((tag_num, concept_num), dtype=np.int) + DEFAULT_RANK
    tag_list = []
    
    for i,u in enumerate(tcb.vob):
        ranklist = tcb.top_cooccur(u,-1)
        concept2rank = {}
        rank = [DEFAULT_RANK] * concept_num
        
        hit = 0
        for j,x in enumerate(ranklist):
            idx = concept2index.get(x[0], -1)
            if idx>=0:
                rank_matrix[i,idx] = j+1
                hit += 1
                if hit == concept_num:
                    break
        tag_list.append(u)
        
        if (i+1) % 1e4 == 0:
            printStatus(INFO, '%d done' % (i+1) )
    
    assert(len(tag_list) == tag_num)
    
    import cPickle as pickle
    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'tags':tag_list, 'concepts':concepts, 'rank_matrix':rank_matrix}, output, -1)
    output.close()
    printStatus(INFO, '%dx%d dumped to %s' % (tag_num, concept_num, resultfile))
Example No. 36
def process(options, collection):
    rootpath = options.rootpath
    tpp = options.tpp

    tagfile = os.path.join(rootpath, collection, "TextData",
                           "id.userid.%stags.txt" % tpp)
    resultfile = os.path.join(rootpath, collection, "TextData",
                              "%stag.userfreq.imagefreq.txt" % tpp)
    if checkToSkip(resultfile, options.overwrite):
        return 0

    printStatus(INFO, "parsing " + tagfile)

    tag2imfreq = {}
    tag2users = {}

    for line in open(tagfile):
        elems = str.split(line.strip())
        photoid = elems[0]
        userid = elems[1]
        tagset = set(elems[2:])

        for tag in tagset:
            tag2imfreq[tag] = tag2imfreq.get(tag, 0) + 1
            tag2users.setdefault(tag, []).append(userid)

    printStatus(INFO, "collecting user-freq and image-freq")
    results = []
    for tag, users in tag2users.iteritems():
        userfreq = len(set(users))
        imfreq = tag2imfreq[tag]
        results.append((tag, userfreq, imfreq))

    printStatus(INFO, "sorting in descending order (user-freq as primary key)")
    results.sort(key=lambda v: (v[1], v[2]), reverse=True)
    printStatus(INFO, "-> %s" % resultfile)

    with open(resultfile, 'w') as fw:
        fw.write(''.join([
            '%s %d %d\n' % (tag, userfreq, imfreq)
            for (tag, userfreq, imfreq) in results
        ]))
        fw.close()
Example No. 37
def process(options, feat_dir):
    resultfile = os.path.join(feat_dir, "minmax.txt")
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    nr_of_images, feat_dim = map(int, open(os.path.join(feat_dir, "shape.txt")).readline().split())
    min_vals = [1e6] * feat_dim
    max_vals = [-1e6] * feat_dim

    offset = np.float32(1).nbytes * feat_dim
    res = array.array("f")

    feat_file = os.path.join(feat_dir, "feature.bin")
    id_file = os.path.join(feat_dir, "id.txt")
    nr_of_images = len(open(id_file).readline().strip().split())
    printStatus(INFO, "parsing %s" % feat_file)
    fr = open(feat_file, "rb")

    s_time = time.time()

    for i in xrange(nr_of_images):
        res.fromfile(fr, feat_dim)
        vec = res
        for d in xrange(feat_dim):
            if vec[d] > max_vals[d]:
                max_vals[d] = vec[d]
            if vec[d] < min_vals[d]:
                min_vals[d] = vec[d]
        del res[:]
    fr.close()

    timecost = time.time() - s_time
    printStatus(
        INFO,
        "%g seconds to find min [%g,%g] and max [%g,%g]"
        % (timecost, min(min_vals), max(min_vals), min(max_vals), max(max_vals)),
    )

    with open(resultfile, "w") as f:
        f.write("%s\n" % " ".join(map(str, min_vals)))
        f.write("%s\n" % " ".join(map(str, max_vals)))
        f.close()
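When the whole feature file fits in memory, the per-dimension min/max scan above can be done in a single numpy pass. A sketch under that assumption (the feat_dir path is hypothetical):

import os
import numpy as np

feat_dir = 'FeatureData/myfeature'  # hypothetical feature directory

nr_of_images, feat_dim = map(int, open(os.path.join(feat_dir, 'shape.txt')).readline().split())
feats = np.fromfile(os.path.join(feat_dir, 'feature.bin'), dtype=np.float32)
feats = feats.reshape(nr_of_images, feat_dim)

min_vals = feats.min(axis=0)
max_vals = feats.max(axis=0)

with open(os.path.join(feat_dir, 'minmax.txt'), 'w') as f:
    f.write('%s\n' % ' '.join(map(str, min_vals)))
    f.write('%s\n' % ' '.join(map(str, max_vals)))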
Example No. 38
def submit(searchers,
           collection,
           annotationName,
           rootpath=ROOT_PATH,
           overwrite=0):
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_runs = len(searchers)

    for concept in concepts:
        for j in range(nr_of_runs):
            resultfile = os.path.join(searchers[j].getOutputdir(),
                                      concept + ".txt")
            if checkToSkip(resultfile, overwrite):
                continue
            searchresults = searchers[j].scoreCollection(concept)
            print("%s: %s %d -> %s" %
                  (searchers[j].name, concept, len(searchresults), resultfile))
            writeRankingResults(searchresults, resultfile)

    printStatus('%s.submit' % os.path.basename(__file__), "done")
Example No. 39
def process(options, collection, feature):
    rootpath = options.rootpath
    tpp = options.tpp
    k = 1000  # options.k
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite

    feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
    feat_file = BigFile(feat_dir)
    hitlists = buildHitLists(collection, tpp, rootpath)
    printStatus(INFO, "nr of tags: %d" % len(hitlists))

    vob = sorted(hitlists.keys())
    vob = [vob[i] for i in range(len(vob)) if i % numjobs == job - 1]
    printStatus(INFO, "working on %d-%d: %d tags" % (numjobs, job, len(vob)))

    for tag_idx, tag in enumerate(vob):
        resultdir = os.path.join(rootpath, collection, "FeatureIndex", feature, tag[:2], tag)
        binfile = os.path.join(resultdir, "feature.bin")
        if checkToSkip(binfile, overwrite):
            continue

        hitlist = hitlists[tag]
        hitlist = hitlist[:k]  # keep at most 1000 images per tag
        renamed, vecs = feat_file.read(hitlist)

        makedirsforfile(binfile)
        np.array(vecs).astype(np.float32).tofile(binfile)
        idfile = os.path.join(resultdir, "id.txt")
        fw = open(idfile, "w")
        fw.write(" ".join(renamed))
        fw.close()

        shapefile = os.path.join(resultdir, "shape.txt")
        fw = open(shapefile, "w")
        fw.write("%d %d" % (len(renamed), len(vecs[0])))
        fw.close()

        if tag_idx % 1000 == 0:
            printStatus(INFO, "%d - %s, %d images" % (tag_idx, tag, len(hitlist)))
Exemplo n.º 40
0
def process(options, collection, feature):
    rootpath = options.rootpath 
    tpp = options.tpp 
    k = 1000 #options.k
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite
    
    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
    hitlists = buildHitLists(collection, tpp, rootpath)
    printStatus(INFO, 'nr of tags: %d' % len(hitlists))
    
    vob = sorted(hitlists.keys())
    vob = [vob[i] for i in range(len(vob)) if i%numjobs == job-1]
    printStatus(INFO, 'working on %d-%d: %d tags' % (numjobs, job, len(vob)))
    
    for tag_idx,tag in enumerate(vob):
        resultdir = os.path.join(rootpath, collection, 'FeatureIndex', feature, tag[:2], tag)
        binfile = os.path.join(resultdir, 'feature.bin')
        if checkToSkip(binfile, overwrite):
            continue
            
        hitlist = hitlists[tag]
        hitlist = hitlist[:k] # keep at most 1000 images per tag
        renamed,vecs = feat_file.read(hitlist)
        
        makedirsforfile(binfile)
        np.array(vecs).astype(np.float32).tofile(binfile)
        idfile = os.path.join(resultdir, 'id.txt')
        fw = open(idfile, 'w')
        fw.write(' '.join(renamed))
        fw.close()
        
        shapefile = os.path.join(resultdir, 'shape.txt')
        fw = open(shapefile, 'w')
        fw.write('%d %d' % (len(renamed), len(vecs[0])))
        fw.close()
        
        if tag_idx % 1000 == 0:
            printStatus(INFO, '%d - %s, %d images' % (tag_idx, tag, len(hitlist)))
Exemplo n.º 41
0
def process(options, synset_file, synset_name):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding = options.embedding

    resdir = os.path.join(rootpath, 'synset2vec', synset_name,
                          '%s,%s,%s' % (corpus, word2vec_model, embedding))
    resfile = os.path.join(resdir, 'feature.bin')
    if checkToSkip(resfile, overwrite):
        return 0

    synsets = map(str.strip, open(synset_file).readlines())
    s2v = get_synset_encoder(embedding)(corpus,
                                        word2vec_model,
                                        rootpath=rootpath)
    makedirsforfile(resfile)

    good = []
    with open(resfile, 'wb') as fw:
        for i, wnid in enumerate(synsets):
            #if i % 1e3 == 0:
            #    printStatus(INFO, '%d done' % i)
            vec = s2v.embedding(wnid)

            if vec is not None:
                vec = np.array(vec, dtype=np.float32)
                vec.tofile(fw)
                good.append(wnid)

        printStatus(INFO, '%d done, %d okay' % ((i + 1), len(good)))

    with open(os.path.join(resdir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(good))

    with open(os.path.join(resdir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(good), s2v.get_feat_dim()))
Exemplo n.º 42
0
def process(options, feat_dir, imsetfile, result_dir):

    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    feat_file = BigFile(feat_dir)
    
    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0
  
    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
        toread = imset[start:end]
        if len(toread) == 0:
            break
        renamed, vectors = feat_file.read(toread)
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert(len(done) == len(set(done)))
    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(done))

    with open(os.path.join(result_dir,'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(done), feat_file.ndims))
    print '%d requested, %d obtained' % (len(imset), len(done))
Exemplo n.º 43
0
def process(options, feat_dir, imsetfile, result_dir):

    resultfile = os.path.join(result_dir, "feature.bin")
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    feat_file = BigFile(feat_dir)

    makedirsforfile(resultfile)
    fw = open(resultfile, "wb")

    done = []
    start = 0

    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, "processing images from %d to %d" % (start, end - 1))
        toread = imset[start:end]
        if len(toread) == 0:
            break
        renamed, vectors = feat_file.read(toread)
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert len(done) == len(set(done))
    with open(os.path.join(result_dir, "id.txt"), "w") as fw:
        fw.write(" ".join(done))
        fw.close()

    with open(os.path.join(result_dir, "shape.txt"), "w") as fw:
        fw.write("%d %d" % (len(done), feat_file.ndims))
        fw.close()
    print "%d requested, %d obtained" % (len(imset), len(done))
Exemplo n.º 44
0
def process(options, synset_file, synset_name):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding = options.embedding

    resdir = os.path.join(rootpath, 'synset2vec', synset_name, '%s,%s,%s' % (corpus, word2vec_model, embedding))
    resfile = os.path.join(resdir, 'feature.bin')
    if checkToSkip(resfile, overwrite):
        return 0

    synsets = map(str.strip, open(synset_file).readlines())
    s2v = get_synset_encoder(embedding)(corpus, word2vec_model, rootpath=rootpath)
    makedirsforfile(resfile)

    good = []
    with open(resfile, 'wb') as fw:
        for i,wnid in enumerate(synsets):
            #if i % 1e3 == 0:
            #    printStatus(INFO, '%d done' % i)
            vec = s2v.embedding(wnid)

            if vec is not None:
                vec = np.array(vec, dtype=np.float32)
                vec.tofile(fw)
                good.append(wnid)

        printStatus(INFO, '%d done, %d okay' % ((i+1), len(good)))

    with open(os.path.join(resdir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(good))

    with open(os.path.join(resdir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(good), s2v.get_feat_dim()))
Exemplo n.º 45
0
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL is a binary pickle protocol, so open the file in binary mode
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': map(int, id_images),
                'scores': tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
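A small sketch of how the resulting pickle could be inspected; the dictionary keys match what is dumped above, while the helper name and the printed summary are only illustrative.

import pickle
import numpy as np

def inspect_scores(pklfile):
    with open(pklfile, 'rb') as f:
        data = pickle.load(f)
    scores = np.asarray(data['scores'])  # shape: (nr_of_images, nr_of_concepts)
    print('%d images, %d concepts' % (len(data['id_images']), len(data['concepts'])))
    print('score range: [%g, %g]' % (scores.min(), scores.max()))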
Exemplo n.º 46
0
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite
    random = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict(zip(concepts, xrange(len(concepts))))
    with open(os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            tags = tags.split()
            if len(tags) > 0:
                # map each tag to its concept index and drop tags outside the concept
                # vocabulary (a -1 index would otherwise silently write into the last column)
                tags = [(tag2idx.get(x, -1), y) for x, y in zip(tags, xrange(len(tags)))]
                tags = [t for t in tags if t[0] >= 0]
                if tags:
                    idx = np.array([x[0] for x in tags])
                    vals = 1. / (1. + np.array([x[1] for x in tags]))
                    tagmatrix[cnt, idx] = vals

            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if random:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)    
    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':map(int, id_images), 'scores':tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Exemplo n.º 47
0
def process(options, collection):
    rootpath = options.rootpath
    tpp = options.tpp

    tagfile = os.path.join(rootpath, collection, "TextData", "id.userid.%stags.txt" % tpp)
    resultfile = os.path.join(rootpath, collection, "TextData", "%stag.userfreq.imagefreq.txt" % tpp)
    if checkToSkip(resultfile, options.overwrite):
        return 0
        
    printStatus(INFO, "parsing " + tagfile)
       
    tag2imfreq = {}
    tag2users = {}

    for line in open(tagfile):
        elems = str.split(line.strip())
        photoid = elems[0]
        userid = elems[1]
        tagset = set(elems[2:])
            
        for tag in tagset:
            tag2imfreq[tag] = tag2imfreq.get(tag, 0) + 1
            tag2users.setdefault(tag,[]).append(userid)
            
    printStatus(INFO, "collecting user-freq and image-freq")
    results = []
    for tag,users in tag2users.iteritems():
        userfreq = len(set(users))
        imfreq = tag2imfreq[tag]
        results.append((tag, userfreq, imfreq))
    
    printStatus(INFO, "sorting in descending order (user-freq as primary key)")
    results.sort(key=lambda v:(v[1],v[2]), reverse=True)
    printStatus(INFO, "-> %s" % resultfile)

    with open(resultfile, 'w') as fw:
        fw.write(''.join(['%s %d %d\n' % (tag, userfreq, imfreq) for (tag, userfreq, imfreq) in results]))
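As a usage sketch, the frequency file produced above can be filtered to build a tag vocabulary; the three-column line format matches the writer above, while the helper name and the thresholds are only illustrative.

def load_frequent_tags(freqfile, min_userfreq=10, min_imfreq=10):
    # each line: "<tag> <userfreq> <imfreq>", sorted by user frequency in descending order
    vocab = []
    for line in open(freqfile):
        tag, userfreq, imfreq = line.split()
        if int(userfreq) >= min_userfreq and int(imfreq) >= min_imfreq:
            vocab.append(tag)
    return vocab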
Exemplo n.º 48
0
def process(options, tagfile, tpp):
    if "stem" == tpp:
        worker = nltk.PorterStemmer()
        func = stemming
    else:
        worker = nltk.WordNetLemmatizer()
        func = lemmatize

    resultfile = os.path.join(
        os.path.split(tagfile)[0], 'id.userid.%stags.txt' % tpp)
    if checkToSkip(resultfile, options.overwrite):
        return 0

    makedirsforfile(resultfile)

    fw = codecs.open(resultfile, "w", encoding='utf8')
    parsed = 0
    obtained = 0
    for line in open(tagfile):
        elems = line.strip().split()
        parsed += 1
        if len(elems) > 2:
            newtags = []
            for tag in elems[2:]:
                try:
                    newtag = func(worker, tag.lower())
                except Exception:
                    # fall back to the raw tag if stemming / lemmatization fails
                    newtag = tag

                newtags.append(newtag.decode('utf-8'))

            newline = "\t".join([elems[0], elems[1], " ".join(newtags)])
            fw.write('%s\n' % newline)
            obtained += 1
    fw.close()
    print('%d lines parsed, %d records obtained' % (parsed, obtained))
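The helpers stemming and lemmatize are used above but not defined in this excerpt. A plausible minimal sketch, assuming they simply delegate to the NLTK worker passed in (the real implementations may differ):

import nltk

def stemming(worker, tag):
    # worker is expected to be an nltk.PorterStemmer instance
    return worker.stem(tag)

def lemmatize(worker, tag):
    # worker is expected to be an nltk.WordNetLemmatizer instance
    return worker.lemmatize(tag)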
Exemplo n.º 49
0
def process(options, inputfile, resultfile):
    assert(inputfile.endswith('.pkl'))
    #resultfile = inputfile[:-4] + '_rank.pkl'
    
    if checkToSkip(resultfile, options.overwrite):
        return 0
    
    import cPickle as pickle
    data = pickle.load(open(inputfile,'rb'))
    scores = data['scores']
    id_images = data['id_images']
    concepts = data['concepts']
    nr_of_images = len(id_images)
    nr_of_concepts = len(concepts)
    
    assert(scores.shape[0] == nr_of_images)
    assert(scores.shape[1] == nr_of_concepts)
 
    DEFAULT_RANK = nr_of_concepts 
    rank_matrix = np.zeros((nr_of_images, nr_of_concepts), dtype=int) + DEFAULT_RANK
    
    for i in xrange(nr_of_images):
        sorted_index = np.argsort(scores[i,:]) # in ascending order
        for j in range(nr_of_concepts):
            c_idx = sorted_index[j]
            rank = nr_of_concepts - j
            rank_matrix[i, c_idx] = rank
    
        if (i+1) % 100000 == 0:
            printStatus(INFO, '%d done' % (i+1) )
    
    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'id_images':id_images, 'concepts':concepts, 'rank_matrix':rank_matrix}, output, -1)
    output.close()
    printStatus(INFO, '%dx%d dumped to %s' % (nr_of_images, nr_of_concepts, resultfile))
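The double loop above can also be expressed with two argsort calls; the sketch below only illustrates the same rank convention (rank 1 for the highest score, nr_of_concepts for the lowest) and is not a drop-in replacement for the original script.

import numpy as np

def scores_to_ranks(scores):
    # scores: (nr_of_images, nr_of_concepts)
    nr_of_concepts = scores.shape[1]
    # position of each concept when the row is sorted in ascending order (0 = lowest score)
    ascending_pos = np.argsort(np.argsort(scores, axis=1), axis=1)
    # highest score -> rank 1, lowest score -> rank nr_of_concepts
    return nr_of_concepts - ascending_pos

# e.g. scores_to_ranks(np.array([[0.1, 0.9, 0.5]])) -> [[3, 1, 2]]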
Exemplo n.º 50
0
def main():
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    trainCollection = opt.trainCollection
    valCollection = opt.valCollection
    testCollection = opt.testCollection

    if opt.loss_fun == "mrl" and opt.measure == "cosine":
        assert opt.text_norm is True
        assert opt.visual_norm is True

    # checkpoint path
    model_info = '%s_concate_%s_dp_%.1f_measure_%s' % (
        opt.model, opt.concate, opt.dropout, opt.measure)
    # text-side multi-level encoding info
    text_encode_info = 'vocab_%s_word_dim_%s_text_rnn_size_%s_text_norm_%s' % \
            (opt.vocab, opt.word_dim, opt.text_rnn_size, opt.text_norm)
    text_encode_info += "_kernel_sizes_%s_num_%s" % (opt.text_kernel_sizes,
                                                     opt.text_kernel_num)
    # video-side multi-level encoding info
    visual_encode_info = 'visual_feature_%s_visual_rnn_size_%d_visual_norm_%s' % \
            (opt.visual_feature, opt.visual_rnn_size, opt.visual_norm)
    visual_encode_info += "_kernel_sizes_%s_num_%s" % (opt.visual_kernel_sizes,
                                                       opt.visual_kernel_num)
    # common space learning info
    mapping_info = "mapping_text_%s_img_%s" % (opt.text_mapping_layers,
                                               opt.visual_mapping_layers)
    loss_info = 'loss_func_%s_margin_%s_direction_%s_max_violation_%s_cost_style_%s' % \
                    (opt.loss_fun, opt.margin, opt.direction, opt.max_violation, opt.cost_style)
    optimizer_info = 'optimizer_%s_lr_%s_decay_%.2f_grad_clip_%.1f_val_metric_%s' % \
                    (opt.optimizer, opt.learning_rate, opt.lr_decay_rate, opt.grad_clip, opt.val_metric)

    opt.logger_name = os.path.join(rootpath, trainCollection, opt.cv_name,
                                   valCollection, model_info, text_encode_info,
                                   visual_encode_info, mapping_info, loss_info,
                                   optimizer_info, opt.postfix)
    print(opt.logger_name)

    if checkToSkip(os.path.join(opt.logger_name, 'model_best.pth.tar'),
                   opt.overwrite):
        sys.exit(0)
    if checkToSkip(os.path.join(opt.logger_name, 'val_metric.txt'),
                   opt.overwrite):
        sys.exit(0)
    makedirsforfile(os.path.join(opt.logger_name, 'val_metric.txt'))
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    opt.text_kernel_sizes = map(int, opt.text_kernel_sizes.split('-'))
    opt.visual_kernel_sizes = map(int, opt.visual_kernel_sizes.split('-'))
    # collections: train, val
    collections = {'train': trainCollection, 'val': valCollection}
    cap_file = {
        'train': '%s.caption.txt' % trainCollection,
        'val': '%s.caption.txt' % valCollection
    }
    # caption
    caption_files = {
        x: os.path.join(rootpath, collections[x], 'TextData', cap_file[x])
        for x in collections
    }
    # Load visual features
    visual_feat_path = {
        x: os.path.join(rootpath, collections[x], 'FeatureData',
                        opt.visual_feature)
        for x in collections
    }
    visual_feats = {x: BigFile(visual_feat_path[x]) for x in visual_feat_path}
    opt.visual_feat_dim = visual_feats['train'].ndims

    # set bow vocabulary and encoding
    bow_vocab_file = os.path.join(rootpath, opt.trainCollection, 'TextData',
                                  'vocabulary', 'bow', opt.vocab + '.pkl')
    bow_vocab = pickle.load(open(bow_vocab_file, 'rb'))
    bow2vec = get_text_encoder('bow')(bow_vocab)
    opt.bow_vocab_size = len(bow_vocab)

    # set rnn vocabulary
    rnn_vocab_file = os.path.join(rootpath, opt.trainCollection, 'TextData',
                                  'vocabulary', 'rnn', opt.vocab + '.pkl')
    rnn_vocab = pickle.load(open(rnn_vocab_file, 'rb'))
    opt.vocab_size = len(rnn_vocab)

    # initialize word embedding
    opt.we_parameter = None
    if opt.word_dim == 500:
        w2v_data_path = os.path.join(rootpath, "word2vec", 'flickr',
                                     'vec500flickr30m')
        opt.we_parameter = get_we_parameter(rnn_vocab, w2v_data_path)

    # mapping layer structure
    opt.text_mapping_layers = map(int, opt.text_mapping_layers.split('-'))
    opt.visual_mapping_layers = map(int, opt.visual_mapping_layers.split('-'))
    if opt.concate == 'full':
        opt.text_mapping_layers[
            0] = opt.bow_vocab_size + opt.text_rnn_size * 2 + opt.text_kernel_num * len(
                opt.text_kernel_sizes)
        opt.visual_mapping_layers[
            0] = opt.visual_feat_dim + opt.visual_rnn_size * 2 + opt.visual_kernel_num * len(
                opt.visual_kernel_sizes)
    elif opt.concate == 'reduced':
        opt.text_mapping_layers[
            0] = opt.text_rnn_size * 2 + opt.text_kernel_num * len(
                opt.text_kernel_sizes)
        opt.visual_mapping_layers[
            0] = opt.visual_rnn_size * 2 + opt.visual_kernel_num * len(
                opt.visual_kernel_sizes)
    else:
        raise NotImplementedError('Model %s not implemented' % opt.model)

    # set data loader
    video2frames = {
        x: read_dict(
            os.path.join(rootpath, collections[x], 'FeatureData',
                         opt.visual_feature, 'video2frames.txt'))
        for x in collections
    }
    data_loaders = data.get_data_loaders(caption_files,
                                         visual_feats,
                                         rnn_vocab,
                                         bow2vec,
                                         opt.batch_size,
                                         opt.workers,
                                         opt.n_caption,
                                         video2frames=video2frames)

    # Construct the model
    model = get_model(opt.model)(opt)
    opt.we_parameter = None

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters lets the log counter continue from the previous training run
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, data_loaders['val'], model, measure=opt.measure)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    no_impr_counter = 0
    lr_counter = 0
    best_epoch = None
    fout_val_metric_hist = open(
        os.path.join(opt.logger_name, 'val_metric_hist.txt'), 'w')
    for epoch in range(opt.num_epochs):
        print('Epoch[{0} / {1}] LR: {2}'.format(
            epoch, opt.num_epochs,
            get_learning_rate(model.optimizer)[0]))
        print('-' * 10)
        # train for one epoch
        train(opt, data_loaders['train'], model, epoch)

        # evaluate on validation set
        rsum = validate(opt, data_loaders['val'], model, measure=opt.measure)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        print(' * Current perf: {}'.format(rsum))
        print(' * Best perf: {}'.format(best_rsum))
        print('')
        fout_val_metric_hist.write('epoch_%d: %f\n' % (epoch, rsum))
        fout_val_metric_hist.flush()

        if is_best:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': model.state_dict(),
                    'best_rsum': best_rsum,
                    'opt': opt,
                    'Eiters': model.Eiters,
                },
                is_best,
                filename='checkpoint_epoch_%s.pth.tar' % epoch,
                prefix=opt.logger_name + '/',
                best_epoch=best_epoch)
            best_epoch = epoch

        lr_counter += 1
        decay_learning_rate(opt, model.optimizer, opt.lr_decay_rate)
        if not is_best:
            # Early stop occurs if the validation performance does not improve in ten consecutive epochs
            no_impr_counter += 1
            if no_impr_counter > 10:
                print('Early stopping happened.\n')
                break

            # When the validation performance does not improve after an epoch,
            # we halve the learning rate and continue training,
            # but each learning rate is kept for at least 3 epochs.
            if lr_counter > 2:
                decay_learning_rate(opt, model.optimizer, 0.5)
                lr_counter = 0
        else:
            no_impr_counter = 0

    fout_val_metric_hist.close()

    print('best performance on validation: {}\n'.format(best_rsum))
    with open(os.path.join(opt.logger_name, 'val_metric.txt'), 'w') as fout:
        fout.write('best performance on validation: ' + str(best_rsum))

    # generate evaluation shell script
    if testCollection == 'iacc.3':
        template = ''.join(open('util/TEMPLATE_do_predict.sh').readlines())
        scriptStr = template.replace('@@@query_sets@@@',
                                     'tv16.avs.txt,tv17.avs.txt,tv18.avs.txt')
    else:
        template = ''.join(open('util/TEMPLATE_do_test.sh').readlines())
        scriptStr = template.replace('@@@n_caption@@@', str(opt.n_caption))
    scriptStr = scriptStr.replace('@@@rootpath@@@', rootpath)
    scriptStr = scriptStr.replace('@@@testCollection@@@', testCollection)
    scriptStr = scriptStr.replace('@@@logger_name@@@', opt.logger_name)
    scriptStr = scriptStr.replace('@@@overwrite@@@', str(opt.overwrite))

    # perform evaluation on test set
    runfile = 'do_test_%s_%s.sh' % (opt.model, testCollection)
    open(runfile, 'w').write(scriptStr + '\n')
    os.system('chmod +x %s' % runfile)
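get_learning_rate and decay_learning_rate are called above but not defined in this excerpt. A minimal sketch of what they might look like for a PyTorch optimizer follows; this is an assumption about their behaviour, not the project's actual implementation.

def get_learning_rate(optimizer):
    # current learning rate of every parameter group
    return [group['lr'] for group in optimizer.param_groups]

def decay_learning_rate(opt, optimizer, decay):
    # opt is kept only to mirror the call sites above; it is not used here
    # multiply the learning rate of every parameter group by the decay factor
    for group in optimizer.param_groups:
        group['lr'] *= decay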
Exemplo n.º 51
0
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex',
                                       collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method,
                                   feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations',
                                  'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(
            collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(
                calParzen(img_feats.read_one(imidx), test_X, sigma))

        # parzen_list_suffle = calParzen_fast(test_X, len(renamed), sigma)
        # parzen_list = []
        # for imidx in iid_list:
        #     parzen_list.append(parzen_list_suffle[renamed.index(imidx)])

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list),
                              key=lambda v: v[2],
                              reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) /
                                  len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
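calParzen is used above but not defined in this excerpt. Read as a Gaussian Parzen-window density estimate of a query image against the candidate set, a minimal stand-in could look like the sketch below (an assumption, not the project's actual implementation):

import numpy as np

def cal_parzen_gaussian(query_vec, X, sigma):
    # hypothetical stand-in for calParzen: mean Gaussian kernel value between
    # the query vector and every row of X (larger value = denser neighbourhood)
    query_vec = np.asarray(query_vec, dtype=np.float64)
    X = np.asarray(X, dtype=np.float64)
    sq_dist = np.sum((X - query_vec) ** 2, axis=1)
    return float(np.mean(np.exp(-sq_dist / (2.0 * sigma ** 2))))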
Exemplo n.º 52
0
    nr_neg_bags = cmdOpts.getInt('nr_neg_bags')
    nr_neg = nr_pos * neg_pos_ratio

    concepts = readConcepts(collection, annotationName)
    annotationNameStr = generate_new_annotation_template(cmdOpts)

    nr_skipped = 0
    newAnnotationNames = [None] * (nr_pos_bags * nr_neg_bags)

    for idxp in range(nr_pos_bags):
        for idxn in range(nr_neg_bags):
            anno_idx = idxp * nr_neg_bags + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath, collection, 'Annotations',
                                      newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, overwrite):
                nr_skipped += 1
                continue
            writeConcepts(concepts, resultfile)

    first, second, last = annotationNameStr.split('%d')
    scriptfile = os.path.join(
        rootpath, collection, 'annotationfiles', first + '0-%d' %
        (nr_pos_bags - 1) + second + '0-%d' % (nr_neg_bags - 1) + last)
    makedirsforfile(scriptfile)
    fout = open(scriptfile, 'w')
    fout.write('\n'.join(newAnnotationNames) + '\n')
    fout.close()

    if nr_skipped == (nr_pos_bags * nr_neg_bags):
        sys.exit(0)
Exemplo n.º 53
0
    freq_threshold = int(sys.argv[3])
    overwrite = int(sys.argv[4])

    for collection in collection_list:
        print "processing %s ..." % collection

        input_file = os.path.join(
            rootpath, "%s/TextData/%s.caption.txt" % (collection, collection))
        output_vocab_file = os.path.join(
            rootpath, "%s/TextData/vocabulary/%s/word_vocab_%d.txt" %
            (collection, text_style, freq_threshold))
        output_vocab_counter_file = os.path.join(
            rootpath, "%s/TextData/vocabulary/%s/word_vocab_counter_%d.txt" %
            (collection, text_style, freq_threshold))

        if checkToSkip(output_vocab_file, overwrite):
            sys.exit(0)
        if checkToSkip(output_vocab_counter_file, overwrite):
            sys.exit(0)
        makedirsforfile(output_vocab_file)

        word2counter = {}
        len2counter = {}
        for index, line in enumerate(open(input_file)):
            sid, sent = line.strip().split(" ", 1)
            if text_style == "bow":
                sent = clean_str(sent)
            elif text_style == "bow_filterstop":
                sent = clean_str_filter_stop(sent)
            length = len(sent)
            len2counter[length] = len2counter.get(length, 0) + 1
Exemplo n.º 54
0
def process(options, trainCollection, trainAnnotationName, valCollection,
            valAnnotationName, feature, modelName):
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    valConcepts = readConcepts(valCollection,
                               valAnnotationName,
                               rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(
        rootpath, trainCollection, 'Models', trainAnnotationName,
        '%s,best_params' % modelName,
        '%s,%s,%s' % (valCollection, valAnnotationName, feature))
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(
        os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        # count positives / negatives to derive class weights (avoid shadowing the numpy alias np)
        num_pos = len([1 for lab in labels if 1 == lab])
        num_neg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (num_pos + num_neg) / num_pos
        wn = (1.0 - beta) * (num_pos + num_neg) / num_neg

        names, labels = readAnnotationsFrom(valCollection,
                                            valAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
            #print modelName, '>'*20, svm_params
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i]))
                        for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(
            INFO,
            '%s -> worstAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' %
            (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        fw = open(resultfile, 'w')
        fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
        fw.close()
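sigmoid_train above fits Platt-scaling parameters (A, B) on the validation ranking. For completeness, the standard way such parameters are applied to turn a raw SVM score into a probability is sketched below; whether this matches the library's exact sign convention is an assumption.

import math

def sigmoid_predict(score, A, B):
    # standard Platt scaling: P(y=1 | score) = 1 / (1 + exp(A * score + B)),
    # written in a numerically stable form
    fApB = A * score + B
    if fApB >= 0:
        return math.exp(-fApB) / (1.0 + math.exp(-fApB))
    return 1.0 / (1.0 + math.exp(fApB))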