Example #1
def find_best_beta(labels, features, cv, scorer, min_vals, max_vals):
    dim = len(features[0])
    beta_set = [x/10.0 for x in range(1,10)]
    perfs = [[] for i in range(len(beta_set))]

    n = len(labels)

    positive_index = [i for i in range(n) if  1 == labels[i]]
    negative_index = [i for i in range(n) if -1 == labels[i]]
    num_positive = len(positive_index)
    num_negative = len(negative_index)

    if num_positive < cv:
        message = "[find_best_beta] %d positive examples, insufficient for %d-fold cross-validation" % (len(positive_index), cv)
        raise Exception(message)

    for folder in range(cv):
        print ("[find_best_beta] %d <- %s" % (folder, "-".join(map(str, [i for i in range(cv) if i!=folder]))))

        labels_val = [1 for i in range(num_positive) if i%cv == folder] + [-1 for i in range(num_negative) if i%cv == folder]
        features_val = ([features[positive_index[i]] for i in range(num_positive) if i%cv == folder] +
                        [features[negative_index[i]] for i in range(num_negative) if i%cv == folder])

        labels_train = [1 for i in range(num_positive) if i%cv != folder] + [-1 for i in range(num_negative) if i%cv != folder]
        features_train = ([features[positive_index[i]] for i in range(num_positive) if i%cv != folder] +
                          [features[negative_index[i]] for i in range(num_negative) if i%cv != folder])

        assert(len(labels_val) == len(features_val))
        assert(len(labels_train) == len(features_train))
        assert((len(labels_val)+len(labels_train)) == n)

        for index,beta in enumerate(beta_set):
            model = hiksvm_train(labels_train, features_train, beta=beta)
            #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50)
            fikmodel = svm_to_fiksvm([model], 1, [1.0], feat_dim=dim, min_vals=min_vals, max_vals=max_vals, num_bins=50)
            results = [(labels_val[i], fikmodel.predict(features_val[i])) for i in range(len(labels_val))]
            results.sort(key=lambda v:(v[1]), reverse=True)
            sorted_labels = [x[0] for x in results]
            perf = scorer.score(sorted_labels)
            print "[find_best_beta] folder %d, beta %g -> %s=%g" % (folder, beta, scorer.name(), perf)
            perfs[index].append(perf)

    ranklist = [(beta_set[index], np.mean(perfs[index])) for index in range(len(beta_set))]
    ranklist.sort(key=lambda v:(v[1]), reverse=True)
    print "[find_best_beta] done", ranklist
    best_beta = ranklist[0][0]
    cv_score = ranklist[0][1]
    return best_beta, cv_score
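The fold assignment above is worth spelling out: positive and negative examples are split independently, and example i of each class lands in validation fold i % cv, so every fold keeps roughly the original class ratio. A minimal, self-contained sketch of that splitting rule (toy labels only, not part of hiksvm.py):

# Sketch: per-class round-robin fold split, mirroring the i % cv rule above.
# Toy labels; in the real code these indices point into `features`.
labels = [1, 1, 1, 1, -1, -1, -1, -1, -1]
cv = 3

pos = [i for i, y in enumerate(labels) if y == 1]
neg = [i for i, y in enumerate(labels) if y == -1]

for fold in range(cv):
    val_idx = ([pos[i] for i in range(len(pos)) if i % cv == fold] +
               [neg[i] for i in range(len(neg)) if i % cv == fold])
    train_idx = ([pos[i] for i in range(len(pos)) if i % cv != fold] +
                 [neg[i] for i in range(len(neg)) if i % cv != fold])
    assert len(val_idx) + len(train_idx) == len(labels)
    print("fold %d: val=%s train=%s" % (fold, val_idx, train_idx))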
Example #2
def process(options, trainCollection, trainAnnotationName, feature):
    import re
    p = re.compile(
        r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5

    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             trainAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}

    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = map(float, str.split(f.readline()))
        params['max_vals'] = map(float, str.split(f.readline()))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
        model = svm_train(
            y, vectors,
            svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        fiksvm_load_model(model_file_name)
        assert (abs(newmodel.get_probAB()[0] - A) < 1e-6)
        assert (abs(newmodel.get_probAB()[1] - B) < 1e-6)

    return len(todo)
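The wp/wn weights above implement a standard class-balancing scheme: with beta = 0.5 each class contributes the same total weight to the objective regardless of its size, and the weights are multiplied by C and handed to libsvm through the -w1/-w-1 options. A standalone arithmetic sketch with made-up counts (the local variable np from the code is renamed np_ here to avoid shadowing numpy):

# Sketch: class-weight computation used above (toy counts, beta = 0.5).
beta = 0.5
C = 1.0
np_ = 20     # number of positive examples (called `np` in the code above)
nn = 180     # number of negative examples

wp = beta * (np_ + nn) / np_          # weight applied to label +1
wn = (1.0 - beta) * (np_ + nn) / nn   # weight applied to label -1

svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
print(svm_params)   # -> -w1 5 -w-1 0.555556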
Example #3
    with open(minmax_file, 'r') as f:
        min_vals = map(float, str.split(f.readline()))
        max_vals = map(float, str.split(f.readline()))


    [names,labels] = readAnnotationsFrom(collection=trainCollection, annotationName=trainAnnotationName, concept=targetConcept, rootpath=rootpath)
    name2label = dict(zip(names,labels))
    (renamed, vectors) = train_feat_file.read(names)
    relabeled = [name2label[x] for x in renamed] #label is either 1 or -1
    
    [names,labels] = readAnnotationsFrom(collection=testCollection, annotationName=testAnnotationName, concept=targetConcept, rootpath=rootpath)
    test2label = dict(zip(names,labels))
    

    for beta in [0.5]: #[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        #model = hiksvm_train(relabeled, vectors, beta=beta)
        cv = 3
        best_beta, cv_score, model = hiksvm_train_cv(relabeled, vectors, cv, scorer, min_vals, max_vals)
        print best_beta, cv_score
        #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50)
        fikmodel = svm_to_fiksvm([model], 1, [1.0], feat_dim=feat_dim, min_vals=min_vals, max_vals=max_vals, num_bins=50)
        
        results = classifyLargeData(fikmodel, testImageSet, test_feat_file, prob_output=True)
        print results[:5]

        sorted_labels = [test2label[x] for x,y in results]
        score = scorer.score(sorted_labels)
        print "beta", beta, "AP", score
  
            
Example #4
def process(options, trainCollection, trainAnnotationName, feature):
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5
    
    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'
    
    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0
    
    feat_dir = os.path.join(rootpath,trainCollection,'FeatureData',feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}

    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = map(float, str.split(f.readline()))
        params['max_vals'] = map(float, str.split(f.readline()))
        
    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))
        
        model_file_name = os.path.join(resultdir, concept + '.model')
        
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn
    
        svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C) 
        model = svm_train(y, vectors, svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s'%model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        fiksvm_load_model(model_file_name)
        assert(abs(newmodel.get_probAB()[0]-A)<1e-6)
        assert(abs(newmodel.get_probAB()[1]-B)<1e-6)

    return len(todo)
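The A and B values parsed from the tuned-parameter files and stored with set_probAB look like Platt-scaling sigmoid parameters in the libsvm convention, where a decision value d is mapped to a probability as 1 / (1 + exp(A*d + B)). That fiksvm applies exactly this formula is an assumption here; the sketch below only illustrates the mapping itself.

import math

# Sketch: Platt-style sigmoid mapping (assumed to be what set_probAB(A, B)
# parameterizes; the exact convention inside fiksvm is not shown above).
def decision_to_probability(d, A, B):
    return 1.0 / (1.0 + math.exp(A * d + B))

# With A < 0, larger decision values give larger probabilities.
print(decision_to_probability(1.5, A=-2.0, B=0.0))    # ~0.95
print(decision_to_probability(-1.5, A=-2.0, B=0.0))   # ~0.05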
Example #5
File: hiksvm.py  Project: xiaojiew1/KDGAN
def find_best_beta(labels, features, cv, scorer, min_vals, max_vals):
    dim = len(features[0])
    beta_set = [x / 10.0 for x in range(1, 10)]
    perfs = [[] for i in range(len(beta_set))]

    n = len(labels)

    positive_index = [i for i in range(n) if 1 == labels[i]]
    negative_index = [i for i in range(n) if -1 == labels[i]]
    num_positive = len(positive_index)
    num_negative = len(negative_index)

    if num_positive < cv:
        message = "[find_best_beta] %d positive examples, insufficient for %d-fold cross-validation" % (
            len(positive_index), cv)
        raise Exception(message)

    for folder in range(cv):
        print(
            "[find_best_beta] %d <- %s" %
            (folder, "-".join(map(str, [i
                                        for i in range(cv) if i != folder]))))

        labels_val = [1 for i in range(num_positive) if i % cv == folder] + [
            -1 for i in range(num_negative) if i % cv == folder
        ]
        features_val = ([
            features[positive_index[i]]
            for i in range(num_positive) if i % cv == folder
        ] + [
            features[negative_index[i]]
            for i in range(num_negative) if i % cv == folder
        ])

        labels_train = [1 for i in range(num_positive) if i % cv != folder] + [
            -1 for i in range(num_negative) if i % cv != folder
        ]
        features_train = ([
            features[positive_index[i]]
            for i in range(num_positive) if i % cv != folder
        ] + [
            features[negative_index[i]]
            for i in range(num_negative) if i % cv != folder
        ])

        assert (len(labels_val) == len(features_val))
        assert (len(labels_train) == len(features_train))
        assert ((len(labels_val) + len(labels_train)) == n)

        for index, beta in enumerate(beta_set):
            model = hiksvm_train(labels_train, features_train, beta=beta)
            #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50)
            fikmodel = svm_to_fiksvm([model],
                                     1, [1.0],
                                     feat_dim=dim,
                                     min_vals=min_vals,
                                     max_vals=max_vals,
                                     num_bins=50)
            results = [(labels_val[i], fikmodel.predict(features_val[i]))
                       for i in range(len(labels_val))]
            results.sort(key=lambda v: (v[1]), reverse=True)
            sorted_labels = [x[0] for x in results]
            perf = scorer.score(sorted_labels)
            print "[find_best_beta] folder %d, beta %g -> %s=%g" % (
                folder, beta, scorer.name(), perf)
            perfs[index].append(perf)

    ranklist = [(beta_set[index], np.mean(perfs[index]))
                for index in range(len(beta_set))]
    ranklist.sort(key=lambda v: (v[1]), reverse=True)
    print "[find_best_beta] done", ranklist
    best_beta = ranklist[0][0]
    cv_score = ranklist[0][1]
    return best_beta, cv_score
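The last few lines of find_best_beta reduce the cross-validation results to a single choice: average each beta's per-fold scores and keep the beta with the highest mean. The same selection step in isolation, with made-up scores:

import numpy as np

# Sketch: pick the beta with the highest mean cross-validation score,
# mirroring the ranklist step above (the per-fold scores are made up).
beta_set = [0.1, 0.5, 0.9]
perfs = [[0.61, 0.59, 0.60],   # per-fold scores for beta = 0.1
         [0.70, 0.68, 0.72],   # beta = 0.5
         [0.66, 0.64, 0.65]]   # beta = 0.9

ranklist = [(beta_set[i], np.mean(perfs[i])) for i in range(len(beta_set))]
ranklist.sort(key=lambda v: v[1], reverse=True)
best_beta, cv_score = ranklist[0]
print("best beta %g, cv score %.3f" % (best_beta, cv_score))   # beta 0.5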
Example #6
File: hiksvm.py  Project: xiaojiew1/KDGAN
    [names, labels] = readAnnotationsFrom(collection=testCollection,
                                          annotationName=testAnnotationName,
                                          concept=targetConcept,
                                          rootpath=rootpath)
    test2label = dict(zip(names, labels))

    for beta in [0.5]:  #[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        #model = hiksvm_train(relabeled, vectors, beta=beta)
        cv = 3
        best_beta, cv_score, model = hiksvm_train_cv(relabeled, vectors, cv,
                                                     scorer, min_vals,
                                                     max_vals)
        print best_beta, cv_score
        #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50)
        fikmodel = svm_to_fiksvm([model],
                                 1, [1.0],
                                 feat_dim=feat_dim,
                                 min_vals=min_vals,
                                 max_vals=max_vals,
                                 num_bins=50)

        results = classifyLargeData(fikmodel,
                                    testImageSet,
                                    test_feat_file,
                                    prob_output=True)
        print results[:5]

        sorted_labels = [test2label[x] for x, y in results]
        score = scorer.score(sorted_labels)
        print "beta", beta, "AP", score