예제 #1
0
def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    """Fit sigmoid parameters (A, B) for stored concept models and save them back.

    For every concept assigned to this job, the stored model is loaded, the
    annotated training examples are scored with it, ``sigmoid_train`` is run on
    the resulting decision values and labels, and the model file is rewritten
    with the new (A, B) pair.

    Args:
        options: object providing ``rootpath``, ``model``, ``numjobs``,
            ``job`` (1-based) and ``overwrite``.
        trainCollection: collection whose features and annotations are used.
        modelAnnotationName: annotation name under which the models are stored.
        trainAnnotationName: annotation name supplying concepts and labels.
        feature: feature name (sub-directory of ``FeatureData``).
    """
    modelName = options.model
    rootpath = options.rootpath

    # Anything other than 'fastlinear' is handled by the fiksvm loader/saver.
    if modelName != 'fastlinear':
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model

    all_concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    # Round-robin split: concept at position idx belongs to job (idx % numjobs) + 1.
    my_concepts = [
        name for idx, name in enumerate(all_concepts)
        if (idx % options.numjobs + 1) == options.job
    ]

    feature_dir = os.path.join(rootpath, trainCollection, "FeatureData", feature)
    feat_file = BigFile(feature_dir)

    for concept in my_concepts:
        model_path = os.path.join(
            rootpath, trainCollection, 'Models', modelAnnotationName,
            feature, modelName, '%s.model' % concept)
        model = load_model(model_path)
        old_A, old_B = model.get_probAB()

        # A non-zero A is taken as "parameters already fitted".
        already_fitted = abs(old_A) > 1e-8
        if already_fitted and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (old_A, old_B))
            continue

        names, labels = readAnnotationsFrom(
            trainCollection, trainAnnotationName, concept,
            skip_0=True, rootpath=rootpath)
        label_of = dict(zip(names, labels))

        results = classify_large_data(model, names, feat_file, prob_output=False)
        # Re-derive labels in the order of the classification results.
        labels = [label_of[item[0]] for item in results]
        dec_values = [item[1] for item in results]

        num_pos = sum(1 for lab in labels if lab == 1)
        num_neg = sum(1 for lab in labels if lab == -1)
        printStatus(INFO, "%s +%d -%d" % (concept, num_pos, num_neg))

        new_A, new_B = sigmoid_train(dec_values, labels)
        model.set_probAB(new_A, new_B)
        save_model(model_path, model)

        stored_A, stored_B = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (old_A, stored_A, old_B, stored_B))
예제 #2
0
def process(options, trainCollection, trainAnnotationName, valCollection,
            valAnnotationName, feature, modelName):
    """Select the best SVM cost C per concept on a validation collection.

    For each concept assigned to this job, a model is trained on the training
    collection for every C in a fixed grid, the validation examples are ranked
    by the model's predictions, and the ranking is scored with the metric from
    ``options.metric``. The best score, its C, and the (A, B) pair returned by
    ``sigmoid_train`` on the best ranking are written to
    ``<resultdir>/<concept>.txt``.

    Args:
        options: object providing ``rootpath``, ``metric``, ``overwrite``,
            ``numjobs``, ``job`` (1-based) and, for 'fik', ``nr_bins``.
        trainCollection: collection used for training.
        trainAnnotationName: annotation name of the training concepts/labels.
        valCollection: collection used for validation.
        valAnnotationName: annotation name of the validation concepts/labels.
        feature: feature name (sub-directory of ``FeatureData``).
        modelName: either 'fik' or 'fastlinear'.

    Returns:
        0 when there is nothing to do for this job, otherwise None.

    Raises:
        ZeroDivisionError: if a concept has no positive or no negative
            training label (the class weights divide by those counts).
    """
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # Store real lists: params is handed to compress_model once per C
            # value, and a Python 3 map() iterator would be exhausted after
            # its first use.
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
        # Solver options do not depend on C, so build them once.
        solver_opts = ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'
        solver_opts = ' -s 2 -B -1 '

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    valConcepts = readConcepts(valCollection,
                               valAnnotationName,
                               rootpath=rootpath)
    # Training and validation must list the same concepts in the same order.
    concept_num = len(concepts)
    for i in range(concept_num):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(
        rootpath, trainCollection, 'Models', trainAnnotationName,
        '%s,best_params' % modelName,
        '%s,%s,%s' % (valCollection, valAnnotationName, feature))
    pending = [
        concept for concept in concepts
        if not checkToSkip(os.path.join(resultdir, concept + '.txt'), overwrite)
    ]
    # Round-robin split of the remaining concepts over the jobs (job is 1-based).
    todo = [concept for i, concept in enumerate(pending)
            if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(
        os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        n_pos = sum(1 for lab in labels if 1 == lab)
        n_neg = sum(1 for lab in labels if -1 == lab)
        # Class weights: beta splits the total weight between positives and
        # negatives regardless of how many examples each class has.
        wp = float(beta) * (n_pos + n_neg) / n_pos
        wn = (1.0 - beta) * (n_pos + n_neg) / n_neg

        names, labels = readAnnotationsFrom(valCollection,
                                            valAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C
            svm_params += solver_opts

            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i]))
                        for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            # Always accept the first candidate so best_scores/best_labels are
            # populated even when no C scores above 0; otherwise None would be
            # passed to sigmoid_train below.
            if best_scores is None or max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(
            INFO,
            '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' %
            (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        # Context manager closes the file even if the write fails.
        with open(resultfile, 'w') as fw:
            fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
예제 #3
0
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName, feature, modelName):
    """Grid-search the SVM cost C per concept and record the best setting.

    Each concept handled by this job is trained on ``trainCollection`` once per
    C value in a fixed grid. The validation examples are ranked by the trained
    model's predictions and scored with the scorer for ``options.metric``. The
    best score, the corresponding C, and the (A, B) values returned by
    ``sigmoid_train`` for that ranking are written to one text file per concept.

    Args:
        options: object providing ``rootpath``, ``metric``, ``overwrite``,
            ``numjobs``, ``job`` (1-based) and, for 'fik', ``nr_bins``.
        trainCollection: collection used for training.
        trainAnnotationName: annotation name of the training concepts/labels.
        valCollection: collection used for validation.
        valAnnotationName: annotation name of the validation concepts/labels.
        feature: feature name (sub-directory of ``FeatureData``).
        modelName: either 'fik' or 'fastlinear'.

    Returns:
        0 when no concept is left for this job, otherwise None.

    Raises:
        ZeroDivisionError: if a concept has no positive or no negative
            training label (the class weights divide by those counts).
    """
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # list(...) keeps these reusable: params is passed to
            # compress_model for every C, and on Python 3 a bare map() would
            # be consumed by the first call.
            params['min_vals'] = list(map(float, f.readline().split()))
            params['max_vals'] = list(map(float, f.readline().split()))
        # Independent of C, so computed once instead of per grid point.
        solver_opts = ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'
        solver_opts = ' -s 2 -B -1 '

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection,valAnnotationName, rootpath=rootpath)
    # Both collections must list identical concepts in identical order.
    concept_num = len(concepts)
    for i in range(concept_num):
        assert(concepts[i] == valConcepts[i])

    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, '%s,best_params'%modelName, '%s,%s,%s' % (valCollection,valAnnotationName,feature))
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # Keep every numjobs-th remaining concept, starting at offset job-1.
    todo = [concept for i, concept in enumerate(todo) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    val_feat_file = BigFile(os.path.join(rootpath,valCollection,'FeatureData',feature))
    feat_dim = train_feat_file.ndims
    assert(feat_dim == val_feat_file.ndims)

    for concept in todo:
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        num_pos = len([1 for lab in labels if 1 == lab])
        num_neg = len([1 for lab in labels if -1 == lab])
        total = num_pos + num_neg
        # beta is the share of total weight given to the positive class.
        wp = float(beta) * total / num_pos
        wn = (1.0-beta) * total / num_neg

        names,labels = readAnnotationsFrom(valCollection, valAnnotationName, concept, skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names,labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            else:
                svm_params = '-c %g' % C
            svm_params += solver_opts

            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i])) for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v:v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            # The first grid point is always recorded; with the strict '<'
            # alone, a concept whose score never exceeds 0 would leave
            # best_scores/best_labels as None for sigmoid_train.
            if best_scores is None or max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A,B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(INFO, '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' % (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        # 'with' guarantees the handle is closed even if write() raises.
        with open(resultfile, 'w') as fw:
            fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))