예제 #1
0
def process(options, trainCollection, annotationfile, feature, modelName):
    """Train and save one averaged SVM model per concept.

    `annotationfile` is a text file listing annotation names, one per line
    (blank lines and lines starting with '#' are ignored). For every concept
    of the first listed annotation, one SVM is trained per listed annotation
    and the compressed models are merged into a single model, which is saved
    under <rootpath>/<trainCollection>/Models/<annotationfile basename>/<feature>/<modelName>/.

    Parameters:
        options: object providing `rootpath`, `overwrite`, `numjobs`, `job`
            and, for the 'fik' model, `nr_bins`.
        trainCollection: name of the training collection directory.
        annotationfile: path of the file listing the annotation names.
        feature: name of the feature directory under FeatureData.
        modelName: 'fik' or 'fastlinear'.

    Returns:
        0 when there is nothing to do (no annotation listed, an annotation
        file is missing, or no concept is left for this job); None otherwise.

    Raises:
        ZeroDivisionError: if a concept has no positive or no negative
            training example available in the feature file.
    """
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    # params['model'] keeps the bare model name; modelName itself may get the
    # number of bins appended below and is then used for the output directory.
    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # Explicit lists so the values stay reusable on Python 3 as well,
            # where map() would yield a one-shot iterator.
            params['min_vals'] = [float(x) for x in f.readline().split()]
            params['max_vals'] = [float(x) for x in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    # Read the list inside a context manager so the handle is always closed.
    with open(annotationfile) as f:
        stripped = [line.strip() for line in f]
    trainAnnotationNames = [x for x in stripped if x and not x.startswith('#')]
    if not trainAnnotationNames:
        # Without this guard the concept lookup below fails with IndexError.
        print('%s lists no annotations' % annotationfile)
        return 0

    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0

    # The concept vocabulary is taken from the first listed annotation.
    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)
    pending = [c for c in concepts
               if not checkToSkip(os.path.join(resultdir, c + '.model'), overwrite)]
    # Round-robin split of the pending concepts over `numjobs` jobs; `job` is 1-based.
    todo = [c for i, c in enumerate(pending) if i % numjobs == (job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t, annotationName in enumerate(trainAnnotationNames, 1):
            names, labels = readAnnotationsFrom(trainCollection, annotationName, concept, skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            # Labels re-aligned to the order of the vectors actually returned.
            Ys = [name2label[x] for x in renamed]

            if autoweight:
                # Count classes on Ys (the samples really used for training)
                # so the weights match the training set even if the feature
                # file does not return every annotated name.
                n_pos = sum(1 for y in Ys if y == 1)
                n_neg = sum(1 for y in Ys if y == -1)
                wp = float(beta) * (n_pos+n_neg) / n_pos
                wn = (1.0-beta) * (n_pos+n_neg) / n_neg
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                # Kernel index looked up by its "HI" name in KERNEL_TYPE.
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                # Weights 1-1/t and 1/t: a running mean over the t models,
                # assuming add_fastsvm forms a weighted sum -- TODO confirm.
                assemble_model.add_fastsvm(new_model, 1-1.0/t, 1.0/t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
예제 #2
0
파일: negbp.py 프로젝트: Peratham/jingwei
    def learn(concept, params):
        """Iteratively train an averaged classifier for `concept`.

        The first iteration trains on the positives and negatives of the start
        annotation. Every later iteration scores a random sample of the
        negative pool with the current model, picks a new negative bag from
        those predictions, trains a new model and merges it into the ensemble.

        Parameters:
            concept: name of the concept to learn.
            params: dict read for the keys 'rootpath', 'trainCollection',
                'baseAnnotationName', 'startAnnotationName', 'strategy',
                'feat_file', 'npr', 'iterations' and 'model'; it is also
                passed on to compress_model.

        Returns:
            The assembled model, or None when params['iterations'] < 1.

        Raises:
            ZeroDivisionError: if the positive bag or the sampled negative set
                is empty.
        """
        rootpath = params['rootpath']
        trainCollection = params['trainCollection']
        baseAnnotationName = params['baseAnnotationName']
        startAnnotationName = params['startAnnotationName']
        strategy = params['strategy']
        feat_file = params['feat_file']
        feat_dim = feat_file.ndims
        npr = params['npr']
        iterations = params['iterations']
        beta = 0.5

        names, labels = readAnnotationsFrom(trainCollection, startAnnotationName, concept, skip_0=True, rootpath=rootpath)
        positive_bag = [name for name, label in zip(names, labels) if label > 0]
        negative_bag = [name for name, label in zip(names, labels) if label < 0]

        names, labels = readAnnotationsFrom(trainCollection, baseAnnotationName, concept, skip_0=True, rootpath=rootpath)
        negative_pool = [name for name, label in zip(names, labels) if label < 0]

        # Sample size: npr negatives per positive, clamped to [5000, 10000]
        # and never larger than the pool itself.
        Usize = max(5000, len(positive_bag) * npr)
        Usize = min(10000, Usize)
        Usize = min(Usize, len(negative_pool))

        # Bound up front so the final return is valid even if the loop below
        # never runs (iterations < 1).
        assemble_model = None

        for t in range(1, iterations+1):
            printStatus(INFO, 'iter %d (%s)' % (t, concept))
            if t > 1: # select relevant negative examples
                # check how good at classifying positive training examples
                results = classify_large_data(assemble_model, positive_bag, feat_file)
                pos_error_rate = len([1 for x in results if x[1]<0])/float(len(results))

                U = random.sample(negative_pool, Usize)
                predictions = classify_large_data(assemble_model, U, feat_file)
                neg_error_rate = len([1 for x in predictions if x[1]>0])/float(len(predictions))

                error_rate = (pos_error_rate + neg_error_rate)/2.0

                printStatus(INFO, 'iter %d: %s %.3f -> %s %.3f, pe=%.3f, ne=%.3f, error=%.3f' % (t, predictions[-1][0], predictions[-1][1],
                                                                                                    predictions[0][0], predictions[0][1],
                                                                                                    pos_error_rate, neg_error_rate, error_rate))
                if error_rate < MIN_ERROR_RATE:
                    printStatus(INFO, 'hit stop criteria: error (%.3f) < MIN_ERROR_RATE (%.3f)' % (error_rate, MIN_ERROR_RATE))
                    break

                # assume that 1% of the randomly sampled set is truly positive, and the classifier will rank them at the top
                # so ignore them
                nr_of_estimated_pos = int(len(predictions)*0.01)
                negative_bag = NegativeBootstrap.sampling(predictions[nr_of_estimated_pos:], strategy, max(1000, len(positive_bag)))

            new_names = positive_bag + negative_bag
            new_labels = [1] * len(positive_bag) + [-1] * len(negative_bag)
            name2label = dict(zip(new_names, new_labels))
            renamed, vectors = feat_file.read(new_names)
            # Labels re-aligned to the order of the vectors actually returned.
            Ys = [name2label[x] for x in renamed]

            n_pos = len([1 for y in Ys if y>0])
            n_neg = len([1 for y in Ys if y<0])
            # Sanity check: every requested example came back from feat_file.
            assert(len(positive_bag) == n_pos)
            assert(len(negative_bag) == n_neg)
            wp = float(beta) * (n_pos+n_neg) / n_pos
            wn = (1.0-beta) * (n_pos+n_neg) / n_neg
            C = 1
            svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            if 'fik' == params['model']:
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2'
            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                # Weights 1-1/t and 1/t: a running mean over the t models,
                # assuming add_fastsvm forms a weighted sum -- TODO confirm.
                assemble_model.add_fastsvm(new_model, 1-1.0/t, 1.0/t)

        return assemble_model
예제 #3
0
def process(options, trainCollection, annotationfile, feature, modelName):
    """Train and save one averaged SVM model per concept.

    `annotationfile` is a text file listing annotation names, one per line
    (blank lines and lines starting with '#' are ignored). For every concept
    of the first listed annotation, one SVM is trained per listed annotation
    and the compressed models are merged into a single model, which is saved
    under <rootpath>/<trainCollection>/Models/<annotationfile basename>/<feature>/<modelName>/.

    Parameters:
        options: object providing `rootpath`, `overwrite`, `numjobs`, `job`
            and, for the 'fik' model, `nr_bins`.
        trainCollection: name of the training collection directory.
        annotationfile: path of the file listing the annotation names.
        feature: name of the feature directory under FeatureData.
        modelName: 'fik' or 'fastlinear'.

    Returns:
        0 when there is nothing to do (no annotation listed, an annotation
        file is missing, or no concept is left for this job); None otherwise.

    Raises:
        ZeroDivisionError: if a concept has no positive or no negative
            training example available in the feature file.
    """
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    # params['model'] keeps the bare model name; modelName itself may get the
    # number of bins appended below and is then used for the output directory.
    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # Explicit lists so the values stay reusable on Python 3 as well,
            # where map() would yield a one-shot iterator.
            params['min_vals'] = [float(x) for x in f.readline().split()]
            params['max_vals'] = [float(x) for x in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    # Read the list inside a context manager so the handle is always closed.
    with open(annotationfile) as f:
        stripped = [line.strip() for line in f]
    trainAnnotationNames = [
        x for x in stripped if x and not x.startswith('#')
    ]
    if not trainAnnotationNames:
        # Without this guard the concept lookup below fails with IndexError.
        print('%s lists no annotations' % annotationfile)
        return 0

    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations',
                                   annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0

    # The concept vocabulary is taken from the first listed annotation.
    concepts = readConcepts(trainCollection,
                            trainAnnotationNames[0],
                            rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             newAnnotationName, feature, modelName)
    pending = [
        c for c in concepts
        if not checkToSkip(os.path.join(resultdir, c + '.model'), overwrite)
    ]
    # Round-robin split of the pending concepts over `numjobs` jobs; `job` is 1-based.
    todo = [c for i, c in enumerate(pending) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t, annotationName in enumerate(trainAnnotationNames, 1):
            names, labels = readAnnotationsFrom(trainCollection,
                                                annotationName,
                                                concept,
                                                skip_0=True,
                                                rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            # Labels re-aligned to the order of the vectors actually returned.
            Ys = [name2label[x] for x in renamed]

            if autoweight:
                # Count classes on Ys (the samples really used for training)
                # so the weights match the training set even if the feature
                # file does not return every annotated name.
                n_pos = sum(1 for y in Ys if y == 1)
                n_neg = sum(1 for y in Ys if y == -1)
                wp = float(beta) * (n_pos + n_neg) / n_pos
                wn = (1.0 - beta) * (n_pos + n_neg) / n_neg
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                # Kernel index looked up by its "HI" name in KERNEL_TYPE.
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                # Weights 1-1/t and 1/t: a running mean over the t models,
                # assuming add_fastsvm forms a weighted sum -- TODO confirm.
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
예제 #4
0
파일: negbp.py 프로젝트: xiaojiew1/KDGAN
    def learn(concept, params):
        """Iteratively train an averaged classifier for `concept`.

        The first iteration trains on the positives and negatives of the start
        annotation. Every later iteration scores a random sample of the
        negative pool with the current model, picks a new negative bag from
        those predictions, trains a new model and merges it into the ensemble.

        Parameters:
            concept: name of the concept to learn.
            params: dict read for the keys 'rootpath', 'trainCollection',
                'baseAnnotationName', 'startAnnotationName', 'strategy',
                'feat_file', 'npr', 'iterations' and 'model'; it is also
                passed on to compress_model.

        Returns:
            The assembled model, or None when params['iterations'] < 1.

        Raises:
            ZeroDivisionError: if the positive bag or the sampled negative set
                is empty.
        """
        rootpath = params['rootpath']
        trainCollection = params['trainCollection']
        baseAnnotationName = params['baseAnnotationName']
        startAnnotationName = params['startAnnotationName']
        strategy = params['strategy']
        feat_file = params['feat_file']
        feat_dim = feat_file.ndims
        npr = params['npr']
        iterations = params['iterations']
        beta = 0.5

        names, labels = readAnnotationsFrom(trainCollection,
                                            startAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        positive_bag = [name for name, label in zip(names, labels) if label > 0]
        negative_bag = [name for name, label in zip(names, labels) if label < 0]

        names, labels = readAnnotationsFrom(trainCollection,
                                            baseAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        negative_pool = [name for name, label in zip(names, labels) if label < 0]

        # Sample size: npr negatives per positive, clamped to [5000, 10000]
        # and never larger than the pool itself.
        Usize = max(5000, len(positive_bag) * npr)
        Usize = min(10000, Usize)
        Usize = min(Usize, len(negative_pool))

        # Bound up front so the final return is valid even if the loop below
        # never runs (iterations < 1).
        assemble_model = None

        for t in range(1, iterations + 1):
            printStatus(INFO, 'iter %d (%s)' % (t, concept))
            if t > 1:  # select relevant negative examples
                # check how good at classifying positive training examples
                results = classify_large_data(assemble_model, positive_bag,
                                              feat_file)
                pos_error_rate = len([1 for x in results if x[1] < 0]) / float(
                    len(results))

                U = random.sample(negative_pool, Usize)
                predictions = classify_large_data(assemble_model, U, feat_file)
                neg_error_rate = len([1 for x in predictions if x[1] > 0
                                      ]) / float(len(predictions))

                error_rate = (pos_error_rate + neg_error_rate) / 2.0

                printStatus(
                    INFO,
                    'iter %d: %s %.3f -> %s %.3f, pe=%.3f, ne=%.3f, error=%.3f'
                    % (t, predictions[-1][0], predictions[-1][1],
                       predictions[0][0], predictions[0][1], pos_error_rate,
                       neg_error_rate, error_rate))
                if error_rate < MIN_ERROR_RATE:
                    printStatus(
                        INFO,
                        'hit stop criteria: error (%.3f) < MIN_ERROR_RATE (%.3f)'
                        % (error_rate, MIN_ERROR_RATE))
                    break

                # assume that 1% of the randomly sampled set is truly positive, and the classifier will rank them at the top
                # so ignore them
                nr_of_estimated_pos = int(len(predictions) * 0.01)
                negative_bag = NegativeBootstrap.sampling(
                    predictions[nr_of_estimated_pos:], strategy,
                    max(1000, len(positive_bag)))

            new_names = positive_bag + negative_bag
            new_labels = [1] * len(positive_bag) + [-1] * len(negative_bag)
            name2label = dict(zip(new_names, new_labels))
            renamed, vectors = feat_file.read(new_names)
            # Labels re-aligned to the order of the vectors actually returned.
            Ys = [name2label[x] for x in renamed]

            n_pos = len([1 for y in Ys if y > 0])
            n_neg = len([1 for y in Ys if y < 0])
            # Sanity check: every requested example came back from feat_file.
            assert (len(positive_bag) == n_pos)
            assert (len(negative_bag) == n_neg)
            wp = float(beta) * (n_pos + n_neg) / n_pos
            wn = (1.0 - beta) * (n_pos + n_neg) / n_neg
            C = 1
            svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            if 'fik' == params['model']:
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2'
            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                # Weights 1-1/t and 1/t: a running mean over the t models,
                # assuming add_fastsvm forms a weighted sum -- TODO confirm.
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        return assemble_model