Example #1
    def multi_predict(i):
        if args.pq:
            ex_desc = prep.uncompress(pos_desc[i])
        else:
            ex_desc = pc.loadDescriptors(files[i])
        ex_desc = prep.transform(ex_desc)
        score = []
        for e, cl in enumerate(ex_cls):
            if e == i:
                sc = np.zeros(ex_desc.shape[0])
            else:
                sc = cl.decision_function(ex_desc)
                # TODO: maybe add platt-normalization here
            score.append(sc.reshape(1, -1))

        all_scores = np.concatenate(score, axis=0)

        # search maximum for each sample
        ind = np.argmax(all_scores, axis=0)
        # majority-vote
        vote = np.bincount(ind, minlength=len(ex_cls)).reshape(1, -1)

        # or sum-vote
        sumi = np.sum(all_scores, axis=1).reshape(1, -1)

        progress.update(i + 1)
        return vote, sumi
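
The TODO above mentions Platt normalization. A minimal sketch of that step, assuming held-out decision scores with binary labels (platt_normalize is a hypothetical helper, not part of the source):

import numpy as np
from sklearn.linear_model import LogisticRegression

def platt_normalize(train_scores, train_labels, test_scores):
    # Platt scaling: fit a logistic sigmoid p(y=1|s) on the 1-D scores
    lr = LogisticRegression()
    lr.fit(np.asarray(train_scores).reshape(-1, 1), train_labels)
    return lr.predict_proba(np.asarray(test_scores).reshape(-1, 1))[:, 1]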
Example #2
        def proj(i):
            # n_samples x n_features
            if (not isinstance(args.inputfolder, basestring) and
                    len(args.inputfolder) > 1) or args.inputfolders_suffix != '':
                cur_data = pc.loadMultipleDescriptors(files[i])
                if i == 0:
                    print 'loaded descs of', files[i]
                    print 'shape:', cur_data.shape
            else:
                cur_data = pc.loadDescriptors(files[i])

            if args.mode == 'fit':
                prep.partial_fit(cur_data)
                progress.update(i+1)
                return

            else:
                if i == 0:
                    print 'before:'
                    print cur_data[0]
                    print cur_data.shape, cur_data.dtype

                cur_data = prep.transform(cur_data)

                if i == 0:
                    print 'after:'
                    print cur_data[0,0:min(128,cur_data.shape[1])]
                    print cur_data.shape, cur_data.dtype

            fname = files[i] if isinstance(files[i], basestring)\
                    else files[i][0]

            if os.path.isdir(cp):
                fname = os.path.relpath(fname, cp)

            if fname.endswith('.pkl.gz'):
                name = fname.replace('.pkl.gz','')
            else:
                name = os.path.splitext(fname)[0]

            if os.path.isdir(cp):
                pc.mkdir_p(os.path.join(args.outputfolder,
                    os.path.dirname(name)), silent=True)

            name = os.path.join(args.outputfolder, name + '_pr.pkl.gz')
#            print fname, '-->', name
            with gzip.open(name, 'wb') as F:
                cPickle.dump(cur_data, F, -1)
            progress.update(i+1)
Example #3
 def extract(i):
     descr = pc.loadDescriptors(files[i])
     of = os.path.join(
         args.outputfolder,
         os.path.basename(files[i]).split('.', 1)[0] + '_stat.pkl.gz')
     if args.load_stats and os.path.exists(of):
         N, F = pc.load(of)
     else:
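         # N, F: presumably the zeroth-order (per-component soft counts) and
         # first-order Baum-Welch statistics of descr under the UBM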
         N, F = compute_bw_stats.compute_bw_stats(descr, ubm, None,
                                                  args.nbest)
         pc.dump(of, [N, F], verbose=False)
     if i == 0:
         print N.shape, F.shape
     progress.update(i + 1)
     return N.reshape(1, -1), F.reshape(1, -1)
Example #4
    def encode(i):
        if isinstance(descriptor_files[i], basestring):
            base = os.path.basename(os.path.splitext(descriptor_files[i])[0])
        else:
            base = os.path.basename(os.path.commonprefix(descriptor_files[i]))

        gmm_name = base + '_gmm.pkl.gz'
        gmm = ubm_gmm

        # load encoding
        if args.load_scores:
            filepath = os.path.join(args.load_scores,
                                    base + identifier + '.pkl.gz')
            if os.path.exists(filepath):
                with gzip.open(filepath, 'rb') as f:
                    enc = cPickle.load(f)
                    return enc

        # load data and preprocess
        features = pc.loadDescriptors(descriptor_files[i],
                                      hellinger=args.hellinger,
                                      min_descs_per_file=args.min_descs,
                                      show_progress=True)
        if features is None:
            print 'WARNING: features==None ?!'
            progress.update(i + 1)
            return 0.0

        # make the actual encoding step
        enc = encodeGMM(args.encoding,
                        gmm,
                        features,
                        normalize=args.normalize,
                        update=args.update,
                        relevance=args.relevance)

        # save encoding
        filepath = os.path.join(args.outputfolder,
                                base + identifier + '.pkl.gz')
        with gzip.open(filepath, 'w') as f:
            cPickle.dump(enc, f, -1)

        progress.update(i + 1)

        if args.no_eval:  # save some memory
            return None
        return enc
Example #5
    def predictProbe(i):
        probe_desc = pc.loadDescriptors(files_probe[i])
        if prep:
            if i == 0:
                print 'pre descr[0]', probe_desc[0]
            probe_desc = prep.transform(probe_desc)
            if i == 0:
                print 'post descr[0]', probe_desc[0]
        if ex_cls_bg:  # then use cls as attributes
            probe_desc = exemplar_cls.predictExemplarCls(probe_desc, ex_cls_bg)

            # probe_desc = convertToProbs(probe_desc, ab_list)
        df = exemplar_cls.predictExemplarCls(probe_desc, ex_cls)
        #        df = convertToProbs(df, ab_list)
        #        df = exemplar_cls.voteCls(df)
        progress.update(i + 1)
        return df
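
predictExemplarCls is used above as a black box. A plausible shape for it, assuming it stacks each exemplar classifier's decision values into one column per exemplar (an assumption, not the source's actual helper):

import numpy as np

def predict_exemplar_cls_sketch(desc, cls_list):
    # one column of decision values per exemplar classifier
    return np.concatenate([c.decision_function(desc).reshape(-1, 1)
                           for c in cls_list], axis=1)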
Example #6
    def encode(i):
        if isinstance(descriptor_files[i], basestring):
            base = os.path.basename(os.path.splitext(descriptor_files[i])[0])
        else:
            base = os.path.basename(os.path.commonprefix(descriptor_files[i]))

        gmm_name = base + '_gmm.pkl.gz'
        gmm = ubm_gmm

        # load encoding
        if args.load_scores:
            filepath = os.path.join(args.load_scores, base + identifier + '.pkl.gz')
            if os.path.exists(filepath):
                with gzip.open(filepath, 'rb') as f:
                    enc = cPickle.load(f)
                    return enc

        # load data and preprocess
        features = pc.loadDescriptors(descriptor_files[i],
                                      hellinger=args.hellinger,
                                      min_descs_per_file=args.min_descs,
                                      show_progress=True)
        if features is None:
            print 'WARNING: features==None ?!'
            progress.update(i + 1)
            return 0.0

        # make the actual encoding step
        enc = encodeGMM(args.encoding, gmm, features,
                        normalize=args.normalize,
                        update=args.update, relevance=args.relevance)

        # save encoding
        filepath = os.path.join(args.outputfolder, base + identifier + '.pkl.gz')
        with gzip.open(filepath, 'w') as f:
            cPickle.dump(enc, f, -1)

        progress.update(i + 1)

        if args.no_eval:  # save some memory
            return None
        return enc
Example #7
                pval = (np.sum(diffs > observed_diff) +
                        np.sum(diffs < -observed_diff)) / float(num_samples)
                #return pval, observed_diff, diffs
                return pval

            print 'permutation test', permutation_resampling(
                s1, s2, 10000, np.mean)

        sys.exit(0)

    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                labelfile=args.labelfile,
                                inputfolders_suffix=args.inputfolders_suffix)
    if args.fusion == 'early':
        descriptors = [pc.loadDescriptors(files)]
        print 'loaded descriptor(s), shape:', descriptors[0].shape
    else:
        raise ValueError('currently no other fusion than <early> allowed!')

    # concatenate all possible features
#    if len(args.inputfolder) > 1 or args.inputfolders_suffix != '':
#
#        descriptors, labels, all_files = pc.loadAllDescriptors(args.inputfolder,
#                                                    args.inputfolders_suffix,
#                                                    args.suffix, args.labelfile,
#                                                    1 if args.fusion == 'early' else None)
# TODO: this is illogical: should be args.labelfile_gallery ...
    if args.labelfile_probe:
        if args.inputfolders_probe:
            probe_inputfolders = args.inputfolders_probe
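
The permutation test printed near the top of this example presumably comes from a helper like the following sketch; the elided body above the pval lines is an assumption:

import numpy as np

def permutation_resampling(s1, s2, num_samples, statistic=np.mean):
    # two-sided permutation test on the difference of a statistic
    observed_diff = statistic(s1) - statistic(s2)
    pooled = np.concatenate([s1, s2])
    diffs = np.empty(num_samples)
    for k in xrange(num_samples):
        perm = np.random.permutation(pooled)
        diffs[k] = statistic(perm[:len(s1)]) - statistic(perm[len(s1):])
    pval = (np.sum(diffs > observed_diff) +
            np.sum(diffs < -observed_diff)) / float(num_samples)
    return pval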
Example #8
    def encode(i):
        if isinstance(descriptor_files[i], basestring):
            fname = descriptor_files[i]
            base = fname
            if os.path.isdir(cp):
                base = os.path.relpath(fname, cp)

            if fname.endswith('.pkl.gz'):
                base = base.replace('.pkl.gz','')
            else:
                base = os.path.splitext(base)[0]

            if os.path.isdir(cp):
                folder = os.path.join(args.outputfolder,
                    os.path.dirname(base))
                # print 'should create: {} + {}'.format(args.outputfolder, base)
                pc.mkdir_p(folder, silent=True)
        else:
            base = os.path.basename(os.path.commonprefix(descriptor_files[i]))

        gmm_name = base + ('_gmm.pkl.gz' if 'bob' not in args.lib else '_gmm_bob.hdf5')
        gmm = ubm_gmm

        scribe_gmm = None
        # load gmm if possible
        if args.load_gmm:
            gmm_file = os.path.join(args.load_gmm, gmm_name)
            scribe_gmm = load_gmm(gmm_file, args.lib)

        # load encoding
        if args.load_scores:
            if args.load_scores == 'outputfolder':
                load_f = args.outputfolder
            else:
                load_f = args.load_scores

            filepath = os.path.join(load_f, base + identifier + '.pkl.gz')
            if os.path.exists(filepath):
                with gzip.open(filepath, 'rb') as f:
                    enc = cPickle.load(f)
                    return enc, None
#            else:
#                print ('WARNING: encoding {} doesnt exist, compute'
#                        'it'.format(filepath ))


        if args.concat_later:
            enc = []
            for k in range(len(descriptor_files[i])):
                # load data and preprocess
                features = pc.loadDescriptors(descriptor_files[i][k],
                                              min_descs_per_file=args.min_descs,
                                              show_progress=not args.concat)
                if features is None:
                    print 'features==None'
                    continue
                features = prep.transform(features)

                enc_ = encoder.encode(features)
                enc.append(enc_)
            enc = np.concatenate(enc, axis=0)

        else:
            # load data and preprocess
            features = pc.loadDescriptors(descriptor_files[i],
                                          min_descs_per_file=args.min_descs,
                                          show_progress=not args.concat)
            posteriors = None
            if args.posteriors_dir:
                posteriors = pc.loadDescriptors(posterior_files[i])
                assert(len(posteriors) == len(features))
            if not isinstance(features, np.ndarray) and not features:
                print 'features==None?'
                progress.update(i+1)
                return 0.0, None

            if i == 0:
                print '0-shape:', features.shape
            features = prep.transform(features)
            if i == 0:
                print '0-shape (possibly after pca):', features.shape

            if args.maskfolder:
                sample_weights = pc.loadDescriptors(maskfiles[i])
            else:
                sample_weights = None
            enc, scribe_gmm = encoder.encode(features, return_gmm=True,
                                             sample_weights=sample_weights,
                                             posteriors=posteriors,
                                             verbose=(i == 0))
            if i == 0:
                print '0-enc-shape', enc.shape
                if isinstance(sample_weights, np.ndarray):
                    print 'sample-weights shape:', sample_weights.shape
            # write
            if args.save_gmm:
                scribe_gmm_filename = os.path.join(args.outputfolder, gmm_name)
                if 'bob' in args.lib:
                    scribe_gmm.save(bob.io.HDF5File(scribe_gmm_filename, 'w'))
                else:
                    with gzip.open(scribe_gmm_filename, 'wb') as f:
                        cPickle.dump(scribe_gmm, f, -1)
                pc.verboseprint('wrote', scribe_gmm_filename)
                progress.update(i+1)

        if args.pq and args.load_pq:
            enc = prep.compress(enc, aug=args.aug)

        # save encoding
        filepath = os.path.join(args.outputfolder,
                                base + identifier +
                                ('_pq' if args.pq else '') + '.pkl.gz')
        with gzip.open(filepath, 'wb') as f:
            cPickle.dump(enc, f, -1)

        progress.update(i+1)
        if 'nothing' in args.evaluate:
            return None, None
        return enc, scribe_gmm
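
The args.pq branches above compress and uncompress encodings via prep. A toy sketch of such a product-quantization round trip, assuming m equally sized subvectors, each quantized to its nearest k-means centroid (illustration only; the real Preprocess may differ):

import numpy as np
from sklearn.cluster import KMeans

def pq_fit(X, m=4, k=256):
    # one codebook per subvector block; X's dimension must be divisible by m
    return [KMeans(n_clusters=k).fit(b) for b in np.hsplit(X, m)]

def pq_compress(X, codebooks):
    # store only the centroid index of each subvector
    blocks = np.hsplit(X, len(codebooks))
    return np.stack([c.predict(b) for c, b in zip(codebooks, blocks)], axis=1)

def pq_uncompress(codes, codebooks):
    # replace each stored index by its centroid
    return np.hstack([c.cluster_centers_[codes[:, j]]
                      for j, c in enumerate(codebooks)])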
Example #9
def runHelper(prep, args):

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                labelfile=args.labelfile, exact=args.exact,
                                inputfolders_suffix=args.inputfolders_suffix,
                                max_files=args.max_files)
    print 'process {} files'.format(len(files))
    widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
               progressbar.ETA()]


    if args.load_all_features:
        cur_data, index_list = pc.loadDescriptors(
            files,
            max_descs=args.max_descriptors[0] if args.max_descriptors else 0,
            return_index_list=True)

        # per descriptor labels:
        if len(index_list)-1 != len(labels):
            raise ValueError('{} != {} + 1'.format(len(index_list),
                                                   len(labels)))
        le = preprocessing.LabelEncoder()
        labels = le.fit_transform(labels)
        desc_labels = np.zeros(len(cur_data), dtype=np.uint32)
        for r in xrange(len(labels)):
            desc_labels[index_list[r]:index_list[r+1]] = labels[r]

        print 'loaded all', cur_data.shape
        if 'transform' in args.mode and args.mode != 'fit_transform':
            print 'first feature before:', cur_data[0]
            print 'dimension before:', cur_data.shape[1], cur_data.dtype
            cur_data = prep.transform(cur_data)
            print 'first feature after:', cur_data[0]
            print 'dimension after:', cur_data.shape[1], cur_data.dtype

        if 'fit' in args.mode:
            if 'transform' in args.mode and args.strip_aug:
                prep.strip_aug = False
            prep.fit(cur_data, labels=desc_labels)

            if args.mode == 'fit_transform':
                cur_data = prep.transform(cur_data)

    else:
        progress = progressbar.ProgressBar(widgets=widgets,
                                           maxval=len(files))

        if any(isinstance(f, tuple) for f in files):
            files1 = [f for f in zip(*files)[0]]
            cp = os.path.commonprefix(files1)
        else:
            cp = os.path.commonprefix(files)

        def proj(i):
            # n_samples x n_features
            if (not isinstance(args.inputfolder, basestring) and
                    len(args.inputfolder) > 1) or args.inputfolders_suffix != '':
                cur_data = pc.loadMultipleDescriptors(files[i])
                if i == 0:
                    print 'loaded descs of', files[i]
                    print 'shape:', cur_data.shape
            else:
                cur_data = pc.loadDescriptors(files[i])

            if args.mode == 'fit':
                prep.partial_fit(cur_data)
                progress.update(i+1)
                return

            else:
                if i == 0:
                    print 'before:'
                    print cur_data[0]
                    print cur_data.shape, cur_data.dtype

                cur_data = prep.transform(cur_data)

                if i == 0:
                    print 'after:'
                    print cur_data[0,0:min(128,cur_data.shape[1])]
                    print cur_data.shape, cur_data.dtype

            fname = files[i] if isinstance(files[i], basestring)\
                    else files[i][0]

            if os.path.isdir(cp):
                fname = os.path.relpath(fname, cp)

            if fname.endswith('.pkl.gz'):
                name = fname.replace('.pkl.gz','')
            else:
                name = os.path.splitext(fname)[0]

            if os.path.isdir(cp):
                pc.mkdir_p(os.path.join(args.outputfolder,
                    os.path.dirname(name)), silent=True)

            name = os.path.join(args.outputfolder, name + '_pr.pkl.gz')
#            print fname, '-->', name
            with gzip.open(name, 'wb') as F:
                cPickle.dump(cur_data, F, -1)
            progress.update(i+1)

        progress.start()
        # FIXME: np.dot (e.g. used for (R)PCA) doesn't work in parallel atm
#        if args.parallel:
#            pc.parmap(proj, range(len(files)), args.nprocs)
#        else:
        map(proj, range(len(files)))
        progress.finish()

    prep.save_trafos(args.outputfolder)
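
The mode == 'fit' path streams one file at a time through prep.partial_fit. A minimal sketch of that incremental-fitting pattern with sklearn's IncrementalPCA, one estimator Preprocess could plausibly wrap (an assumption):

import numpy as np
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=16)
for _ in range(10):
    # ten synthetic descriptor batches standing in for per-file loads
    ipca.partial_fit(np.random.randn(256, 128))
reduced = ipca.transform(np.random.randn(5, 128))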
Example #10
def createExemplarClsFromFile(ex_file,
                              b_files,
                              cls,
                              clsname='sgd',
                              subfolds=1,
                              average=False,
                              weights=(0.5, 0.01)):
    """
    parameters:
        ex_descr: descriptor(s) for which to make an exemplar-classifier
        b_files: files containing the negative descriptors
        cls: the classifier base class
    returns: the exemplar classifier
    """

    # load descriptors to compute an exemplar classifier for
    # == the positive class
    ex_desc = pc.loadDescriptors(ex_file)
    if average:
        ex_desc = np.mean(ex_desc, axis=0).reshape(1, -1)

    if clsname == 'sgd' and subfolds > 1:
        file_groups = np.array_split(b_files, subfolds)
        ex_desc_splits = np.array_split(ex_desc, subfolds)
    elif average:
        file_groups = [b_files]
        ex_desc_splits = [ex_desc]

    if (clsname == 'sgd' and subfolds > 1) or average:
        cls = copy.deepcopy(cls)
        # training part
        for e, cur_files in enumerate(file_groups):
            cur_data = [ex_desc_splits[e]]
            cur_labels = [1] * ex_desc_splits[e].shape[0]

            # insert negatives from background files
            for f in range(len(cur_files)):
                temp_data = pc.loadDescriptors(cur_files[f])
                if temp_data is None:
                    print 'could not load', cur_files[f]
                    continue
                if average:
                    temp_data = np.mean(temp_data, axis=0).reshape(1, -1)
                cur_data.append(temp_data)
                cur_labels.extend([0] * temp_data.shape[0])

            cur_data = np.concatenate(cur_data, axis=0)

            sample_weight = [weights[0]] * ex_desc.shape[0]
            sample_weight.extend([weights[1]] *
                                 (len(cur_labels) - ex_desc.shape[0]))
            if clsname == 'sgd':
                cls.partial_fit(cur_data,
                                cur_labels,
                                classes=[1, 0],
                                sample_weight=sample_weight)
            else:
                cls.fit(cur_data, cur_labels, sample_weight=sample_weight)

            del cur_data, cur_labels
    # faster process:
    else:
        neg_desc = pc.loadDescriptors(b_files)
        cls = createExemplarCls(ex_desc, neg_desc, cls, weights)

    return cls
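
Hypothetical usage, assuming an sklearn SGDClassifier as the base classifier (the file names are invented for illustration):

from sklearn.linear_model import SGDClassifier

cls = createExemplarClsFromFile('pos_descr.pkl.gz',
                                ['neg_a.pkl.gz', 'neg_b.pkl.gz'],
                                SGDClassifier(loss='hinge'),
                                clsname='sgd', subfolds=2)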
Example #11

def runNN(descriptors, labels, parallel, nprocs):
    """
    compute nearest neighbor from specific descriptors, given labels
    """

    distance_method = {"cosine": 'cosine'}
    ret_matrix = None
    for name, method in distance_method.iteritems():
        dist_matrix = computeDistances(descriptors, method, parallel, nprocs)

        computeStats(name, dist_matrix, labels, parallel)
        ret_matrix = dist_matrix

    return ret_matrix


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Evaluate stuff")
    parser = pc.commonArguments(parser)
    args = parser.parse_args()

    descr_files, labels = pc.getFiles(args.inputfolder,
                                      args.suffix,
                                      args.labelfile,
                                      exact=True)
    descriptors = pc.loadDescriptors(descr_files)

    ret_matrix = runNN(descriptors, labels, args.parallel, args.nprocs)
Example #12
        pc.mkdir_p(args.outputfolder)

    files, _ = pc.getFiles(args.inputfolder,
                           args.suffix,
                           args.labelfile,
                           exact=True)
    if not files or len(files) == 0:
        print 'getFiles() returned no images'
        sys.exit(1)

    # load features to train a universal background gmm
    print 'load features for training ubm from {} files'.format(len(files))

    descriptors = pc.loadDescriptors(files,
                                     max_descs=args.max_descriptors[0],
                                     max_descs_per_file=max(int(args.max_descriptors[0] / len(files)), 1),
                                     rand=True,
                                     hellinger=args.hellinger)
    print 'got {} features'.format(len(descriptors))
    print 'features.shape', descriptors.shape

    vocabulary = computeVocabulary(descriptors, args.method, args.num_clusters,
                                   args.iterations, args.update,
                                   args.covar_type)

    # save gmm
    voc_filepath = os.path.join(args.outputfolder,
                                args.vocabulary_filename + '.pkl.gz')
    with gzip.open(voc_filepath, 'wb') as f:
        cPickle.dump(vocabulary, f, -1)
        print 'saved vocabulary at', voc_filepath
Example #13
def run(args, prep, write_stats=False):
    # create (or load) for each file an exemplar classifier
    # using the rest of the files as background class
    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                labelfile=args.labelfile)
    # all labels should differ!
    assert (len(set(labels)) == len(labels))

    # if we use classifiers as attributes then we need
    # background-classifiers independent from the training set
    if args.attribute:
        assert args.bi

    # additional background descriptors
    if len(args.bi) > 0:
        assert (len(args.bi) == len(args.bl))
        bg_files, bg_labels = pc.getFiles(args.bi,
                                          args.suffix,
                                          labelfile=args.bl,
                                          concat=True)
        #        bg_files = []
        #        bg_labels = []
        #        for e,bi in enumerate(args.bi):
        #            tmp_bg_files, tmp_bg_labels = pc.getFiles(bi, args.suffix,
        #                                      labelfile=args.bl[e])
        # Don't need this assert since the background labels are allowed
        # to appear multiple times
        #            assert( len(list(set(tmp_bg_labels))) == len(tmp_bg_labels) )
        #            bg_files.extend(tmp_bg_files)
        #            bg_labels.extend(tmp_bg_labels)

        #        assert( len(list(set(bg_labels+labels))) == len(bg_labels+labels) )
        assert (len(set(labels).intersection(set(bg_labels))) == 0)

    ex_cls = []
    if args.load_ex_cls:
        for f in files:
            ex_cls.append(pc.load(f))
    else:
        if (not args.scale and args.load_trafo != 'scaler') and \
           ('svm' in args.clsname or args.clsname == 'sgd'):
            print 'WARNING: svm or sgd chosen but --scale not set!'

        all_cls = args.func(args)
        if not all_cls:
            raise ValueError('no classifier given')
        the_cls = all_cls[0]

        print 'load:', args.inputfolder
        descr = pc.loadDescriptors(files)
        print 'shape:', descr.shape
        if len(args.bi) > 0:
            print 'load descriptors of: ' + ','.join(args.bi)
            descr_bg = pc.loadDescriptors(bg_files)
            print 'shape:', descr_bg.shape
            if not args.attribute:
                descr = np.concatenate([descr, descr_bg], axis=0)
                print 'concat shape:', descr.shape

        print 'pre descr[0]', descr[0]
        print 'fit-transform'
        descr = prep.fit_transform(descr)
        print 'post descr[0]', descr[0]
        print 'possible new shape:', descr.shape
        prep.save_trafos(args.outputfolder)

        if args.attribute:
            descr_bg = prep.transform(descr_bg)
            print 'compute attribute space, dim=', len(descr_bg)
            ex_cls_bg = computeExCls(descr_bg,
                                     the_cls,
                                     len(descr_bg),
                                     args.outputfolder,
                                     bg_labels,
                                     '_attr.pkl.gz',
                                     parallel=args.parallel)
            descr = exemplar_cls.predictExemplarCls(descr, ex_cls_bg)
            # platt calibration
            #            ab_list = computeAB(descr_bg, ex_cls_bg, bg_labels)
            #            descr = convertToProbs(descr, ab_list)
            print 'new descr-shape:', descr.shape

        ex_cls = computeExCls(descr,
                              the_cls,
                              len(files),
                              args.outputfolder,
                              labels,
                              parallel=args.parallel)

        # platt calibration
        # ab_list = computeAB(descr, ex_cls, labels)

    print 'load test:', args.pi
    files_probe, labels_probe = pc.getFiles(args.pi,
                                            args.suffix,
                                            labelfile=args.pl)

    print 'predict now'
    scores = predict(files_probe, ex_cls, prep, parallel=args.parallel)
    # this is our scores-matrix
    scores_mat = np.concatenate(scores, axis=0)
    stats = evaluate.computeStats('sum/max',
                                  scores_mat,
                                  labels_probe,
                                  labels,
                                  distance=False,
                                  parallel=args.parallel)
    if write_stats:
        evaluate.write_stats(os.path.join(args.outputfolder, 'stats.txt'),
                             stats)
Example #14
def run(args, prep=None):
    if prep is None:
        prep = preprocess.Preprocess()
    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                args.labelfile,
                                exact=args.exact,
                                max_files=args.max_files)
    if files is None or len(files) == 0:
        print 'getFiles() returned no images'
        sys.exit(1)

    maskfiles = pc.getMaskFiles(files, args.suffix, args.maskfolder,
                                args.masksuffix)
    if len(args.max_descriptors) == 0:
        descriptors, rand_indices = pc.loadDescriptors(
            files, rand=True, return_random_indices=True)
    else:
        max_descs_per_file = int(args.max_descriptors[0] / float(len(files)))
        max_descs_per_file = max(max_descs_per_file, 1)
        descriptors, rand_indices = pc.loadDescriptors(
            files,
            max_descs=args.max_descriptors[0],
            max_descs_per_file=max_descs_per_file,
            rand=True,
            maskfiles=maskfiles,
            return_random_indices=True)

    print 'got {} features'.format(len(descriptors))
    print 'features.shape', descriptors.shape

    # load features to train a universal background gmm
    print 'load features for training ubm from {} files'.format(len(files))

    if args.method == 'posteriors':
        posteriors_files, _ = pc.getFiles(args.posteriors_dir,
                                          args.posteriors_suffix,
                                          labelfile=args.labelfile,
                                          exact=args.exact,
                                          max_files=args.max_files)
        assert (len(posteriors_files) == len(files))
        indices = []

        widgets = [
            progressbar.Percentage(), ' ',
            progressbar.Bar(), ' ',
            progressbar.ETA()
        ]
        progress = progressbar.ProgressBar(widgets=widgets,
                                           maxval=len(posteriors_files))
        progress.start()
        for e, f in enumerate(posteriors_files):
            posteriors = pc.loadDescriptors(f)
            posteriors = posteriors[rand_indices[e]]
            cluster_idx = posteriors.argmax(axis=1)
            indices.append(cluster_idx)
            progress.update(e + 1)
        progress.finish()

        indices = np.concatenate(indices)
        assert (len(indices) == len(descriptors))
        means = recomputeMeans(descriptors, indices)
        vocabulary = cluster.KMeans(means.shape[0])  # dummy
        vocabulary.means_ = means
        vocabulary.type_ = 'kmeans'
    else:
        vocabulary = computeVocabulary(descriptors, args.method,
                                       args.num_clusters, args.iterations,
                                       args.gmm_update, args.lib,
                                       args.covar_type, args.nprocs)

    # TODO: rewrite to be more generic
    if 'sparse' in args.method and 'gmm' in args.method:
        gmm = mixture.GMM(args.num_clusters,
                          n_iter=args.iterations,
                          params=args.gmm_update,
                          init_params='wc')
        gmm.means_ = vocabulary.reshape(args.num_clusters, -1)
        gmm.fit(descriptors)
        vocabulary = gmm

    if args.predict:
        pred = vocabulary.predict(descriptors)
        pred_prob = None
        if 'predict_proba' in dir(vocabulary):
            pred_prob = vocabulary.predict_proba(descriptors)
        for i, f in enumerate(files):
            if pred_prob is not None:
                print '{}\t[{}], ([{}])'.format(os.path.basename(f), pred[i],
                                                pred_prob[i])
            else:
                print '{}\t[{}]'.format(os.path.basename(f), pred[i])

    # save gmm
    voc_filepath = os.path.join(
        args.outputfolder,
        (args.vocabulary_filename
         if args.vocabulary_filename is not None else args.method) + '.pkl.gz')
    with gzip.open(voc_filepath, 'wb') as f:
        cPickle.dump(vocabulary, f, -1)
    print 'saved vocabulary at', voc_filepath

    if args.method == 'gmm':
        try:
            aic = vocabulary.aic(descriptors)
            print 'aic:', aic
            with open(os.path.join(args.outputfolder, 'aic.txt'), 'a') as f:
                f.write('{}\n'.format(aic))
        except:
            # print 'couldnt compute aic, error: {}'.format(e)
            raise

    return os.path.abspath(voc_filepath)
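
recomputeMeans is not shown in this example. A sketch of what it presumably does, re-estimating each cluster mean from the descriptors whose posterior argmax selects it (an assumption about the missing helper):

import numpy as np

def recompute_means_sketch(descriptors, indices):
    # mean of all descriptors assigned to each cluster index;
    # assumes every cluster index occurs at least once
    n_clusters = indices.max() + 1
    return np.vstack([descriptors[indices == k].mean(axis=0)
                      for k in range(n_clusters)])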
Example #15
    def exemplar_classify(i):
        cls = copy.deepcopy(the_cls)
        # load descriptors to compute an exemplar classifier for
        # == the positive class
        if args.pq:
            ex_desc = prep.uncompress(pos_desc[i])
        else:
            ex_desc = pc.loadDescriptors(files[i])
        if args.average:
            ex_desc = np.mean(ex_desc, axis=0).reshape(1, -1)

        if args.clsname == 'sgd' and args.subfolds > 1:
            file_groups = np.array_split(b_files, args.subfolds)
            ex_desc_splits = np.array_split(ex_desc, args.subfolds)
        else:
            file_groups = [b_files]
            ex_desc_splits = [ex_desc]

        # training part
        for e, cur_files in enumerate(file_groups):
            cur_data = [ex_desc_splits[e]]
            cur_labels = [1] * ex_desc_splits[e].shape[0]

            # insert negatives from background files
            for f in range(len(cur_files)):
                if args.pq:
                    temp_data = prep.uncompress(neg_desc[f])
                else:
                    temp_data = pc.loadDescriptors(cur_files[f])
                    #                              max_descs_per_file=max_descs)
                    if temp_data is None:
                        print 'could not load', cur_files[f]
                        continue
                if args.average:
                    temp_data = np.mean(temp_data, axis=0).reshape(1, -1)
                cur_data.append(temp_data)
                cur_labels.extend([0] * temp_data.shape[0])

            cur_data = np.concatenate(cur_data, axis=0)
            #            print 'cur_data', cur_data.shape, cur_data.dtype

            cur_data = prep.transform(cur_data)

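            # positives get weight 0.5, negatives 0.01, mirroring
            # weights=(0.5, 0.01) in createExemplarClsFromFile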
            sample_weight = [0.5] * ex_desc.shape[0]
            sample_weight.extend([0.01] * (len(cur_labels) - ex_desc.shape[0]))
            if args.clsname == 'sgd':
                cls.partial_fit(cur_data,
                                cur_labels,
                                classes=[0, 1],
                                sample_weight=sample_weight)
            else:
                cls.fit(cur_data, cur_labels, sample_weight=sample_weight)

            del cur_data, cur_labels

#                filename = os.path.join(args.outputfolder, args.clsname) +'.pkl.gz'
#                with gzip.open(filename, 'wb') as fOut:
#                    cPickle.dump(cls, fOut, -1)
#                    print 'saved', filename
        progress.update(i + 1)
        return cls
Example #16
    print "NN {:10} TOP-1: {:7}  mAP: {:12}".format(name, top1, mAP)
    
    return top1, mAP

def runNN(descriptors, labels, parallel, nprocs):
    """
    compute nearest neighbor from specific descriptors, given labels
    """

    distance_method = { "cosine": 'cosine' }
    ret_matrix = None
    for name, method in distance_method.iteritems():
        dist_matrix = computeDistances(descriptors, method, 
                                           parallel, nprocs)

        computeStats(name, dist_matrix, labels, parallel)
        ret_matrix = dist_matrix

    return ret_matrix 

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Evaluate stuff")
    parser = pc.commonArguments(parser)
    args = parser.parse_args()

    descr_files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                      args.labelfile, exact=True)
    descriptors = pc.loadDescriptors(descr_files)

    ret_matrix = runNN(descriptors, labels, args.parallel, args.nprocs)
Example #17
def run(args):
    print '> compute LCS'
    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                args.labelfile,
                                exact=args.exact)
    if len(args.max_descriptors) == 0:
        descriptors, index_list = pc.loadDescriptors(files,
                                                     rand=True,
                                                     return_index_list=1)
    else:
        descriptors, index_list = pc.loadDescriptors(
            files,
            max_descs=args.lcs_max_descriptors,
            max_descs_per_file=max(int(args.lcs_max_descriptors / len(files)), 1),
            rand=True,
            return_index_list=1)
        print 'descriptors.shape', descriptors.shape
#        #if not args.inputfolders:
#        cur_data, index_list = pc.loadDescriptors(files,
#                                                  max_descs=args.max_descriptors[0]\
#                                                  if args.max_descriptors\
#                                                  else 0,
#                                                  return_index_list=True)

    # per-descriptor labels:
    if len(index_list) - 1 != len(labels):
        raise ValueError('{} != {} + 1'.format(len(index_list), len(labels)))
    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(labels)
    desc_labels = np.zeros(len(descriptors), dtype=np.uint32)
    for r in xrange(len(labels)):
        desc_labels[index_list[r]:index_list[r + 1]] = labels[r]

    prep = preprocess.Preprocess(args)

    ubm = ubm_adaption.loadGMM(args.load_ubm)
    if not args.no_assignment:
        assignments = encoding.getAssignment(ubm.means_, descriptors)
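        # assignments: presumably an (n_descriptors x n_clusters) assignment
        # matrix; column i > 0 selects the descriptors of component i below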
    lcs = []
    descr = []
    # Note: we could also compute the LCS afterwards using 'multipca' option
    # of preprocess...
    for i in range(len(ubm.means_)):
        if args.no_assignment:
            diff = descriptors - ubm.means_[i]
        else:
            for_lcs = descriptors[assignments[:, i] > 0]
            diff = for_lcs - ubm.means_[i]
        if args.resnorm:
            diff = preprocessing.normalize(diff, norm='l2', copy=False)
        if not args.global_cs:
            prep.fit(diff, desc_labels[assignments[:, i] > 0])
            lcs.append(copy.deepcopy(prep.pca))
            prep.pca = None
        else:
            descr.append(diff)

    if args.global_cs:
        print '> compute global lcs'
        diff = np.concatenate(descr, axis=1)
        print '... from descr.shape', diff.shape
        prep.fit(diff, desc_labels)
        print '< compute global lcs'
        lcs = copy.deepcopy(prep.pca)
        prep.pca = None
    out_file = os.path.join(args.outputfolder, 'lcs.pkl.gz')
    pc.dump(out_file, lcs)
    return out_file
Example #18
    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, _ = pc.getFiles(args.inputfolder, args.suffix, args.labelfile,
                           exact=True)
    if not files or len(files) == 0:
        print 'getFiles() returned no images'
        sys.exit(1)

    # load features to train a universal background gmm
    print 'load features for training ubm from {} files'.format(len(files))

    descriptors = pc.loadDescriptors(files,
                                     max_descs=args.max_descriptors[0],
                                     max_descs_per_file=max(int(args.max_descriptors[0] / len(files)), 1),
                                     rand=True,
                                     hellinger=args.hellinger)
    print 'got {} features'.format(len(descriptors))
    print 'features.shape', descriptors.shape

    vocabulary = computeVocabulary(descriptors, args.method, args.num_clusters,
                                   args.iterations, args.update,
                                   args.covar_type)

    # save gmm
    voc_filepath = os.path.join(args.outputfolder,
                                args.vocabulary_filename + '.pkl.gz')
    with gzip.open(voc_filepath, 'wb') as f:
        cPickle.dump(vocabulary, f, -1)
        print 'saved vocabulary at', voc_filepath
Example #19
    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                args.labelfile,
                                concat=True)
    print 'n-files:', len(files)

    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(labels)

    desc_files, _ = pc.getFiles(args.df, args.ds, args.labelfile, concat=True)

    kmeans = pc.load(args.cluster)
    means = kmeans.means_

    print files[0], desc_files[0]
    dummy_desc = pc.loadDescriptors(files[0])
    dummy_desc2 = pc.loadDescriptors(desc_files[0])
    assert (dummy_desc.shape[0] == dummy_desc2.shape[0])

    print 'descr.shape:', dummy_desc.shape
    desc = np.zeros((args.max_descriptors[0], dummy_desc2.shape[1]),
                    dtype=np.float32)
    labels_out = np.zeros((args.max_descriptors[0], 1), dtype=np.float32)
    labels_real = np.zeros((args.max_descriptors[0], 1), dtype=np.float32)
    max_descs_per_file = args.max_descriptors[0] / len(files)

    cluster_idx = []
    i = 0
    visited_files = {}
    no_new = False
    while i < args.max_descriptors[0]:
Example #20
    if not all_cls:
        raise ValueError('no classifier given')

    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                labelfile=args.labelfile)
    files = np.array(files)
    labels = np.array(labels)

    # these are our background / negative training files
    b_files, b_labels = pc.getFiles(args.bi,
                                    args.bs if args.bs else args.suffix,
                                    labelfile=args.bl)

    # let's first test shapes
    test_f = pc.loadDescriptors(files[0])
    b_test_f = pc.loadDescriptors(b_files[0])
    assert (test_f.shape[1] == b_test_f.shape[1])
    print 'descriptor-dimension:', test_f.shape[1]

    # let's shuffle them
    shuffle_ids = np.arange(len(b_files))
    np.random.shuffle(shuffle_ids)
    b_files = np.array(b_files)[shuffle_ids]
    b_labels = np.array(b_labels)[shuffle_ids]

    prep = preprocess.Preprocess(pca_components=args.pca_components,
                                 normalize=args.normalize)