Example #1
        def proj(i):
            # n_samples x n_features
            if (not isinstance(args.inputfolder, basestring) and
                    len(args.inputfolder) > 1) or args.inputfolders_suffix != '':
                cur_data = pc.loadMultipleDescriptors(files[i])
                if i == 0:
                    print 'loaded descs of', files[i]
                    print 'shape:', cur_data.shape
            else:
                cur_data = pc.loadDescriptors(files[i])

            if args.mode == 'fit':
                prep.partial_fit(cur_data)
                progress.update(i+1)
                return

            else:
                if i == 0:
                    print 'before:'
                    print cur_data[0]
                    print cur_data.shape, cur_data.dtype

                cur_data = prep.transform(cur_data)

                if i == 0:
                    print 'after:'
                    print cur_data[0,0:min(128,cur_data.shape[1])]
                    print cur_data.shape, cur_data.dtype

            fname = files[i] if isinstance(files[i], basestring)\
                    else files[i][0]

            if os.path.isdir(cp):
                fname = os.path.relpath(fname, cp)

            if fname.endswith('.pkl.gz'):
                name = fname.replace('.pkl.gz','')
            else:
                name = os.path.splitext(fname)[0]

            if os.path.isdir(cp):
                pc.mkdir_p(os.path.join(args.outputfolder,
                    os.path.dirname(name)), silent=True)

            name = os.path.join(args.outputfolder, name + '_pr.pkl.gz')
#            print fname, '-->', name
            with gzip.open(name, 'wb') as F:
                cPickle.dump(cur_data, F, -1)
            progress.update(i+1)
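
The `proj` closure above is the per-file worker: in fit mode it feeds each descriptor file to `prep.partial_fit`, in transform mode it projects the data and re-saves it under a `_pr.pkl.gz` suffix. A minimal sketch of the same two-pass pattern, assuming gzipped-pickle inputs and using sklearn's IncrementalPCA as a stand-in for the project's `prep` object (the file names are hypothetical):

import gzip
import cPickle
from sklearn.decomposition import IncrementalPCA

prep = IncrementalPCA(n_components=64)
paths = ['descs_0.pkl.gz', 'descs_1.pkl.gz']   # hypothetical input files

# pass 1 ('fit'): accumulate statistics one file at a time
for path in paths:
    with gzip.open(path, 'rb') as f:
        cur_data = cPickle.load(f)             # n_samples x n_features
    prep.partial_fit(cur_data)

# pass 2 ('transform'): project each file and save it with a _pr suffix
for path in paths:
    with gzip.open(path, 'rb') as f:
        cur_data = cPickle.load(f)
    cur_data = prep.transform(cur_data)
    with gzip.open(path.replace('.pkl.gz', '_pr.pkl.gz'), 'wb') as f:
        cPickle.dump(cur_data, f, -1)          # -1: latest pickle protocol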
Example #2
                        help='points to the cluster file')
    parser.add_argument('--ratio',
                        type=float,
                        help='max ratio 1st to 2nd nearest cluster')
    return parser


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Clustering - Index')
    parser = pc.commonArguments(parser)
    parser = addArguments(parser)
    args = parser.parse_args()
    np.random.seed(42)

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    print args.max_descriptors

    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                args.labelfile,
                                concat=True)
    print 'n-files:', len(files)

    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(labels)

    desc_files, _ = pc.getFiles(args.df, args.ds, args.labelfile, concat=True)

    kmeans = pc.load(args.cluster)
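
`pc.load` presumably just unpickles the saved cluster model, and the LabelEncoder maps string labels to contiguous integers. A short sketch of both, assuming the vocabulary was written with cPickle into a gzip file as in Example #7 (the path and labels here are made up):

import gzip
import cPickle
from sklearn import preprocessing

def load_pickled(path):
    # counterpart to cPickle.dump(obj, f, -1) into a gzip file
    with gzip.open(path, 'rb') as f:
        return cPickle.load(f)

kmeans = load_pickled('out/kmeans.pkl.gz')     # hypothetical path

le = preprocessing.LabelEncoder()
labels = le.fit_transform(['scribe_b', 'scribe_a', 'scribe_b'])
print labels                                   # [1 0 1]
print le.inverse_transform(labels)             # back to the string labels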
Example #3
    def encode(i):
        if isinstance(descriptor_files[i], basestring):
            fname = descriptor_files[i]
            # fall back to the full path when there is no common directory
            base = os.path.relpath(fname, cp) if os.path.isdir(cp) else fname

            if fname.endswith('.pkl.gz'):
                base = base.replace('.pkl.gz','')
            else:
                base = os.path.splitext(base)[0]

            if os.path.isdir(cp):
                folder = os.path.join(args.outputfolder,
                    os.path.dirname(base))
                # print 'should create: {} + {}'.format(args.outputfolder, base)
                pc.mkdir_p(folder,silent=True)
        else:
            base = os.path.basename(os.path.commonprefix(descriptor_files[i]))

        gmm_name = base + ('_gmm_bob.hdf5' if 'bob' in args.lib else '_gmm.pkl.gz')
        gmm = ubm_gmm

        scribe_gmm = None
        # load gmm if possible
        if args.load_gmm:
            gmm_file = os.path.join(args.load_gmm, gmm_name)
            scribe_gmm = load_gmm(gmm_file, args.lib)

        # load encoding
        if args.load_scores:
            if args.load_scores == 'outputfolder':
                load_f = args.outputfolder
            else:
                load_f = args.load_scores

            filepath = os.path.join(load_f, base + identifier + '.pkl.gz')
            if os.path.exists(filepath):
                with gzip.open(filepath, 'rb') as f:
                    enc = cPickle.load(f)
                    return enc, None
#            else:
#                print ('WARNING: encoding {} doesnt exist, compute'
#                        'it'.format(filepath ))


        if args.concat_later:
            enc = []
            for k in range(len(descriptor_files[i])):
                # load data and preprocess
                features = pc.loadDescriptors(descriptor_files[i][k],
                                              min_descs_per_file=args.min_descs,
                                              show_progress=not args.concat)
                if features is None:
                    print 'features==None'
                    continue
                features = prep.transform(features)

                enc_ = encoder.encode(features)
                enc.append(enc_)
            enc = np.concatenate(enc, axis=0)

        else:
            # load data and preprocess
            features = pc.loadDescriptors(descriptor_files[i],
                                          min_descs_per_file=args.min_descs,
                                          show_progress=not args.concat)
            posteriors = None
            if args.posteriors_dir:
                posteriors = pc.loadDescriptors( posterior_files[i] )
                assert(len(posteriors) == len(features))
            if not isinstance(features, np.ndarray) and not features:
                print 'features==None?'
                progress.update(i+1)
                return 0.0, None

            if i == 0:
                print '0-shape:',features.shape
            features = prep.transform(features)
            if i == 0:
                print '0-shape (possibly after pca):',features.shape

            if args.maskfolder:
                sample_weights = pc.loadDescriptors(maskfiles[i])
            else:
                sample_weights = None
            enc, scribe_gmm = encoder.encode(features, return_gmm=True,
                                             sample_weights=sample_weights,
                                             posteriors=posteriors,
                                             verbose=(i == 0))
            if i == 0:
                print '0-enc-shape', enc.shape
                if isinstance(sample_weights, np.ndarray):
                    print 'sample-weights shape:', sample_weights.shape
            # write
            if args.save_gmm:
                scribe_gmm_filename = os.path.join(args.outputfolder, gmm_name)
                if 'bob' in args.lib:
                    scribe_gmm.save( bob.io.HDF5File(scribe_gmm_filename, 'w') )
                else:
                    with gzip.open(scribe_gmm_filename, 'wb') as f:
                        cPickle.dump(scribe_gmm, f, -1)
                pc.verboseprint('wrote', scribe_gmm_filename)

        if args.pq and args.load_pq:
            enc = prep.compress(enc, aug=args.aug)

        # save encoding
        filepath = os.path.join(args.outputfolder,
                                base + identifier +
                                ('_pq' if args.pq else '') + '.pkl.gz')
        with gzip.open(filepath, 'wb') as f:
            cPickle.dump(enc, f, -1)

        progress.update(i+1)
        if 'nothing' in args.evaluate:
            return None, None
        return enc, scribe_gmm
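
Note the load-or-compute flow: with `--load_gmm` and `--load_scores` the function reuses previously adapted GMMs and encodings instead of recomputing them. The caching idiom in isolation might look like this (a sketch; `compute` stands for the actual encoding step):

import os
import gzip
import cPickle

def load_or_compute(filepath, compute):
    # reuse a cached result if it exists, otherwise compute and persist it
    if os.path.exists(filepath):
        with gzip.open(filepath, 'rb') as f:
            return cPickle.load(f)
    result = compute()
    with gzip.open(filepath, 'wb') as f:
        cPickle.dump(result, f, -1)
    return result

# usage sketch:
# enc = load_or_compute(os.path.join(out, base + '_fv.pkl.gz'),
#                       lambda: encoder.encode(features))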
Example #4
def run(args, prep=None):
    if prep is None:
        prep = preprocess.Preprocess()

    if not args.labelfile or not args.inputfolder \
       or not args.outputfolder:
        print('WARNING: no labelfile or no inputfolder'
              ' or no outputfolder specified')

    print 'accumulate features:', args.accumulate

    if args.outputfolder and not os.path.exists(args.outputfolder):
        print "outputfolder doesn't exist -> create"
        pc.mkdir_p(args.outputfolder)

    if args.load_scores:
        print 'try to load computed encodings'


    #####
    # UBM / loading
    ubm_gmm = None
    if args.load_ubm:
        print 'load gmm from', args.load_ubm
        ubm_gmm = loadGMM(args.load_ubm, args.lib)

    #####
    # Enrollment
    # now for each feature-set adapt a gmm
    #####
    if args.labelfile is None:
        print 'WARNING: no label-file'
    if args.concat_later:
        args.concat = True
    if args.concat:
        groups = None

        if args.group_word:
            descriptor_files = pc.getFilesGrouped(args.inputfolder, args.suffix)
            labels = None
        else:
            descriptor_files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                       args.labelfile, exact=False,
                                       concat=True)
            print 'labels:', labels[0]
            if len(descriptor_files) != len(labels):
                raise ValueError('len(descriptor_files) {} != '
                                 'len(labels) {}'.format(len(descriptor_files),
                                                         len(labels)))
        print 'num descr-files of first:', len(descriptor_files[0])

    else:
        descriptor_files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                               args.labelfile)
    if args.maskfolder:
        maskfiles = pc.getMaskFiles(descriptor_files, args.suffix, args.maskfolder,
                                args.masksuffix)
    if len(descriptor_files) == 0:
        print 'no descriptor_files'
        sys.exit(1)
    if labels:
        num_scribes = len(set(labels))
    else:
        num_scribes = 'unknown'

    num_descr = len(descriptor_files)
    print 'number of classes:', num_scribes
    print 'number of descriptor_files:', num_descr
    print 'adapt training-features to create individual scribe-gmms (or load saved ones)'
    widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
               progressbar.ETA()]
    progress = progressbar.ProgressBar(widgets=widgets,
                                       maxval=len(descriptor_files))

    if 'supervector' in args.encoding:
        identifier = '_sv'
    elif 'fisher' in args.encoding:
        identifier = '_fv'
    else:
        identifier = '_' + args.encoding

    identifier += '_' + args.update
    if len(args.normalize_enc) > 0:
        identifier += '_' + '_'.join(args.normalize_enc)

    encoder = Encoding(args.encoding, ubm_gmm, parallel=False,
                       normalize=args.normalize_enc, update=args.update,
                       relevance=args.relevance, nbest=args.nbest,
                       ratio=args.ratio,
                       accumulate=args.accumulate,
                       nprocs=args.nprocs)

    if args.posteriors_dir:
        posterior_files, _ = pc.getFiles(args.posteriors_dir, args.posteriors_suffix,
                                         args.labelfile)
        print len(posterior_files), len(descriptor_files)
        assert(len(posterior_files) == len(descriptor_files))

    cp = os.path.commonprefix(descriptor_files)
    #print cp
    def encode(i):
        if isinstance(descriptor_files[i], basestring):
            fname = descriptor_files[i]
            # fall back to the full path when there is no common directory
            base = os.path.relpath(fname, cp) if os.path.isdir(cp) else fname

            if fname.endswith('.pkl.gz'):
                base = base.replace('.pkl.gz','')
            else:
                base = os.path.splitext(base)[0]

            if os.path.isdir(cp):
                folder = os.path.join(args.outputfolder,
                    os.path.dirname(base))
                # print 'should create: {} + {}'.format(args.outputfolder, base)
                pc.mkdir_p(folder,silent=True)
        else:
            base = os.path.basename(os.path.commonprefix(descriptor_files[i]))

        gmm_name = base + ('_gmm_bob.hdf5' if 'bob' in args.lib else '_gmm.pkl.gz')
        gmm = ubm_gmm

        scribe_gmm = None
        # load gmm if possible
        if args.load_gmm:
            gmm_file = os.path.join(args.load_gmm, gmm_name)
            scribe_gmm = load_gmm(gmm_file, args.lib)

        # load encoding
        if args.load_scores:
            if args.load_scores == 'outputfolder':
                load_f = args.outputfolder
            else:
                load_f = args.load_scores

            filepath = os.path.join(load_f, base + identifier + '.pkl.gz')
            if os.path.exists(filepath):
                with gzip.open(filepath, 'rb') as f:
                    enc = cPickle.load(f)
                    return enc, None
#            else:
#                print ('WARNING: encoding {} doesnt exist, compute'
#                        'it'.format(filepath ))


        if args.concat_later:
            enc = []
            for k in range(len(descriptor_files[i])):
                # load data and preprocess
                features = pc.loadDescriptors(descriptor_files[i][k],
                                              min_descs_per_file=args.min_descs,
                                              show_progress=not args.concat)
                if features is None:
                    print 'features==None'
                    continue
                features = prep.transform(features)

                enc_ = encoder.encode(features)
                enc.append(enc_)
            enc = np.concatenate(enc, axis=0)

        else:
            # load data and preprocess
            features = pc.loadDescriptors(descriptor_files[i],
                                          min_descs_per_file=args.min_descs,
                                          show_progress=not args.concat)
            posteriors = None
            if args.posteriors_dir:
                posteriors = pc.loadDescriptors( posterior_files[i] )
                assert(len(posteriors) == len(features))
            if not isinstance(features, np.ndarray) and not features:
                print 'features==None?'
                progress.update(i+1)
                return 0.0, None

            if i == 0:
                print '0-shape:',features.shape
            features = prep.transform(features)
            if i == 0:
                print '0-shape (possibly after pca):',features.shape

            if args.maskfolder:
                sample_weights = pc.loadDescriptors(maskfiles[i])
            else:
                sample_weights = None
            enc, scribe_gmm = encoder.encode(features, return_gmm=True,
                                             sample_weights=sample_weights,
                                             posteriors=posteriors,
                                             verbose=(i == 0))
            if i == 0:
                print '0-enc-shape', enc.shape
                if isinstance(sample_weights, np.ndarray):
                    print 'sample-weights shape:', sample_weights.shape
            # write
            if args.save_gmm:
                scribe_gmm_filename = os.path.join(args.outputfolder, gmm_name)
                if 'bob' in args.lib:
                    scribe_gmm.save( bob.io.HDF5File(scribe_gmm_filename, 'w') )
                else:
                    with gzip.open(scribe_gmm_filename, 'wb') as f:
                        cPickle.dump(scribe_gmm, f, -1)
                pc.verboseprint('wrote', scribe_gmm_filename)

        if args.pq and args.load_pq:
            enc = prep.compress(enc, aug=args.aug)

        # save encoding
        filepath = os.path.join(args.outputfolder,
                                base + identifier +
                                ('_pq' if args.pq else '') + '.pkl.gz')
        with gzip.open(filepath, 'wb') as f:
            cPickle.dump(enc, f, -1)

        progress.update(i+1)
        if 'nothing' in args.evaluate:
            return None, None
        return enc, scribe_gmm

    progress.start()
    if args.parallel:
        all_enc, all_gmms = zip( *pc.parmap( encode, range(num_descr),
                                            args.nprocs, size=num_descr) )
    else:
        all_enc, all_gmms = zip( *map( encode, range(num_descr) ) )
    progress.finish()
    if 'nothing' in args.evaluate:
        print 'nothing to evaluate, exit now'
        return

    print 'got {} encodings'.format(len(all_enc))

    all_enc = np.concatenate(all_enc, axis=0) #.astype(np.float32)

    print 'all_enc.shape', all_enc.shape

    print 'Evaluation:'

    stats = None
    ret_matrix = None

    for eval_method in args.evaluate:

        ret_matrix, stats = evaluate.runNN( all_enc, labels, distance=True, histogram=False,
                                               eval_method=eval_method,
                                               parallel=args.parallel,
                                               nprocs=args.nprocs)

        if ret_matrix is None or not isinstance(ret_matrix,np.ndarray):
            print 'WARNING: ret_matrix is None or not instance of np.ndarray'
        else:
            fpath = os.path.join(args.outputfolder, 'dist' + identifier
                                 + '_' + eval_method + '.csv')
            np.savetxt(fpath, ret_matrix, delimiter=',')
            print 'saved', fpath
    return stats
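
`pc.parmap` is not shown in this excerpt; here is a minimal stand-in under the assumption that it behaves like an order-preserving parallel map (the real helper also drives the progress bar via its `size` argument, and must work around the fact that a nested closure like `encode` is not picklable by a stock multiprocessing pool):

import multiprocessing

def parmap(func, iterable, nprocs):
    # order-preserving parallel map; func and items must be picklable
    pool = multiprocessing.Pool(processes=nprocs)
    try:
        return pool.map(func, iterable)
    finally:
        pool.close()
        pool.join()

# all_enc, all_gmms = zip(*parmap(encode, range(num_descr), args.nprocs))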
Example #5
def runHelper(prep, args):

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                labelfile=args.labelfile, exact=args.exact,
                                inputfolders_suffix=args.inputfolders_suffix,
                                max_files=args.max_files)
    print 'process {} files'.format(len(files))
    widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
               progressbar.ETA()]


    if args.load_all_features:
        cur_data, index_list = pc.loadDescriptors(files,
                                                  max_descs=args.max_descriptors[0]\
                                                  if args.max_descriptors\
                                                  else 0,
                                                  return_index_list=True)

        # per descriptor labels:
        if len(index_list)-1 != len(labels):
            raise ValueError('{} != {} + 1'.format(len(index_list),
                                                   len(labels)))
        le = preprocessing.LabelEncoder()
        labels = le.fit_transform(labels)
        desc_labels = np.zeros( len(cur_data), dtype=np.uint32)
        for r in xrange(len(labels)):
            desc_labels[index_list[r]:index_list[r+1]] = labels[r]

        print 'loaded all', cur_data.shape
        if 'transform' in args.mode and args.mode != 'fit_transform':
            print 'first feature before:', cur_data[0]
            print 'dimension before:', cur_data.shape[1], cur_data.dtype
            cur_data = prep.transform(cur_data)
            print 'first feature after:', cur_data[0]
            print 'dimension after:', cur_data.shape[1], cur_data.dtype

        if 'fit' in args.mode:
            if 'transform' in args.mode and args.strip_aug:
                prep.strip_aug = False
            prep.fit(cur_data, labels=desc_labels)

            if args.mode == 'fit_transform':
                cur_data = prep.transform(cur_data)

    else:
        progress = progressbar.ProgressBar(widgets=widgets,
                                           maxval=len(files))

        if any(isinstance(f, tuple) for f in files):
            files1 = list(zip(*files)[0])
            cp = os.path.commonprefix(files1)
        else:
            cp = os.path.commonprefix(files)

        def proj(i):
            # n_samples x n_features
            if (not isinstance(args.inputfolder, basestring) and
                    len(args.inputfolder) > 1) or args.inputfolders_suffix != '':
                cur_data = pc.loadMultipleDescriptors(files[i])
                if i == 0:
                    print 'loaded descs of', files[i]
                    print 'shape:', cur_data.shape
            else:
                cur_data = pc.loadDescriptors(files[i])

            if args.mode == 'fit':
                prep.partial_fit(cur_data)
                progress.update(i+1)
                return

            else:
                if i == 0:
                    print 'before:'
                    print cur_data[0]
                    print cur_data.shape, cur_data.dtype

                cur_data = prep.transform(cur_data)

                if i == 0:
                    print 'after:'
                    print cur_data[0,0:min(128,cur_data.shape[1])]
                    print cur_data.shape, cur_data.dtype

            fname = files[i] if isinstance(files[i], basestring)\
                    else files[i][0]

            if os.path.isdir(cp):
                fname = os.path.relpath(fname, cp)

            if fname.endswith('.pkl.gz'):
                name = fname.replace('.pkl.gz','')
            else:
                name = os.path.splitext(fname)[0]

            if os.path.isdir(cp):
                pc.mkdir_p(os.path.join(args.outputfolder,
                    os.path.dirname(name)), silent=True)

            name = os.path.join(args.outputfolder, name + '_pr.pkl.gz')
#            print fname, '-->', name
            with gzip.open(name, 'wb') as F:
                cPickle.dump(cur_data, F, -1)
            progress.update(i+1)

        progress.start()
        # FIXME: np.dot (e.g. used for (R)PCA) doesnt work in parallel atm
#        if args.parallel:
#            pc.parmap(proj, range(len(files)), args.nprocs)
#        else:
        map(proj, range(len(files)))
        progress.finish()

    prep.save_trafos(args.outputfolder)
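
The `index_list` returned by `loadDescriptors(..., return_index_list=True)` marks where each file's descriptors start and end, which is what turns the per-file labels into per-descriptor labels above. The same expansion on toy data:

import numpy as np

index_list = [0, 3, 5, 9]      # file r owns rows index_list[r]:index_list[r+1]
labels = np.array([0, 1, 0])   # one encoded label per file

desc_labels = np.zeros(index_list[-1], dtype=np.uint32)
for r in xrange(len(labels)):
    desc_labels[index_list[r]:index_list[r + 1]] = labels[r]

print desc_labels              # [0 0 0 1 1 0 0 0 0]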
Example #6
                        'part, not the kmeans-initialization part)')
    parser.add_argument('--update', default='wmc',
                        help='what to update w. GMM, w:weights, m:means, c:covars')
    parser.add_argument('--covar_type', default='diag',
                        choices=['full','diag'],
                        help='covariance type for gmm')
    return parser

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Clustering - Create vocabulary")
    parser = pc.commonArguments(parser)
    parser = parserArguments(parser)
    args = parser.parse_args()

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files,_ = pc.getFiles(args.inputfolder, args.suffix, args.labelfile,
                        exact=True)
    if not files:
        print 'getFiles() returned no images'
        sys.exit(1)

    # load features to train a universal background gmm
    print 'load features for training ubm from {} files'.format(len(files))

    descriptors = pc.loadDescriptors(files,
                                     max_descs=args.max_descriptors[0],
                                     max_descs_per_file=max(int(args.max_descriptors[0]/len(files)), 1),
                                     rand=True,
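
`loadDescriptors` with `rand=True` and `max_descs_per_file` apparently caps how many descriptors each file contributes, drawn at random. A sketch of that subsampling for a single already-loaded array (an assumption about the helper's behavior; the per-file cap mirrors the computation in the call above):

import numpy as np

def subsample(descs, max_per_file):
    # keep at most max_per_file randomly chosen rows of an n x d array
    if len(descs) <= max_per_file:
        return descs
    keep = np.random.choice(len(descs), size=max_per_file, replace=False)
    return descs[keep]

n_files, budget = 200, 100000                  # hypothetical numbers
cap = max(int(budget / n_files), 1)            # as for max_descs_per_file above
print subsample(np.random.rand(1000, 64), cap).shape   # (500, 64)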
Example #7
def run(args, prep=None):
    if prep is None:
        prep = preprocess.Preprocess()
    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, labels = pc.getFiles(args.inputfolder,
                                args.suffix,
                                args.labelfile,
                                exact=args.exact,
                                max_files=args.max_files)
    if files is None or len(files) == 0:
        print 'getFiles() returned no images'
        sys.exit(1)

    maskfiles = pc.getMaskFiles(files, args.suffix, args.maskfolder,
                                args.masksuffix)
    if len(args.max_descriptors) == 0:
        descriptors, rand_indices = pc.loadDescriptors(
            files, rand=True, return_random_indices=True)
    else:
        max_descs_per_file = int(args.max_descriptors[0] / float(len(files)))
        max_descs_per_file = max(max_descs_per_file, 1)
        descriptors, rand_indices = pc.loadDescriptors(files,
                                                       max_descs=args.max_descriptors[0],
                                                       max_descs_per_file=max_descs_per_file,
                                                       rand=True,
                                                       maskfiles=maskfiles,
                                                       return_random_indices=True)

    print 'got {} features'.format(len(descriptors))
    print 'features.shape', descriptors.shape

    # load features to train a universal background gmm
    print 'load features for training ubm from {} files'.format(len(files))

    if args.method == 'posteriors':
        posteriors_files, _ = pc.getFiles(args.posteriors_dir,
                                          args.posteriors_suffix,
                                          labelfile=args.labelfile,
                                          exact=args.exact,
                                          max_files=args.max_files)
        assert (len(posteriors_files) == len(files))
        indices = []

        widgets = [
            progressbar.Percentage(), ' ',
            progressbar.Bar(), ' ',
            progressbar.ETA()
        ]
        progress = progressbar.ProgressBar(widgets=widgets,
                                           maxval=len(posteriors_files))
        progress.start()
        for e, f in enumerate(posteriors_files):
            posteriors = pc.loadDescriptors(f)
            posteriors = posteriors[rand_indices[e]]
            cluster_idx = posteriors.argmax(axis=1)
            indices.append(cluster_idx)
            progress.update(e + 1)
        progress.finish()

        indices = np.concatenate(indices)
        assert (len(indices) == len(descriptors))
        means = recomputeMeans(descriptors, indices)
        vocabulary = cluster.KMeans(means.shape[0])  # dummy
        vocabulary.means_ = means
        vocabulary.type_ = 'kmeans'
    else:
        vocabulary = computeVocabulary(descriptors, args.method,
                                       args.num_clusters, args.iterations,
                                       args.gmm_update, args.lib,
                                       args.covar_type, args.nprocs)

    # TODO: rewrite to be more generic
    if 'sparse' in args.method and 'gmm' in args.method:
        gmm = mixture.GMM(args.num_clusters,
                          n_iter=args.iterations,
                          params=args.gmm_update,
                          init_params='wc')
        gmm.means_ = vocabulary.reshape(args.num_clusters, -1)
        gmm.fit(descriptors)
        vocabulary = gmm

    if args.predict:
        pred = vocabulary.predict(descriptors)
        pred_prob = None
        if hasattr(vocabulary, 'predict_proba'):
            pred_prob = vocabulary.predict_proba(descriptors)
        for i, f in enumerate(files):
            if pred_prob is not None:
                print '{}\t[{}], ([{}])'.format(os.path.basename(f), pred[i],
                                                pred_prob[i])
            else:
                print '{}\t[{}]'.format(os.path.basename(f), pred[i])

    # save gmm
    voc_filepath = os.path.join(
        args.outputfolder,
        (args.vocabulary_filename
         if args.vocabulary_filename is not None else args.method) + '.pkl.gz')
    with gzip.open(voc_filepath, 'wb') as f:
        cPickle.dump(vocabulary, f, -1)
    print 'saved vocabulary at', voc_filepath

    if args.method == 'gmm':
        aic = vocabulary.aic(descriptors)
        print 'aic:', aic
        with open(os.path.join(args.outputfolder, 'aic.txt'), 'a') as f:
            f.write('{}\n'.format(aic))

    return os.path.abspath(voc_filepath)
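
`recomputeMeans` is not included in this excerpt; given the hard assignments in `indices` (the argmax over the posteriors), it plausibly averages the descriptors assigned to each cluster. A sketch under that assumption (the lowercase name marks it as a stand-in, not the project's actual helper):

import numpy as np

def recompute_means(descriptors, indices):
    # mean of the descriptors hard-assigned to each cluster index
    n_clusters = int(indices.max()) + 1
    means = np.zeros((n_clusters, descriptors.shape[1]))
    for k in xrange(n_clusters):
        members = descriptors[indices == k]
        if len(members):
            means[k] = members.mean(axis=0)
    return means

# toy check: two separated blobs recover their centers
descs = np.vstack([np.zeros((5, 2)), np.ones((5, 2))])
assign = np.array([0] * 5 + [1] * 5)
print recompute_means(descs, assign)   # [[0. 0.], [1. 1.]]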