Example #1
 def report(self, processed, total):
     if processed % self.step != 0:
         return
     elapsed = self.timer.total()
     if processed == 0:
         eta = 0.0
     else:
         eta = elapsed * (total - processed) / processed
     if self.rootOnly:
         mpi.rootprint('{} {}/{}, elapsed {}, eta {}.'.format(self.header, processed, total, hms(elapsed), hms(eta)))
     else:
         mpi.nodeprint('{} {}/{}, elapsed {}, eta {}.'.format(self.header, processed, total, hms(elapsed), hms(eta)))
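Every example here prints through `mpi.rootprint` and `mpi.nodeprint`. The original mpi module is not shown; below is a minimal sketch of what these helpers presumably do under mpi4py (a hypothetical reconstruction, not the original code):

from mpi4py import MPI

_COMM = MPI.COMM_WORLD

def rootprint(message):
    # print only on the root node (rank 0)
    if _COMM.Get_rank() == 0:
        print(message)

def nodeprint(message):
    # print on every node, tagged with its rank
    print('[rank {}] {}'.format(_COMM.Get_rank(), message))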
Example #2
 def normalize_data(self, m=None, std=None, sabotage=False):
     if self.normalized:
         mpi.rootprint('Warning: you are re-normalizing.')
     if m is None or std is None:
         # if either is None, recompute the statistics
         for i in range(self.nCodeLocal):
             for j in range(self.nMetabins):
                 self.compute_feature(i, j, normalize=False)
                 self.mLocal[i,j] = np.mean(self.featBuffer)
                 self.stdLocal[i,j] = np.std(self.featBuffer)+1e-8
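                 # sabotage forces m=0 and std=1, so normalization becomes a no-op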
                 if sabotage:
                     self.mLocal[i,j] *= 0.
                     self.stdLocal[i,j] *= 0.
                     self.stdLocal[i,j] += 1.
     else:
         self.mLocal[:] = m
         self.stdLocal[:] = std
     self.normalized = True
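The statistics gathered above drive a per-feature standardization (presumably applied inside `compute_feature` when `normalize=True`). A minimal sketch of the implied transform, with `featBuffer` stood in by a random vector:

import numpy as np

feat = np.random.rand(1000)              # stand-in for self.featBuffer
m, std = np.mean(feat), np.std(feat) + 1e-8
normalized = (feat - m) / std            # zero mean, unit variance
# with sabotage=True the stats become m=0, std=1 and this is the identity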
Example #3
parser.add_argument('-l',
                    '--read_local_cache',
                    type=int,
                    default=0,
                    help='whether to read local cache or not')
parser.add_argument('-c',
                    '--nClass',
                    type=int,
                    default=10,
                    help='number of classes')
parser.add_argument('-t',
                    '--random_iterations',
                    type=int,
                    default=1,
                    help='number of random iterations')
parser.add_argument('-s',
                    '--skip_normalization',
                    action='store_true')
mpi.rootprint(str(sys.argv))
args = parser.parse_args(sys.argv[1:])

# cifar specifications
data_file = 'cifar_tr_{}_{}.mat'
label_file = 'tr_label.mat'
test_data_file = 'cifar_te_{}_{}.mat'
test_label_file = 'te_label.mat'
nTraining = 50000
nTesting = 10000

grafter = grafting_mb.GrafterMPI()
tester = grafting_mb.GrafterMPI()

grafter.init_specs(nTraining, args.nBinsPerEdge, args.nCodes, args.nClass,
                   args.maxSelFeat, args.gamma, np.float64)
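The `{}` placeholders in `data_file` and `test_data_file` are filled with the batch size and the batch id when the batches are loaded (see `load_data_batch` in Example #8):

# e.g. 'cifar_tr_{}_{}.mat'.format(1000, 0) -> 'cifar_tr_1000_0.mat'
filename = data_file.format(args.batch_size, 0)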
Example #4
rank = MPI.COMM_WORLD.Get_rank()

parser = argparse.ArgumentParser(description="Script to test cifar with existing trained dump.",
                                 epilog="Yangqing Jia at NECLA, 2011")
parser.add_argument('-r', '--data_root', default='.', help='the dataset path')
parser.add_argument('-n', '--nBinsPerEdge', type=int, default=0, help='the number of bins per edge')
parser.add_argument('-d', '--nCodes', type=int, default=0, help='the number of codes')
parser.add_argument('-b', '--batch_size', type=int, default=1000, help='the batch size that the data is stored')
parser.add_argument('-m', '--maxSelFeat', type=int, default=6400, help='max number of selected features')
parser.add_argument('-g', '--gamma', type=float, default=0.01, help='regularization term for classification')
parser.add_argument('-e', '--local_cache_root', default=None, help='local cache root')
parser.add_argument('-l', '--read_local_cache', type=int, default=0, help='whether to read local cache or not')
parser.add_argument('-c', '--nClass', type=int, default=10, help='number of classes')
parser.add_argument('-t', '--random_iterations', type=int, default=1, help='number of random iterations')
parser.add_argument('-s', '--skip_normalization', action='store_true')
mpi.rootprint(str(sys.argv))
args = parser.parse_args(sys.argv[1:])

# cifar specifications
data_file = 'cifar_tr_{}_{}.mat'
label_file = 'tr_label.mat'
test_data_file = 'cifar_te_{}_{}.mat'
test_label_file = 'te_label.mat'
nTraining = 50000
nTesting = 10000

grafter = grafting_mb.GrafterMPI()
tester = grafting_mb.GrafterMPI()

grafter.init_specs(nTraining, args.nBinsPerEdge, args.nCodes, args.nClass, args.maxSelFeat, args.gamma, np.float64)
tester.init_specs(nTesting, args.nBinsPerEdge, args.nCodes, args.nClass, args.maxSelFeat, args.gamma, np.float64)
Example #5
    def graft(self, dump_every=0,
              dump_file=None,
              nActiveSet=None,
              tester=None,
              test_every=10,
              samplePerRun=1,
              fromDumpFile=None):
        '''
        the main grafting algorithm
        ==Parameters==
        dump_every: the frequency to dump the current result. 0 if you do not want to dump
        dump_file: dump file name.
        nActiveSet: when retraining, the number of features in the active set.
            pass None for full retraining (may be slow!)
            pass a positive number to select the last features
            pass a negative number to select features via their gradient values
                (recommended, much better than other approaches)
            pass 0 for boosting
        tester: the grafterMPI class that hosts the test data
        test_every: the frequency to compute test accuracy
        samplePerRun: in each feature selection run, how many features (in proportions)
            we should sample to select feature from. Pass 1 to enumerate all features.
        fromDumpFile: restore from dump file (not implemented for the mb version yet)
        '''
        self.comm.barrier()
        mpi.rootprint('*'*38)
        mpi.rootprint('*'*15+'grafting'+'*'*15)
        mpi.rootprint('*'*38)

        mpi.rootprint('Number of data: {}'.format(self.nData))
        mpi.rootprint('Number of labels: {}'.format(self.nLabel))
        mpi.rootprint('Number of codes: {}'.format(self.nCodes))
        mpi.rootprint('Bins: {0}x{0}'.format(self.nBinsPerEdge))
        mpi.rootprint('Total pooling areas: {}'.format(self.nMetabins))
        mpi.rootprint('Total features: {}'.format(self.nMetabins*self.nCodes))
        mpi.rootprint('Number of features to select: {}'.format(self.maxGraftDim))
        mpi.rootprint('Graft Settings:')
        mpi.rootprint('dump_every = {}\nnActiveSet={}\ntest_every={}\nsamplePerRun={}'.format(
                            dump_every, nActiveSet, test_every, samplePerRun))
        self.comm.barrier()

        if tester is not None:
            # normalize the test data with the stats of the training data
            tester.normalize_data(self.mLocal, self.stdLocal)
        if fromDumpFile is not None:
            self.restore_from_dump_file(fromDumpFile, tester)

        old_loss = 1e10
        timer = Timer()
        itertimer = Timer()
        for T in range(self.nSelFeats, self.maxGraftDim):
            itertimer.reset()
            mpi.rootprint('*'*15+'Round {}'.format(T)+'*'*15)
            score, codeid, metabinid = self.select_new_feature_by_grad(samplePerRun)
            mpi.rootprint('Selected Feature [code: {}, metabin: {}], score {}'.format(codeid, metabinid, score))
            # add this feature to the selected features
            self.append_feature(codeid, metabinid)
            mpi.rootprint('Number of Features: {}'.format(self.nSelFeats))
            mpi.rootprint('Feature selection took {} secs'.format(itertimer.lap()))
            mpi.rootprint('Retraining the model...')
            loss = self.retrain_model(nActiveSet, samplePerRun)
            mpi.rootprint('Total loss reduction {}/{}={}'.format(loss, old_loss, loss/old_loss))
            mpi.rootprint('Current training accuracy: {}'.format(self.compute_current_accuracy()))
            mpi.rootprint('Model retraining took {} secs'.format(itertimer.lap()))
            old_loss = loss

            if tester is not None:
                tester.append_feature(codeid, metabinid)
                if (T+1) % test_every == 0:
                    # print test accuracy
                    test_accuracy = tester.compute_test_accuracy(self.weights, self.b)
                    mpi.rootprint('Current Testing accuracy: {}'.format(test_accuracy))

            self.safebarrier()
            mpi.rootprint('This round took {} secs, total {} secs'.format(timer.lap(), timer.total()))
            mpi.rootprint('ETA {} secs.'.format(timer.total() * (self.maxGraftDim-T)/(T+1.0e-5)))

            if dump_every > 0 and (T+1) % dump_every == 0 and dump_file is not None:
                mpi.rootprint('*'*15 + 'Dumping' + '*'*15)
                self.dump_current_state(dump_file + str(T)+'.mat')

        mpi.rootprint('*'*15+'Finalizing'+'*'*15)
        if dump_file is not None:
            self.dump_current_state(dump_file + 'final.mat')
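A hedged driver sketch for this method; the parameter values below are illustrative only, grounded in nothing more than the signature and docstring above:

grafter.graft(dump_every=10,
              dump_file='cifar_graft_dump_',
              nActiveSet=-50,        # negative: pick the active set by gradient
              tester=tester,
              test_every=10,
              samplePerRun=1)        # 1 = enumerate all candidate features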
Example #6
    def randomselecttest(self, tester=None, random_iterations=1, should_normalize=True):
        '''
        test the performance of random selection
        modified by Ian Goodfellow to use seeded random number generation so
        that results are replicable
        '''
        self.comm.barrier()
        mpi.rootprint('*'*46)
        mpi.rootprint('*'*15+'random selection'+'*'*15)
        mpi.rootprint('*'*46)

        trainaccu = np.zeros(random_iterations)
        testaccu = np.zeros(random_iterations)

        rng = np.random.RandomState([1,2,3])

        if tester is not None:
            # normalize the test data with the stats of the training data
            tester.normalize_data(self.mLocal, self.stdLocal, sabotage=not should_normalize)

        itertimer = Timer()
        for it in range(random_iterations):
            itertimer.reset()
            mpi.rootprint('*'*15+'Round {}'.format(it)+'*'*15)
            if self.rank == 0:
                # decide which features we are going to select
                allidx = np.arange(self.nCodes*self.nMetabins, dtype=int)
                rng.shuffle(allidx)
                codeidlist = allidx // self.nMetabins
                metabinidlist = allidx % self.nMetabins
            else:
                codeidlist = None
                metabinidlist = None
            codeidlist = self.comm.bcast(codeidlist, root=0)
            metabinidlist = self.comm.bcast(metabinidlist, root=0)

            self.append_multiple_features(codeidlist[:self.maxGraftDim], metabinidlist[:self.maxGraftDim])
            mpi.rootprint('Feature selection took {} secs'.format(itertimer.lap()))
            mpi.rootprint('Training...')
            loss = self.retrain_model(None)
            trainaccu[it] = self.compute_current_accuracy()
            mpi.rootprint('Training took {} secs'.format(itertimer.lap()))
            mpi.rootprint('Current training accuracy: {}'.format(trainaccu[it]))
            if tester is not None:
                tester.append_multiple_features(codeidlist[:self.maxGraftDim], metabinidlist[:self.maxGraftDim])
                testaccu[it] = tester.compute_test_accuracy(self.weights, self.b)
                mpi.rootprint('Current Testing accuracy: {}'.format(testaccu[it]))
            mpi.rootprint('Testing selection took {} secs'.format(itertimer.lap()))
        self.safebarrier()

        mpi.rootprint('*'*15+'Summary'+'*'*15)
        mpi.rootprint('Training accuracy: {} +- {}'.format(np.mean(trainaccu),np.std(trainaccu)))
        mpi.rootprint('Testing accuracy: {} +- {}'.format(np.mean(testaccu),np.std(testaccu)))
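The shuffled flat indices are split into (code, metabin) pairs by integer division and modulus; `divmod` does both at once, as this standalone sketch (illustrative sizes) shows:

import numpy as np

nCodes, nMetabins = 5, 16                      # illustrative values
allidx = np.random.permutation(nCodes * nMetabins)
codeidlist, metabinidlist = divmod(allidx, nMetabins)
# identical to allidx // nMetabins and allidx % nMetabins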
Example #7
    def train_whole_model(self, tester=None):
        '''
        test the performance using all of the features;
        may be memory-consuming.
        '''
        self.comm.barrier()
        mpi.rootprint('*'*46)
        mpi.rootprint('*'*15+'whole featureset'+'*'*15)
        mpi.rootprint('*'*46)

        if tester is not None:
            # normalize the test data with the stats of the training data
            tester.normalize_data(self.mLocal, self.stdLocal)

        timer = Timer()
        timer.reset()
        if self.maxGraftDim != self.nMetabins*self.nCodes:
            mpi.rootprint('Please initialize with maxGraftDim=nMetabins*nCodes')
            return
        self.nSelFeats = 0
        self.isSelected[:] = False
        mpi.rootprint('Generating Features...')
        for code in range(self.nCodes):
            for metabin in range(self.nMetabins):
                self.append_feature(code, metabin)
                if tester is not None:
                    tester.append_feature(code, metabin)
        mpi.rootprint('Feature generation took {} secs'.format(timer.lap()))
        mpi.rootprint('Training...')
        loss = self.retrain_model(None)
        mpi.rootprint('Training took {} secs'.format(timer.lap()))
        mpi.rootprint('Training accuracy: {}'.format(self.compute_current_accuracy()))
        if tester is not None:
            mpi.rootprint('Current Testing accuracy: {}'.format(tester.compute_test_accuracy(self.weights, self.b)))
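A hedged usage sketch, assuming `grafter` and `tester` were set up as in Example #4 and initialized with `maxGraftDim` equal to the full feature count, as the guard above requires:

# illustrative call; needs maxGraftDim == nMetabins*nCodes at init time
grafter.train_whole_model(tester=tester)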
Example #8
 def load_data_batch(self, root, batch_size, file_template, labelfile,
                     rootRead=True, isTest=False,
                     local_cache_root=None, read_local_cache=False, should_normalize=True):
     '''
     load the data in batches. file_template should be 'filename_{}_{}.mat'
     where the batch size and batch id will be filled. The mat file will
     contain a variable called 'feat'. labelfile is the file for labels
     starting from either 0 or 1 (our code converts the labels to 0 ~ nLabel-1).
     '''
     from scipy import io
     nBatches = int(np.ceil(float(self.nData) / batch_size))
     # handle both cases: batch ids may start at 0 or 1
     if os.path.exists(os.path.join(root,file_template.format(batch_size, 0))):
         allrange = range(nBatches)
     else:
         allrange = range(1,nBatches+1)
     if local_cache_root is not None and not os.path.exists(local_cache_root):
         try:
             os.makedirs(local_cache_root)
         except OSError:
             mpi.nodeprint('Warning: could not create the local cache directory.')
     if read_local_cache and local_cache_root is not None:
         # load from local cache
         sid = 0
         for bid in allrange:
             mpi.rootprint('From Local Cache: Loading batch {} of {}'.format(bid, nBatches))
             filename = os.path.join(local_cache_root, file_template.format(batch_size, bid))
             matdata = io.loadmat(filename)
             batchNdata = matdata['feat'].shape[2]
             self.featSlice[:,:,sid:sid+batchNdata] = matdata['feat']
             sid += batchNdata
     elif rootRead:
         # root reads the file, and then propagates the values to other machines
         dataid = 0 # current feature id
         dataBuffer = np.zeros(self.nBaseFeat, dtype=self.dtype)
         timer = Timer()
         for bid in allrange:
             mpi.rootprint('RootRead: Loading batch {} of {}'.format(bid, nBatches))
             if self.rank == 0:
                 # read only if I am root
                 filename = os.path.join(root, file_template.format(batch_size, bid))
                 print(filename)
                 matdata = io.loadmat(filename)
                 feat = matdata['feat']
                 batchNdata = feat.shape[0]
             else:
                 feat = None
                 batchNdata = 0
             # broadcast the features
             # it seems that doing this one-datum-by-one-datum is the fastest...
             batchNdata = self.comm.bcast(batchNdata, root=0)
             for batchfeatid in range(batchNdata):
                 if self.rank == 0:
                     dataBuffer[:] = feat[batchfeatid]
                 self.comm.Bcast(dataBuffer, root = 0)
                 # the data storage is like
                 # [bin1_code1 bin1_code2 ... bin1_codeK bin2_code1 ... binN_codeK]
                 # while our data is [nCodeLocal, nBins, nData]
                 self.featSlice[:,:,dataid] = \
                     dataBuffer.reshape(self.nBins, self.nCodes)[:,self.codeRange[0]:self.codeRange[1]].T
                 dataid += 1
             if local_cache_root is not None:
                 # write local cache, so we may read it back later
                 filename = os.path.join(local_cache_root, file_template.format(batch_size, bid))
                 try:
                     io.savemat(filename, {'feat': self.featSlice[:, :, dataid-batchNdata:dataid]}, oned_as='row')
                 except Exception as e:
                     mpi.nodeprint('Unable to save to local cache {}: {}'.format(filename, e))
             mpi.rootprint('Elapsed {} secs.'.format(timer.lap()))
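Note the two broadcast flavors above: lowercase `comm.bcast` pickles an arbitrary Python object (used for the scalar batch count), while uppercase `comm.Bcast` fills a pre-allocated numpy buffer in place (used for the feature rows). A minimal standalone mpi4py sketch:

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
# object broadcast: fine for scalars and small picklables
n = comm.bcast(42 if comm.Get_rank() == 0 else None, root=0)
# buffer broadcast: no pickling, but every rank must pre-allocate
buf = np.zeros(n)
if comm.Get_rank() == 0:
    buf[:] = np.arange(n)
comm.Bcast(buf, root=0)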
Example #9
 def init_specs(self, nData, nBinsPerEdge, nCodes, nLabel, maxGraftDim, gamma, dtype,
                metabinGenerator=bd.rectangularBins):
     '''
     Initialize the specs. Specifically, the raw data (for the base bins) is
     a nBinsPerEdge^2 * nCodes * nData cube, and each node will host a subset
     of the codes (all bins for any single code will be hosted on the same node).
     ==Parameters==
     nData: number of data points.
     nBinsPerEdge: number of base bins per edge. For example, for 4x4 base bins,
         pass 4.
     nCodes: the codebook size.
     nLabel: number of labels.
     maxGraftDim: the maximum number of features to select.
     gamma: regularizer for the classifier.
     dtype: data type. Only np.float64 is supported for now, as some of our
         C code has a double-precision version only.
     metabinGenerator: the function to generate metabins. See bindef.py
     '''
     # determine feature range and data range
     if nData < self.size or nCodes < self.size:
         print('Seriously? Is the problem really large scale?')
         # I know it's unethical, but whatever
         exit()
     self.nData = nData
     self.nCodes = nCodes
     self.nBinsPerEdge = nBinsPerEdge
     self.nBins = nBinsPerEdge*nBinsPerEdge
     self.nBaseFeat = self.nCodes*self.nBins
     self.metabins = metabinGenerator(nBinsPerEdge)
     self.nMetabins = self.metabins.shape[0]
     self.nLabel = nLabel
     if maxGraftDim > self.nMetabins*self.nCodes:
         mpi.rootprint('Warning: maxGraftDim should be no more than the number of available features.')
         maxGraftDim = self.nMetabins*self.nCodes
     self.maxGraftDim = maxGraftDim
     self.gamma = gamma
     self.dtype = dtype
     self.ncode_per_node = int(np.ceil(float(nCodes) / self.size))
     self.codeRange = [self.ncode_per_node*self.rank, min(self.ncode_per_node*(self.rank+1), nCodes)]
     self.nCodeLocal = int(self.codeRange[1] - self.codeRange[0])
     self.mLocal = np.zeros((self.nCodeLocal, self.nMetabins), dtype=self.dtype)
     self.stdLocal = np.zeros((self.nCodeLocal, self.nMetabins), dtype=self.dtype)
     self.normalized = False
     # pre-allocate data space
     self.featSlice = np.zeros([self.nCodeLocal, self.nBins, self.nData], dtype=self.dtype)
     self.labels = -np.ones([self.nLabel, self.nData], dtype=self.dtype)
     self.rawlabels = np.zeros(self.nData, dtype=int)
     # pre-allocate selected features cache
     if self.rank < self.nLabel:
         self.dataSel = np.zeros([self.maxGraftDim, self.nData], dtype=self.dtype) # selected features
     else:
         self.dataSel = None
     # pre-allocate classifier parameters
     self.weights = np.zeros([self.nLabel, self.maxGraftDim], dtype=self.dtype) # weights
     self.b = np.zeros(self.nLabel, dtype=self.dtype) # bias
     self.curr_wxb = np.zeros([self.nLabel,self.nData], dtype=self.dtype) # current prediction
     # pre-allocate feature selection statistics
     self.nSelFeats = 0 # number of selected features
     self.selCodeID = np.zeros(self.maxGraftDim, dtype=int)
     self.selMetabinID = np.zeros(self.maxGraftDim, dtype=int)
     self.isSelected = np.zeros((self.nCodes, self.nMetabins), dtype=bool) # boolean mask marking whether a feature is selected
     # pre-allocate mpi buffer here
     self.featBuffer = np.zeros(self.nData, dtype=self.dtype)
     self.featBufferPerCode = np.zeros((self.nMetabins, self.nData), dtype=self.dtype)
     # other buffers
     self.localGradMat = np.zeros((self.nCodeLocal, self.nMetabins, self.nLabel), dtype=self.dtype)
     self.scoreVec = np.zeros((self.nCodeLocal, self.nMetabins),dtype=self.dtype) # the buffer to store local gradients for feature selection
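The dominant allocation above is `featSlice`; a quick back-of-the-envelope for its per-node footprint (numbers illustrative, float64 assumed at 8 bytes):

nCodeLocal, nBins, nData = 100, 16, 50000    # illustrative values
feat_bytes = nCodeLocal * nBins * nData * 8
print('featSlice: {:.2f} GB per node'.format(feat_bytes / 1e9))  # 0.64 GB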
Example #10
                # this sometimes helps python do garbage collection
                matdata = None
        # load label
        if self.rank == 0:
            matdata = io.loadmat(os.path.join(root, labelfile))
            # if the label starts with 1, make it start with 0
            if matdata['label'].min() == 1:
                matdata['label'] -= 1
            self.rawlabels[:] = matdata['label'].reshape(matdata['label'].size)[:self.nData]
            matdata = None
        self.comm.Bcast(self.rawlabels, root=0)
        for i in range(self.nData):
            # we need to make the label matrix a -1/1 matrix
            self.labels[self.rawlabels[i],i] = 1
        if not isTest:
            mpi.rootprint('Normalizing training data')
            timer = Timer()
            self.normalize_data(sabotage = not should_normalize)
            mpi.nodeprint('Normalization took {} secs.'.format(timer.lap()))

    def append_feature(self, codeid, metabinid):
        '''
        Find the owner of the feature, broadcast the feature from the owner to
        all nodes, and append it to the currently selected features. Each
        instance updates the slice of the data it is responsible for.
        '''
        # find the owner
        owner = codeid // self.ncode_per_node
        if self.rank == owner:
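The owner computation above mirrors the block distribution set up in `init_specs` (Example #9): code ids are dealt out in contiguous chunks of `ncode_per_node`, so the owner is recovered by integer division. A small consistency sketch with illustrative sizes:

import numpy as np

nCodes, size = 103, 8                        # illustrative values
ncode_per_node = int(np.ceil(float(nCodes) / size))
for rank in range(size):
    lo, hi = ncode_per_node * rank, min(ncode_per_node * (rank + 1), nCodes)
    assert all(c // ncode_per_node == rank for c in range(lo, hi))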