Example #1
0
    def train_model_agg( self, bags, y, cv_split_bags=None, sample_weight=None, param_search=True ):
        """Train the bag-level aggregation model on quantiles of instance predictions.

        For every held-out bag, the predictions returned by the parent class'
        ``predict`` are summarised by a fixed set of percentiles.  The resulting
        vectors (one row per bag) are used to fit an SVM ``LinearClassifier``.

        Args:
            bags: Indexable collection of bags; each bag must support ``len``.
            y: Bag labels, indexable by bag position.
            cv_split_bags: Sequence of ``(train_idx, test_idx)`` pairs, one per
                fold.  When ``None`` a shuffled 5-fold stratified split of the
                bags is generated.  The fold position is passed to the parent
                ``predict`` as ``cv``.
            sample_weight: Optional per-bag weights, indexable by bag position
                (same order as ``bags``).
            param_search: Forwarded to ``LinearClassifier.fit``.

        Side effects:
            Sets ``self.C_agg``, ``self.gamma_agg`` and ``self._model_agg``
            (a ``(model, quantiles)`` tuple).
        """

        # Decide how many percentiles to take and where to place them: never
        # more than the (rounded) mean number of instances per bag.
        ninst = int(np.round(sum(len(bag) for bag in bags) / float(len(bags))))
        nq = self.quantiles if self.quantiles is not None else 16
        if ninst <= nq:
            quantiles = np.linspace(0, 100, ninst)
        else:
            # centres of nq equal-width bins over [0, 100]
            quantiles = np.linspace(100.0 / nq / 2, 100 - 100.0 / nq / 2, nq)

        if cv_split_bags is None:
            # train/test split
            skf = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
            cv_split_bags = list(skf.split(bags, y))

        # Compute the quantile features.  Rows are appended in fold/test order,
        # so labels AND weights must be gathered in that same order.
        p = []
        test_y = []
        test_w = [] if sample_weight is not None else None
        for f, (_, test_idx) in enumerate(cv_split_bags):
            for i in test_idx:
                pi = super(SIL, self).predict(bags[i], cv=f)
                if pi.shape[1] == 2:
                    # two columns: only the second column is summarised
                    q = np.percentile(pi[:, 1], quantiles)
                else:
                    q = np.hstack([np.percentile(pi[:, c], quantiles) for c in range(pi.shape[1])])
                p.append(q)
                test_y.append(y[i])
                if test_w is not None:
                    test_w.append(sample_weight[i])
        p = np.vstack(p)
        test_y = np.array(test_y)
        if test_w is not None:
            test_w = np.array(test_w)

        # train model
        model_agg = LinearClassifier(classifier='svm')
        self.C_agg, self.gamma_agg = model_agg.param_search(p, test_y, sample_weight=test_w, quick=False)
        model_agg.C = self.C_agg
        model_agg.fit(p, test_y, sample_weight=test_w, param_search=param_search, calibrate=self._calibrate)
        self._model_agg = (model_agg, quantiles)
Example #2
0
    def param_search(self, bags, y, instances, classes, quick=True, C=1.0, gamma=1.0, bag_inst_idx=None, sample_weight=None, inst_search=False):
        """Search for the best ``C`` (and, for an RBF kernel, ``gamma``).

        Starting from the given centre, a neighbourhood of power-of-two
        multiples (factors 1/4 .. 4) is evaluated.  While the best value lies
        on the border of the neighbourhood, the neighbourhood is re-centred on
        it and the search repeats; otherwise the search stops.

        Args:
            bags: Sequence of bags.  When ``C`` is ``None`` each bag must
                provide ``mean(axis=0)``.
            y: Bag labels.
            instances: Instance-level data; used only when ``inst_search``.
            classes: Instance-level labels; used only when ``inst_search``.
            quick: Forwarded to ``LinearClassifier.param_search`` when
                ``inst_search`` is true.
            C: Initial centre for ``C``.  ``None`` means: estimate ``C`` and
                ``gamma`` from the standardised per-bag mean vectors.
            gamma: Initial centre for ``gamma`` (varied only for ``'rbf'``).
            bag_inst_idx: Accepted for interface compatibility; not used.
            sample_weight: Forwarded to the underlying search / fit.
            inst_search: If true, tune an instance-level ``LinearClassifier``;
                otherwise run a 5-fold bag-level ``GridSearchCV`` over ``SIL``.

        Returns:
            Tuple ``(bestC, bestg)``.

        Side effects:
            Resets ``self._model`` and ``self._model_agg`` to ``None``.
        """

        if C is None:
            # figure out an initial set of hyperparameters using the mean of all instances from each bag
            td = np.array([t.mean(axis=0) for t in bags])
            tl = np.array(y)

            # standardise; the small constant avoids division by zero
            mu = td.mean(axis=0)
            sigma = td.std(axis=0) + 1e-3
            td = (td - mu) / sigma

            model = LinearClassifier(classifier=self.classifier, kernel=self.kernel, n_jobs=self.n_jobs)
            C, gamma = model.param_search(td, tl)

        is_rbf = self.kernel == 'rbf'
        folds = 5

        acc = {}
        # -inf rather than 0 so a best score <= 0 is still recorded
        bestacc = -np.inf
        bestg = None
        bestC = None
        while True:

            # start with values given and search in neighborhood; search will continue if best value falls on edge of neighborhood
            Cvals = [float(2**e) * C for e in range(-2, 3)]
            gvals = [float(2**e) * gamma for e in range(-2, 3)] if is_rbf else [1.0]

            if inst_search:
                # find best instance-level classifier
                model = LinearClassifier(classifier=self.classifier, kernel=self.kernel, p=self.p, n_jobs=self.n_jobs)
                bestC, bestg = model.param_search(instances, classes, quick, C=C, gamma=gamma, sample_weight=sample_weight)
            else:
                # Without a gamma axis, skip C values scored in an earlier
                # pass.  The loop variable is deliberately not named ``C`` so
                # the search centre is never rebound by the comprehension.
                if is_rbf:
                    c_grid = list(Cvals)
                else:
                    c_grid = [c for c in Cvals if (c, 1.0) not in acc]

                # find best result at bag level
                skf = sklearn.model_selection.StratifiedKFold(n_splits=folds, shuffle=True)
                est = SIL(classifier=self.classifier, kernel=self.kernel, predict_type=self.predict_type, class_weight=self.class_weight, p=self.p, subset=self.subset, quantiles=self.quantiles, metric=self.metric)
                gridcv = sklearn.model_selection.GridSearchCV(est, [{'C': c_grid, 'gamma': gvals}], cv=skf, n_jobs=self.n_jobs, refit=False)
                gridcv.fit(bags, y, sample_weight=sample_weight, calibrate=self._calibrate)
                for mean_score, params in zip(gridcv.cv_results_['mean_test_score'], gridcv.cv_results_['params']):
                    acc[params['C'], params['gamma']] = mean_score
                if gridcv.best_score_ > bestacc:
                    bestC = gridcv.best_params_['C']
                    bestg = gridcv.best_params_['gamma']
                    bestacc = gridcv.best_score_

            on_c_edge = bestC == Cvals[0] or bestC == Cvals[-1]
            on_g_edge = is_rbf and (bestg == gvals[0] or bestg == gvals[-1])
            if not (on_c_edge or on_g_edge):
                break
            # best value sits on the border: re-centre and search again
            C = bestC
            gamma = bestg

        self._model = None
        self._model_agg = None

        return bestC, bestg