def train_model_agg(self, bags, y, cv_split_bags=None, sample_weight=None, param_search=True):
    """Train the bag-level aggregation model on quantiles of instance predictions.

    For every held-out bag of every cross-validation fold, the instance-level
    predictions returned by the parent class' ``predict(bag, cv=fold)`` are
    summarised by a fixed set of percentiles; those percentile vectors are the
    features on which an SVM ``LinearClassifier`` is trained.

    Args:
        bags: Sequence of bags; each bag must support ``len()`` and be
            accepted by the parent class' ``predict``.
        y: Bag labels, indexable by bag position.
        cv_split_bags: Optional sequence of ``(train_idx, test_idx)`` pairs.
            When ``None``, a shuffled 5-fold ``StratifiedKFold`` split of
            ``bags``/``y`` is generated. The position of each pair in the
            sequence is passed to ``predict`` as ``cv``.
            NOTE(review): presumably these must be the same folds the
            instance-level models were trained with -- confirm with caller.
        sample_weight: Optional per-bag weights, indexed like ``bags``.
        param_search: Forwarded to ``LinearClassifier.fit``.

    Side effects:
        Sets ``self.C_agg``, ``self.gamma_agg`` and ``self._model_agg``
        (a ``(model, quantiles)`` tuple).
    """
    # Figure out the number of quantiles and where to place them: never use
    # more quantiles than the (rounded) mean number of instances per bag.
    ninst = int(np.round(sum(len(bag) for bag in bags) / float(len(bags))))
    nq = self.quantiles if self.quantiles is not None else 16
    if ninst <= nq:
        quantiles = np.linspace(0, 100, ninst)
    else:
        # nq evenly spaced percentiles centred inside nq equal-width bins
        quantiles = np.linspace(100.0 / nq / 2, 100 - 100.0 / nq / 2, nq)

    if cv_split_bags is None:
        # train/test split
        skf = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
        cv_split_bags = list(skf.split(bags, y))

    # Compute the quantile function of every held-out bag.
    p = []
    test_y = []
    test_order = []  # original bag index of each row appended to p
    # Iterate over the folds actually supplied instead of assuming five.
    for f, (_, test_idx) in enumerate(cv_split_bags):
        for i in test_idx:
            pi = super(SIL, self).predict(bags[i], cv=f)
            if pi.shape[1] == 2:
                # two columns: only the second column is summarised
                q = np.percentile(pi[:, 1], quantiles)
            else:
                # otherwise concatenate the quantiles of every column
                q = np.hstack([np.percentile(pi[:, c], quantiles)
                               for c in range(pi.shape[1])])
            p.append(q)
            test_y.append(y[i])
            test_order.append(i)
    p = np.vstack(p)
    test_y = np.array(test_y)

    # Rows of p follow fold/test order, not the original bag order, so the
    # per-bag weights must be permuted the same way to stay aligned.
    if sample_weight is not None:
        agg_weight = np.asarray(sample_weight)[test_order]
    else:
        agg_weight = None

    # train model
    model_agg = LinearClassifier(classifier='svm')
    self.C_agg, self.gamma_agg = model_agg.param_search(
        p, test_y, sample_weight=agg_weight, quick=False)
    model_agg.C = self.C_agg
    model_agg.fit(p, test_y, sample_weight=agg_weight,
                  param_search=param_search, calibrate=self._calibrate)
    self._model_agg = (model_agg, quantiles)
def param_search(self, bags, y, instances, classes, quick=True, C=1.0, gamma=1.0, bag_inst_idx=None, sample_weight=None, inst_search=False):
    """Search for the best hyperparameters ``(C, gamma)``.

    The search evaluates a grid of five powers of two (``2**-2 .. 2**2``)
    around the current ``C`` (and around ``gamma`` when ``self.kernel`` is
    ``'rbf'``). Whenever the best value lies on the edge of that grid, the
    grid is re-centred on it and the search repeats.

    Args:
        bags: Sequence of bags used for the bag-level grid search.
        y: Bag labels.
        instances: Instance features; only used when ``inst_search`` is true.
        classes: Instance labels; only used when ``inst_search`` is true.
        quick: Forwarded to ``LinearClassifier.param_search`` in
            instance-level mode.
        C: Initial centre for the ``C`` grid. When ``None``, an initial
            ``(C, gamma)`` is obtained from a ``LinearClassifier`` search on
            the standardised per-bag mean feature vectors.
        gamma: Initial centre for the ``gamma`` grid (RBF kernel only).
        bag_inst_idx: Accepted for interface compatibility; not used by the
            search.
        sample_weight: Optional weights forwarded to the underlying search.
        inst_search: When true, delegate to an instance-level
            ``LinearClassifier.param_search``; otherwise run a 5-fold
            ``GridSearchCV`` over ``SIL`` estimators at bag level.

    Returns:
        Tuple ``(best_C, best_gamma)``.

    Side effects:
        Sets ``self._model`` and ``self._model_agg`` to ``None``.
    """
    if C is None:
        # Figure out an initial set of hyperparameters using the mean of all
        # instances from each bag.
        td = np.array([t.mean(axis=0) for t in bags])
        tl = np.array(y)

        # standardise; the small constant avoids division by zero
        mu = td.mean(axis=0)
        sigma = td.std(axis=0) + 1e-3
        td = (td - mu) / sigma

        model = LinearClassifier(classifier=self.classifier, kernel=self.kernel,
                                 n_jobs=self.n_jobs)
        C, gamma = model.param_search(td, tl)

    acc = {}  # (C, gamma) -> mean CV score of every evaluated grid point
    # Start below any attainable score so that the first evaluated grid
    # always yields a candidate, even when all scores are zero or negative.
    bestacc = float('-inf')
    bestg = None
    bestC = None
    folds = 5

    while True:
        # Start with the values given and search in their neighbourhood; the
        # search continues if the best value falls on the neighbourhood edge.
        Cvals = [float(2 ** e) * C for e in range(-2, 3)]
        if self.kernel == 'rbf':
            gvals = [float(2 ** e) * gamma for e in range(-2, 3)]
            Cvals2 = list(Cvals)
        else:
            gvals = [1.0]
            # skip values of C already scored in a previous iteration
            Cvals2 = [c for c in Cvals if (c, 1.0) not in acc]

        # grid search
        if inst_search:
            # find best instance-level classifier
            model = LinearClassifier(classifier=self.classifier, kernel=self.kernel,
                                     p=self.p, n_jobs=self.n_jobs)
            bestC, bestg = model.param_search(instances, classes, quick,
                                              C=C, gamma=gamma,
                                              sample_weight=sample_weight)
            bestacc = 0
        else:
            # find best result at bag level
            skf = sklearn.model_selection.StratifiedKFold(n_splits=folds, shuffle=True)
            est = SIL(classifier=self.classifier, kernel=self.kernel,
                      predict_type=self.predict_type,
                      class_weight=self.class_weight, p=self.p,
                      subset=self.subset, quantiles=self.quantiles,
                      metric=self.metric)
            gridcv = sklearn.model_selection.GridSearchCV(
                est, [{'C': Cvals2, 'gamma': gvals}],
                cv=skf, n_jobs=self.n_jobs, refit=False)
            gridcv.fit(bags, y, sample_weight=sample_weight,
                       calibrate=self._calibrate)
            for mean_score, params in zip(gridcv.cv_results_['mean_test_score'],
                                          gridcv.cv_results_['params']):
                acc[params['C'], params['gamma']] = mean_score
            if gridcv.best_score_ > bestacc:
                bestC = gridcv.best_params_['C']
                bestg = gridcv.best_params_['gamma']
                bestacc = gridcv.best_score_

        on_c_edge = bestC == Cvals[0] or bestC == Cvals[-1]
        on_g_edge = self.kernel == 'rbf' and (bestg == gvals[0] or bestg == gvals[-1])
        if on_c_edge or on_g_edge:
            # re-centre the grid on the current best and search again
            C = bestC
            gamma = bestg
        else:
            break

    self._model = None
    self._model_agg = None

    return bestC, bestg