def fit_sv_pandas(self, df_training, target_column, feature_columns,
                  df_validation=None, ratio_training=None, **kwargs):
    """
    `fit` for pandas DataFrame to perform single validation

    Args:
        df_training (pandas.DataFrame): training data set
        target_column (str): column name of prediction target
        feature_columns (list of str): column names of features
        df_validation (pandas.DataFrame): if specified, used as validation data set
        ratio_training (float): if specified, `df_training` is split for training / validation
        **kwargs: other keyword arguments for the original `fit`

    Returns:
        conjurer.ml.Model
    """
    x, y, num_training, num_validation = _split_for_sv(
        df_training, target_column, feature_columns, df_validation, ratio_training)
    # -1 marks rows that stay in training for every split; 0 marks the single validation fold
    self.cv = model_selection.PredefinedSplit(
        numpy.array([-1] * num_training + [0] * num_validation))
    logger.warning("start learning with {} hyper parameter settings".format(self.n_iter))
    self.fit(x, y, **kwargs)
    return model.Model(self, feature_columns=feature_columns, target_column=target_column)
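# Hedged usage sketch, not part of the library above: it only illustrates the
# test_fold encoding that fit_sv_pandas builds. Rows marked -1 stay in training
# for every split; rows marked 0 form the single validation fold.
import numpy
from sklearn import model_selection

test_fold = numpy.array([-1] * 3 + [0] * 2)  # 3 training rows, 2 validation rows
cv = model_selection.PredefinedSplit(test_fold)
for train_idx, valid_idx in cv.split():
    print(train_idx, valid_idx)  # -> [0 1 2] [3 4]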
def gridsearch_CV_wrapper(params, model, Xtrain, Ytrain, Xtest, Ytest,
                          validation_size=.1, n_jobs=1):
    '''
    Wrapper around GridSearchCV: carves a stratified validation fold out of the
    training set, tunes `model` over `params` against that single fold, then
    refits the best estimator on the full training set and scores it on the test set.
    '''
    # Mark a stratified validation_size fraction of the training rows as fold 0;
    # all other rows keep -1 and are therefore never used as a test fold.
    XvalidIndices, _ = modSel.train_test_split(list(range(0, len(Xtrain))),
                                               train_size=validation_size,
                                               stratify=Ytrain)
    Xvalid = np.full(len(Xtrain), -1)
    for index in XvalidIndices:
        Xvalid[index] = 0
    predef_split = modSel.PredefinedSplit(Xvalid)
    scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score)
    gridSearcher = modSel.GridSearchCV(model, params, n_jobs=n_jobs,
                                       cv=predef_split, scoring=scorer)
    gridSearcher.fit(Xtrain, Ytrain)
    bestModel = gridSearcher.best_estimator_
    bestModel.fit(Xtrain, Ytrain)  # refit on all training rows, including the validation fold
    score = bestModel.score(Xtest, Ytest)
    return Dict({"cv_results": gridSearcher.cv_results_,
                 "predictor": bestModel,
                 "best_params": gridSearcher.best_params_,
                 "best_score": score,
                 "testTuple": (Xtest, Ytest),
                 "trainTuple": (Xtrain, Ytrain)})
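# Hedged usage sketch for gridsearch_CV_wrapper. It assumes the aliases used in
# the function body (modSel = sklearn.model_selection, np = numpy, Dict = addict.Dict)
# are already imported; the toy data below is illustrative only.
from sklearn import model_selection as modSel
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, random_state=0)
Xtrain, Xtest, Ytrain, Ytest = modSel.train_test_split(X, y, stratify=y, random_state=0)
results = gridsearch_CV_wrapper({"C": [0.1, 1.0, 10.0]},
                                LogisticRegression(max_iter=1000),
                                Xtrain, Ytrain, Xtest, Ytest,
                                validation_size=0.2)
print(results["best_params"], results["best_score"])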
def create(self, X, y):
    # Build a single predefined train/validation split: rows selected by
    # self.valid get fold index 1 (the lone test fold), all remaining rows get
    # -1 and are therefore always kept in training.
    indices = np.arange(num_samples(X))
    valid = indices[self.valid]
    train = np.setdiff1d(indices, valid)
    test_fold = np.ones(num_samples(X))
    test_fold[train] = -1
    return model_selection.PredefinedSplit(test_fold)
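# Hedged standalone sketch of what create() returns (assumption: num_samples(X)
# is the row count and self.valid is a boolean mask or index array of validation
# rows; both names come from the surrounding class, which is not shown here).
import numpy as np
from sklearn import model_selection

X_demo = np.zeros((5, 2))
valid_mask = np.array([False, False, False, True, True])  # last two rows validate
test_fold = np.ones(len(X_demo))   # fold 1 = validation fold
test_fold[~valid_mask] = -1        # -1 = always in training
cv = model_selection.PredefinedSplit(test_fold)
print(list(cv.split()))            # -> [(array([0, 1, 2]), array([3, 4]))]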
def __init__(self, k=3, test_folds=None):
    """Initializes the KFoldCrossValidation object.

    The dataset may be split into folds in two ways: automatically or manually.
    If automatic, the `k` argument is to be used. If manual, the user may specify
    the fold index for each sample in the dataset via the `test_folds` argument.

    :param k: the number of folds to uniformly split the data into when using
        the automatic approach (default 3).
    :type k: int, optional
    :param test_folds: an array specifying the fold index for each sample in the
        dataset when using the manual approach (default None). The entry
        test_folds[i] specifies the index of the test set that sample i belongs
        to. It is also possible to exclude sample i from any test set (i.e.,
        include sample i in every training set) by setting test_folds[i] to -1.
        If `test_folds` is None, the automatic approach is assumed and only the
        `k` parameter is considered. Otherwise, the manual approach is assumed
        and only the `test_folds` parameter is considered.
    :type test_folds: `numpy.ndarray` or None, optional
    :raises InvalidParameterValueException: if a `k` parameter value less than 2
        is used.
    """
    if (k is not None) and (k < 2):
        raise InvalidParameterValueException(
            parameter="k",
            value=k,
            method="K-Fold Cross Validation (K-FCV)",
            is_algorithm=False,
            additional_msg="K-FCV requires at least two folds. "
                           "Please choose a value of k=2 or higher.")

    desc = "K-Fold Cross Validation."  # TODO: add longer description

    self._k = k
    self._test_folds = test_folds
    # self._objects = None  # full objects (prior to fold-splitting)
    # self._ranks = None  # full ranks (prior to fold-splitting)
    self._dual_format = None
    # self._folds = None  # only used for manual splitting (via k_col)
    self._kf = None

    if self._test_folds is not None:
        # manual approach: a predefined fold index is given for every sample
        self._k = None
        self._kf = model_selection.PredefinedSplit(self._test_folds)
    else:
        # automatic approach: uniform k-fold split
        # (shuffle is off, so a random_state would have no effect and is
        # rejected by recent scikit-learn versions)
        self._kf = model_selection.KFold(n_splits=self._k)

    # call base class constructor
    super().__init__(description=desc,
                     name=EvaluatorType.KFCV.name,
                     k=self._k,
                     manual_test_folds=self._test_folds)
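# Hedged usage sketch (assumption: KFoldCrossValidation is importable from the
# surrounding package, which is not shown here). The manual route pins every
# sample to a fold index; -1 keeps a sample out of every test fold.
import numpy as np

manual_folds = np.array([0, 0, 1, 1, -1, -1])  # samples 4-5 never appear in a test set
evaluator_manual = KFoldCrossValidation(test_folds=manual_folds)
evaluator_auto = KFoldCrossValidation(k=5)     # automatic uniform 5-fold split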
def test_train_validation(joinedDataframe, train_size=.8, validation_size=.1):
    '''
    Does a train / validation / test split, runs a grid search that validates on
    the predefined validation fold, and reports the score on the held-out test split.

    :param joinedDataframe: a dataframe whose binary classification target is the "angus" column
    :param train_size: fraction of the data used for training
    :param validation_size: fraction of the data used for validation
    :return: a dictionary with the results for each param combination, the best SVC
        estimator, and the score on the test set
    '''
    X = joinedDataframe.drop(["angus"], axis=1)
    Y = joinedDataframe["angus"]
    # https://stackoverflow.com/questions/34842405/parameter-stratify-from-method-train-test-split-scikit-learn
    Xtrain, Xtest, Ytrain, Ytest = modSel.train_test_split(
        X, Y, train_size=train_size + validation_size, stratify=Y)
    # Mark a stratified validation_size fraction of the training rows as fold 0;
    # the remaining rows keep -1 and never leave the training side.
    XvalidIndices, _ = modSel.train_test_split(list(range(0, len(Xtrain))),
                                               train_size=validation_size,
                                               stratify=Ytrain)
    Xvalid = np.full(len(Xtrain), -1)
    for index in XvalidIndices:
        Xvalid[index] = 0

    # set up the parameter search space
    params = Dict()
    params.C = [.4, .8, 1.6, 3.2, 6.4]
    params.tol = [.01]  # to make it easier to converge
    polyParams = Dict(params)  # copy to a new dict to avoid testing param sets that are the same
    params.kernel = ["linear", "rbf", "sigmoid"]
    params.gamma = ["auto", .001, .01, .1]
    polyParams.kernel = ["poly"]
    polyParams.degree = [1, 2, 3, 4, 5]
    polyParams.coef0 = [0, 1, 5, -1, -5]

    svm = sklearn.svm.SVC(cache_size=7000)
    predef_split = modSel.PredefinedSplit(Xvalid)
    gridSearcher = modSel.GridSearchCV(svm, [params, polyParams], n_jobs=10, cv=predef_split)
    gridSearcher.fit(Xtrain, Ytrain)
    bestPredictor = gridSearcher.best_estimator_
    bestPredictor.fit(Xtrain, Ytrain)  # refit on all training rows, including the validation fold
    score = bestPredictor.score(Xtest, Ytest)
    return Dict({"cv_results": gridSearcher.cv_results_,
                 "predictor": bestPredictor,
                 "best_score": score,
                 "testTuple": (Xtest, Ytest),
                 "trainTuple": (Xtrain, Ytrain)})
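# Hedged usage sketch with a synthetic frame (assumption: the real input is a
# joined table whose binary label column is "angus", as in the function above;
# the random features here are illustrative only).
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy = pd.DataFrame(rng.rand(200, 3), columns=["f1", "f2", "f3"])
toy["angus"] = rng.randint(0, 2, size=200)
results = test_train_validation(toy, train_size=.8, validation_size=.1)
print(results["best_score"])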
def run_grid_search(X_train, y_train, X_dev, y_dev, param_grid, my_scorer):
    # Averaged perceptron CRF; hyperparameters are tuned with a grid search that
    # always trains on X_train and validates on X_dev via a predefined split.
    crf = sklearn_crfsuite.CRF(
        algorithm='ap',
        all_possible_transitions=True,
        all_possible_states=True,
    )
    # -1 = always in training, 0 = the single validation fold
    validation_set_indexes = [-1] * len(X_train) + [0] * len(X_dev)
    ps = ms.PredefinedSplit(test_fold=validation_set_indexes)
    search = ms.GridSearchCV(
        estimator=crf,
        cv=ps,
        param_grid=param_grid,
        scoring=my_scorer,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X_train + X_dev, y_train + y_dev)
    return search
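# Hedged usage sketch (assumption: the data follows the usual sklearn_crfsuite
# format of token-feature-dict sequences and label sequences; the scorer, the
# grid, and the tiny corpus below are illustrative only).
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics as crf_metrics

X_train = [[{"w": "John"}, {"w": "runs"}], [{"w": "Mary"}, {"w": "sleeps"}]]
y_train = [["PER", "O"], ["PER", "O"]]
X_dev = [[{"w": "Bob"}, {"w": "eats"}]]
y_dev = [["PER", "O"]]

my_scorer = make_scorer(crf_metrics.flat_f1_score, average="weighted")
param_grid = {"epsilon": [1e-5, 1e-4], "max_iterations": [50, 100]}
search = run_grid_search(X_train, y_train, X_dev, y_dev, param_grid, my_scorer)
print(search.best_params_, search.best_score_)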
for (dataName, dataFile) in list(datasetDict.items()):
    # resDict = dict(zip(estimatorsDict.keys(), [None]*len(estimatorsDict.keys())))

    # =============================================================================
    # Loading the data set: train and test sets
    # =============================================================================
    print("\n")

    # Case where a predefined train/test split exists
    if dataFile.endswith('train'):
        X_train, y_train = ioFuncs.loadFromTxt(pathJoin(dataPath, dataFile))
        if len(X_train) >= 10000:
            # if the dataset is too large, keep a stratified 10% fraction with a
            # representative class distribution (for memory issues)
            _, X_train, _, y_train = model_selection.train_test_split(
                X_train, y_train, test_size=0.1, stratify=y_train)
        X_test, y_test = ioFuncs.loadFromTxt(pathJoin(dataPath, dataFile[:-5] + 'test'),
                                             idInd=None, classInd=0)
        X, y = np.vstack((X_train, X_test)), np.hstack((y_train, y_test))
        # predefined split specification for sklearn's functions:
        # train rows are marked -1, test rows form fold 0
        testCV = model_selection.PredefinedSplit([-1] * len(X_train) + [0] * len(X_test))
    # case with a manual random train/test split, done 100 times
    else:
        X, y = ioFuncs.loadFromTxt(pathJoin(dataPath, dataFile))
        testCV = model_selection.StratifiedShuffleSplit(n_splits=nbSplits, test_size=0.3)

    try:
        sigmaSquared = np.mean(pdist(X, metric="sqeuclidean"))
    except MemoryError:
        sigmaSquared = np.mean(pdist(X_train, metric="sqeuclidean"))

    dataRange = (-1 / np.sqrt(X.shape[1]), 1 / np.sqrt(X.shape[1]))  # [-1/sqrt(d), 1/sqrt(d)]
    minMaxScaler = preprocessing.MinMaxScaler(feature_range=dataRange)
    # estimatorsDict["ITML"]["estimator"].steps[0] = ITML_Supervised(num_constraints=int(0.7*len(X)))
    # special case of ITML: number of constraints = number of landmarks
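    # Hedged continuation sketch (assumption: a plain k-NN classifier stands in
    # for the estimators of estimatorsDict, which is not shown here): testCV can
    # be passed wherever scikit-learn accepts a cv argument, e.g. cross_val_score.
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline

    knn_pipe = make_pipeline(minMaxScaler, KNeighborsClassifier())
    knnScores = model_selection.cross_val_score(knn_pipe, X, y, cv=testCV)  # one score per split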