Example #1
    def fit_sv_pandas(self,
                      df_training,
                      target_column,
                      feature_columns,
                      df_validation=None,
                      ratio_training=None,
                      **kwargs):
        """
        `fit` for pandas DataFrame to perform single validation
        Args:
            df_training (pandas.DataFrame): training data set
            target_column (str): column name of prediction target
            feature_columns (list of str): column names of features
            df_validation (pandas.DataFrame): if specified, used as validation data set
            ratio_training (float): if specified, `df_training` is split for training / validation
            **kwargs: Other keyword arguments for original `fit`

        Returns:
            conjurer.ml.Model
        """
        x, y, num_training, num_validation = _split_for_sv(
            df_training, target_column, feature_columns, df_validation,
            ratio_training)
        self.cv = model_selection.PredefinedSplit(
            numpy.array([-1] * num_training + [0] * num_validation))
        logger.warning("start learning with {} hyper parameters".format(
            self.n_iter))
        self.fit(x, y, **kwargs)
        return model.Model(self,
                           feature_columns=feature_columns,
                           target_column=target_column)
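A minimal call sketch for the method above; the tuner object, DataFrame, and column names below are placeholders, not taken from the source.

# `tuner` is assumed to be an object exposing fit_sv_pandas (e.g. a conjurer tuner);
# `df` is an assumed pandas DataFrame with a "label" column and features f1..f3
model = tuner.fit_sv_pandas(df_training=df,
                            target_column="label",
                            feature_columns=["f1", "f2", "f3"],
                            ratio_training=0.8)  # 80% for training, the rest for validation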
Example #2
def gridsearch_CV_wrapper(params,
                          model,
                          Xtrain,
                          Ytrain,
                          Xtest,
                          Ytest,
                          validation_size=.1,
                          n_jobs=1):
    '''
    Wrapper around GridSearchCV: candidates are scored (F1) on a predefined validation fold
    carved out of the training data; the best estimator is then refit on all of Xtrain and
    evaluated on (Xtest, Ytest) with its default score method.
    '''
    # hold out `validation_size` of the training rows as a single validation fold
    XvalidIndices, _ = modSel.train_test_split(list(range(0, len(Xtrain))),
                                               train_size=validation_size,
                                               stratify=Ytrain)
    # fold index -1 keeps a row in every training set; 0 puts it in the validation fold
    Xvalid = np.full(len(Xtrain), -1)
    for index in XvalidIndices:
        Xvalid[index] = 0
    predef_split = modSel.PredefinedSplit(Xvalid)
    scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score)
    gridSearcher = modSel.GridSearchCV(model,
                                       params,
                                       n_jobs=n_jobs,
                                       cv=predef_split,
                                       scoring=scorer)
    gridSearcher.fit(Xtrain, Ytrain)
    bestLogReg = gridSearcher.best_estimator_
    bestLogReg.fit(Xtrain, Ytrain)
    score = bestLogReg.score(Xtest, Ytest)
    return Dict({"cv_results": gridSearcher.cv_results_,
                 "predictor": bestLogReg,
                 "best_params": gridSearcher.best_params_,
                 "best_score": score,
                 "testTuple": (Xtest, Ytest),
                 "trainTuple": (Xtrain, Ytrain)})
Example #3
    def create(self, X, y):
        # samples selected by `self.valid` become the single validation fold;
        # all other samples receive fold index -1, i.e. they appear in every training set
        indices = np.arange(num_samples(X))
        valid = indices[self.valid]
        train = np.setdiff1d(indices, valid)

        test_fold = np.ones(num_samples(X))
        test_fold[train] = -1

        return model_selection.PredefinedSplit(test_fold)
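For reference, a standalone sketch (toy data, assumed) of the PredefinedSplit behaviour this method relies on: entries equal to -1 stay in every training set, while the remaining entries define the single validation fold.

import numpy as np
from sklearn import model_selection

test_fold = np.array([-1, -1, -1, 1, 1])   # 3 training samples, 2 validation samples
ps = model_selection.PredefinedSplit(test_fold)
for train_idx, valid_idx in ps.split():
    print(train_idx, valid_idx)            # -> [0 1 2] [3 4]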
Example #4
    def __init__(self, k=3, test_folds=None):
        """Initializes the KFoldCrossValidation object.

        The dataset may be split into folds in two ways: automatically or manually. If automatic,
        the `k` argument is to be used. If manual, the user may specify the fold index for each sample in the
        dataset via the `test_folds` argument.

        :param k: the number of folds to uniformly split the data into when using the automatic approach (default 3).
        :type k: int, optional
        :param test_folds: an array specifying the fold index for each sample in the dataset when using
            the manual approach (default None). The entry test_folds[i] specifies the index of the test set that
            sample i belongs to. It is also possible to exclude sample i from any test set (i.e., include sample i
            in every training set) by setting test_folds[i] to -1.
            If `test_folds` is None, the automatic approach is assumed and only the `k` parameter is considered.
            Otherwise, the manual approach is assumed and only the `test_folds` parameter is considered.
        :type test_folds: `numpy.ndarray` or None, optional
        :raises InvalidParameterValueException: if a `k` parameter value less than 2 is used.
        """
        if (k is not None) and (k < 2):
            raise InvalidParameterValueException(
                parameter="k",
                value=k,
                method="K-Fold Cross Validation (K-FCV)",
                is_algorithm=False,
                additional_msg="K-FCV requires at least two folds. "
                "Please choose a value of k=2 or higher.")

        desc = "K-Fold Cross Validation."  # TODO: add longer description
        self._k = k
        self._test_folds = test_folds

        # self._objects = None  # full objects (prior to fold-splitting)
        # self._ranks = None  # full ranks (prior to fold-splitting)
        self._dual_format = None
        # self._folds = None  # only used for manual splitting (via k_col)
        self._kf = None

        if self._test_folds is not None:  # k_col is given
            self._k = None
            self._kf = model_selection.PredefinedSplit(
                self._test_folds)  # k_codes
        else:  # only k is given
            self._kf = model_selection.KFold(n_splits=self._k, shuffle=True,
                                             random_state=0)  # random_state requires shuffle=True

        # call base class constructor
        super().__init__(description=desc,
                         name=EvaluatorType.KFCV.name,
                         k=self._k,
                         manual_test_folds=self._test_folds)
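A hedged usage sketch of the two construction modes described in the docstring; the six-sample fold assignment is illustrative only.

import numpy as np

# automatic mode: 5 uniform folds via KFold
cv_auto = KFoldCrossValidation(k=5)

# manual mode: samples 0-1 form test fold 0, samples 2-3 form test fold 1,
# samples 4-5 (fold index -1) are used for training only
cv_manual = KFoldCrossValidation(test_folds=np.array([0, 0, 1, 1, -1, -1]))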
Example #5
def test_train_validation(joinedDataframe, train_size=.8, validation_size=.1):
    '''
    Performs a train/validation/test split, runs a grid search that scores candidates on the
    validation split, and reports the best estimator's score on the test split.
    :param joinedDataframe: a dataframe whose binary target is the "angus" column
    :param train_size: approximate fraction of the data used purely for training
    :param validation_size: approximate fraction held out as the single validation fold; the
        remainder (1 - train_size - validation_size) forms the test set
    :return: a dictionary with the grid-search results for each param combination, the best
        SVC estimator, and its score on the test set
    '''
    X = joinedDataframe.drop(["angus"], axis=1)
    Y = joinedDataframe["angus"]
    #https://stackoverflow.com/questions/34842405/parameter-stratify-from-method-train-test-split-scikit-learn
    Xtrain, Xtest, Ytrain, Ytest = modSel.train_test_split(
        X, Y, train_size=train_size + validation_size, stratify=Y)
    XvalidIndices, _ = modSel.train_test_split(list(range(0, len(Xtrain))),
                                               train_size=validation_size,
                                               stratify=Ytrain)
    Xvalid = np.full(len(Xtrain), -1)
    for index in XvalidIndices:
        Xvalid[index] = 0
    # set up the hyperparameter search space
    params = Dict()
    params.C = [.4, .8, 1.6, 3.2, 6.4]
    params.tol = [.01]  # to make it easier to converge
    # copy into a separate dict so the poly grid does not repeat the other kernels' settings
    polyParams = Dict(params)
    params.kernel = ["linear", "rbf", "sigmoid"]
    params.gamma = ["auto", .001, .01, .1]
    polyParams.kernel = ["poly"]
    polyParams.degree = [1, 2, 3, 4, 5]
    polyParams.coef0 = [0, 1, 5, -1, -5]
    svm = sklearn.svm.SVC(cache_size=7000)
    predef_split = modSel.PredefinedSplit(Xvalid)
    gridSearcher = modSel.GridSearchCV(svm, [params, polyParams],
                                       n_jobs=10,
                                       cv=predef_split)
    gridSearcher.fit(Xtrain, Ytrain)
    bestPredictor = gridSearcher.best_estimator_
    bestPredictor.fit(Xtrain, Ytrain)
    score = bestPredictor.score(Xtest, Ytest)
    return Dict({"cv_results": gridSearcher.cv_results_,
                 "predictor": bestPredictor,
                 "best_score": score,
                 "testTuple": (Xtest, Ytest),
                 "trainTuple": (Xtrain, Ytrain)})
Example #6
def run_grid_search(X_train, y_train, X_dev, y_dev, param_grid, my_scorer):
    crf = sklearn_crfsuite.CRF(
        algorithm='ap',
        all_possible_transitions=True,
        all_possible_states=True,
    )

    validation_set_indexes = [-1] * len(X_train) + [0] * len(X_dev)
    ps = ms.PredefinedSplit(test_fold=validation_set_indexes)

    search = ms.GridSearchCV(
        estimator=crf,
        cv=ps,
        param_grid=param_grid,
        scoring=my_scorer,
        verbose=1,
        n_jobs=-1,
    )

    search.fit(X_train + X_dev, y_train + y_dev)
    return search
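A small check of the fold-index construction used above (toy sizes, assumed): training items get index -1 so they never enter the test fold, and the dev items form the single evaluation fold.

import sklearn.model_selection as ms

fold_idx = [-1] * 4 + [0] * 2            # 4 training sequences, 2 dev sequences
ps = ms.PredefinedSplit(test_fold=fold_idx)
print(ps.get_n_splits())                 # 1: a single train/dev split
print(list(ps.split()))                  # train = [0..3], dev = [4, 5]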
Example #7
for (dataName, dataFile) in list(datasetDict.items()):
    # resDict = dict(zip(estimatorsDict.keys(), [None]*len(estimatorsDict.keys())))
    # =============================================================================
    # Loading the data set: train and test sets
    # =============================================================================
    print("\n")

    # Case where a predefined train/test split exists
    if dataFile.endswith('train'):
        X_train, y_train = ioFuncs.loadFromTxt(pathJoin(dataPath, dataFile))
        if len(X_train) >= 10000:
            # if the dataset is too large, keep a stratified 10% fraction with a
            # representative class distribution (for memory reasons)
            _, X_train, _, y_train = model_selection.train_test_split(
                X_train, y_train, test_size=0.1, stratify=y_train)
        X_test, y_test = ioFuncs.loadFromTxt(pathJoin(dataPath, dataFile[:-5] + 'test'),
                                             idInd=None, classInd=0)

        X, y = np.vstack((X_train, X_test)), np.hstack((y_train, y_test))
        # predefined split specification for sklearn's functions
        testCV = model_selection.PredefinedSplit([-1] * len(X_train) + [0] * len(X_test))

    # Case with a manual random train/test split, done 100 times
    else:
        X, y = ioFuncs.loadFromTxt(pathJoin(dataPath, dataFile))
        testCV = model_selection.StratifiedShuffleSplit(n_splits=nbSplits, test_size=0.3)
    try:
        sigmaSquared = np.mean(pdist(X, metric="sqeuclidean"))
    except MemoryError:
        sigmaSquared = np.mean(pdist(X_train, metric="sqeuclidean"))

    dataRange = (-1 / np.sqrt(X.shape[1]), 1 / np.sqrt(X.shape[1]))  # [-1/sqrt(d), 1/sqrt(d)]
    minMaxScaler = preprocessing.MinMaxScaler(feature_range=dataRange)

    # estimatorsDict["ITML"]["estimator"].steps[0] = ITML_Supervised(num_constraints=int(0.7 * len(X)))  # special case of ITML: number of constraints = number of landmarks