Example #1
def check_oversamplers_classifiers(oversamplers, classifiers, random_state,
                                   n_runs):
    """Extract estimators and parameters grids."""

    # Create random states
    random_states = check_random_states(random_state, n_runs)

    # Create estimators and parameter grids
    estimators, param_grids = [], []
    for oversampler, classifier in product(oversamplers, classifiers):

        # Unpack oversamplers and classifiers
        ovs_name, ovs, ovs_param_grid = oversampler
        clf_name, clf, clf_param_grid = classifier
        if ovs is None:
            ovs = FunctionTransformer()

        # Create estimator
        name = f'{ovs_name}|{clf_name}'
        ovs_steps = ovs.steps if isinstance(ovs, Pipeline) else [(ovs_name,
                                                                  ovs)]
        clf_steps = clf.steps if isinstance(clf, Pipeline) else [(clf_name,
                                                                  clf)]
        steps = ovs_steps + clf_steps
        estimators.append((name, Pipeline(steps)))

        # Create parameter grid
        ovs_prefix = f'{name}' if isinstance(
            ovs, Pipeline) else f'{name}__{ovs_name}'
        ovs_param_grid = [{
            f'{ovs_prefix}__{param}': val
            for param, val in param_grid.items()
        } for param_grid in ParameterGrid(ovs_param_grid)]
        clf_prefix = f'{name}' if isinstance(
            clf, Pipeline) else f'{name}__{clf_name}'
        clf_param_grid = [{
            f'{clf_prefix}__{param}': val
            for param, val in param_grid.items()
        } for param_grid in ParameterGrid(clf_param_grid)]
        combinations = product(ovs_param_grid, clf_param_grid, random_states)
        for param_grid1, param_grid2, random_state in combinations:
            param_grid1.update(param_grid2)
            param_grid = {'est_name': [name]}
            for param in ovs.get_params().keys():
                if 'random_state' in param:
                    param_grid.update(
                        {f'{ovs_prefix}__{param}': [random_state]})
            for param in clf.get_params().keys():
                if 'random_state' in param:
                    param_grid.update(
                        {f'{clf_prefix}__{param}': [random_state]})
            param_grid.update(
                {param: [val]
                 for param, val in param_grid1.items()})
            param_grids.append(param_grid)

    return estimators, param_grids
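A minimal usage sketch of check_oversamplers_classifiers (the oversampler entry, classifier grids, and run count below are illustrative; `check_random_states`, `Pipeline` and `FunctionTransformer` are assumed to be available in the surrounding module, as in the snippet above):

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# (name, estimator, param_grid) triples; an oversampler of None is replaced
# by a pass-through FunctionTransformer inside the function.
oversamplers = [('none', None, {})]
classifiers = [
    ('lr', LogisticRegression(), {'C': [0.1, 1.0]}),
    ('dt', DecisionTreeClassifier(), {'max_depth': [3, 5]}),
]
estimators, param_grids = check_oversamplers_classifiers(
    oversamplers, classifiers, random_state=0, n_runs=2)
# estimators -> [('none|lr', Pipeline(...)), ('none|dt', Pipeline(...))]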
Example #2
def grid_search_cv(X, y, g, estimator, param_grid, folds, **kwargs):
    list_param_grid = list(ParameterGrid(param_grid))
    list_param_loss = []
    for param in list_param_grid:
        list_split_loss = []
        for split in folds:
            # Split the train and validation data
            _estimator = copy(estimator)
            X_test, y_test, g_test = [obj[split[1]] for obj in [X, y, g]]
            X_train, y_train, g_train = [obj[split[0]] for obj in [X, y, g]]
            _estimator.set_params(**param)
            _estimator.fit(X=X_train,
                           y=y_train,
                           g=g_train,
                           **{
                               name: value[split[0]]
                               for name, value in kwargs.items()
                           })
            pred = _estimator.predict(X_test)
            tol = transformed_outcome_loss(
                pred, y_test, g_test)  # Minimize transformed outcome loss
            list_split_loss.append(tol)
        list_param_loss.append(np.mean(list_split_loss))
    return list_param_grid[list_param_loss.index(min(list_param_loss))]
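The `folds` argument above is just an iterable of `(train_idx, test_idx)` pairs; one way to build it, as a sketch assuming `X`, `y` and the treatment indicator `g` are NumPy arrays and `estimator`/`transformed_outcome_loss` are defined as in the snippet:

from sklearn.model_selection import KFold

# Index pairs consumed by grid_search_cv via split[0] (train) and split[1] (test)
folds = list(KFold(n_splits=5, shuffle=True, random_state=0).split(X))
# best_params = grid_search_cv(X, y, g, estimator, {'max_depth': [3, 5]}, folds)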
Example #3
    def fit(self, X, y=None, *, groups=None, **fit_params):
        """
        TODO
        :param groups:
        :param X:
        :param y:
        :return:
        """
        self.all_cv_results_ = {}
        self.all_best_estimator_ = {}
        self.all_best_score_ = {}
        self.all_best_params_ = {}
        self.all_best_index_ = {}
        self.all_scorer_ = {}
        self.all_n_splits_ = {}
        self.all_refit_time_ = {}
        self.all_multimetric_ = {}
        for params in ParameterGrid(self.params):
            estimator = clone(self.estimator).set_params(**params)
            result = GridSearchCV.fit

        def evaluate_candidates(searches):

            for name, search, params, *kwargs in searches:
                if len(kwargs) == 1:
                    result = search(self.estimator,
                                    params,
                                    refit=True,
                                    **kwargs[0]).fit(X, y)
                else:
                    result = search(self.estimator, params,
                                    refit=True).fit(X, y)
                # Save the attributes of the intermediate search results
                # TODO: Should we add a flag to just keep the results of the final optimization step?
                # This would make the object smaller but we cannot check plausibility of previous optimization steps.
                self.all_cv_results_[name] = result.cv_results_
                self.all_best_estimator_[name] = result.best_estimator_
                self.all_best_score_[name] = result.best_score_
                self.all_best_params_[name] = result.best_params_
                self.all_best_index_[name] = result.best_index_
                self.all_scorer_[name] = result.scorer_
                self.all_n_splits_[name] = result.n_splits_
                self.all_refit_time_[name] = result.refit_time_
                self.all_multimetric_[name] = result.multimetric_

                self.estimator = result.best_estimator_

        self._run_search(evaluate_candidates)
        return self
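For reference, `evaluate_candidates` unpacks each entry of `searches` as `(name, search, params, *kwargs)`; a hypothetical sketch of what `_run_search` might hand it (names, grids and the optional keyword dict are illustrative only):

from sklearn.model_selection import GridSearchCV

# Each tuple: (label, search class, parameter grid, optional kwargs dict)
searches = [
    ('coarse', GridSearchCV, {'max_depth': [3, 5, 7]}),
    ('fine', GridSearchCV, {'min_samples_leaf': [1, 5]}, {'cv': 5}),
]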
Example #4
def grid_search_cv_hurdle(X, y, g, estimator, param_grid_conversion,
                          param_grid_regression, folds, **kwargs):
    list_param_grid = list(
        itertools.product(list(ParameterGrid(param_grid_conversion)),
                          list(ParameterGrid(param_grid_regression))))
    list_param_loss = []
    for param in list_param_grid:
        list_split_loss = []
        for split in folds:
            # Split the train and validation data
            _estimator = copy(estimator)
            X_test, y_test, g_test = [obj[split[1]] for obj in [X, y, g]]
            X_train, y_train, g_train = [obj[split[0]] for obj in [X, y, g]]

            for model in [
                    _estimator.treatment_group_model,
                    _estimator.control_group_model
            ]:
                model.conversion_classifier.set_params(**param[0])
                model.value_regressor.set_params(**param[1])

            _estimator.fit(X=X_train,
                           y=y_train,
                           g=g_train,
                           **{
                               name: value[split[0]]
                               for name, value in kwargs.items()
                           })
            pred = _estimator.predict(X_test)
            tol = transformed_outcome_loss(
                pred, y_test, g_test)  # Minimize transformed outcome loss
            list_split_loss.append(tol)
        list_param_loss.append(np.mean(list_split_loss))
    return list_param_grid[list_param_loss.index(min(list_param_loss))]
Example #5
def data_table(model, scenario_inputs, outputs):
    '''Create n-inputs by m-outputs data table. 

    Parameters
    ----------
    model : object
        User defined object containing the appropriate methods and properties for computing outputs from inputs
    scenario_inputs : dict of str to sequence
        Keys are input variable names and values are sequence of values for each scenario for this variable.
        
        Is consumed by scikit-learn ParameterGrid() function. See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html
    outputs : list of str
        List of output variable names

    Returns
    -------
    results_df : pandas DataFrame
        Contains values of all outputs for every combination of scenario inputs
    '''

    # Clone the model using deepcopy
    model_clone = copy.deepcopy(model)

    # Create parameter grid
    dt_param_grid = list(ParameterGrid(scenario_inputs))

    # Create the table as a list of dictionaries
    results = []

    # Loop over the scenarios
    for params in dt_param_grid:
        # Update the model clone with scenario specific values
        model_clone.update(params)
        # Create a result dictionary based on a copy of the scenario inputs
        result = copy.copy(params)
        # Loop over the list of requested outputs
        for output in outputs:
            # Compute the output.
            out_val = getattr(model_clone, output)()
            # Add the output to the result dictionary
            result[output] = out_val

        # Append the result dictionary to the results list
        results.append(result)

    # Convert the results list (of dictionaries) to a pandas DataFrame and return it
    results_df = pd.DataFrame(results)
    return results_df
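A runnable sketch with a hypothetical model object; data_table only requires that the object expose an `update(dict)` method plus one zero-argument method per requested output (the class, attribute names and numbers are illustrative):

import copy
import pandas as pd
from sklearn.model_selection import ParameterGrid

class BookstoreModel:
    """Toy model used only for illustration."""
    def __init__(self, price=10.0, demand=100):
        self.price = price
        self.demand = demand

    def update(self, params):
        # Overwrite attributes with scenario-specific values
        vars(self).update(params)

    def revenue(self):
        return self.price * self.demand

scenarios = {'price': [8.0, 10.0, 12.0], 'demand': [80, 100]}
df = data_table(BookstoreModel(), scenarios, ['revenue'])
# df has columns 'demand', 'price', 'revenue' -- one row per scenario (6 rows here)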
Example #6
def check_param_grids(param_grids, est_names):
    """Check the parameters grids to use with
    parametrized estimators."""

    # Check the parameters grids
    flat_param_grids = [
        param_grid for param_grid in list(ParameterGrid(param_grids))
        if param_grid
    ]

    # Append existing estimators names
    param_grids = []
    for param_grid in flat_param_grids:

        # Get estimator name
        est_name = param_grid.pop('est_name', None)

        # Modify values
        param_grid = {param: [val] for param, val in param_grid.items()}

        # Check estimators prefixes
        params_prefixes = set(
            [param.split('__')[0] for param in param_grid.keys()])
        if not params_prefixes.issubset(est_names):
            raise ValueError(
                'Parameters prefixes are not subset of parameter `est_names`.')
        if len(params_prefixes) > 1:
            raise ValueError('Parameters prefixes are not unique.')
        if est_name is not None and len(params_prefixes.union([est_name])) > 1:
            raise ValueError(
                'Parameters prefixes and parameter `est_name` are not unique.')
        param_grid['est_name'] = ([est_name] if est_name is not None else
                                  list(params_prefixes))

        # Append parameter grid
        param_grids.append(param_grid)

    # Append missing estimators names
    current_est_names = set(
        [param_grid['est_name'][0] for param_grid in param_grids])
    missing_est_names = set(est_names).difference(current_est_names)
    for est_name in missing_est_names:
        param_grids.append({'est_name': [est_name]})

    return param_grids
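A small sketch of the expected input and output shape (estimator names and parameters are illustrative):

param_grids = [{'est_name': ['lr'], 'lr__C': [1.0, 10.0]}]
checked = check_param_grids(param_grids, est_names=['lr', 'dt'])
# checked == [{'lr__C': [1.0], 'est_name': ['lr']},
#             {'lr__C': [10.0], 'est_name': ['lr']},
#             {'est_name': ['dt']}]  # bare entry added for the missing estimator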
Example #7
    def fit(self, X, y=None, groups=None):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """
        return self._fit(X, y, groups, ParameterGrid(self.param_grid))
Example #8
    def _run_search(self, evaluate_candidates):
        """Search all candidates in param_grid"""
        evaluate_candidates(ParameterGrid(self.param_grid))
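For context, ParameterGrid simply expands a grid specification into the list of concrete candidates that `evaluate_candidates` receives, e.g.:

from sklearn.model_selection import ParameterGrid

list(ParameterGrid({'C': [1, 10], 'kernel': ['rbf']}))
# [{'C': 1, 'kernel': 'rbf'}, {'C': 10, 'kernel': 'rbf'}]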
Example #9
    def fit(self, X, y=None, groups=None):
        estimator = self.default_estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)

        # Regenerate parameter iterable for each fit
        candidate_params = ParameterGrid(self.param_grid)
        n_candidates = len(candidate_params)
        candidate_untrainable_params = ParameterGrid(self.untrainable_param_grid)
        untrainable_candidates = len(candidate_untrainable_params)
        self.logger.i("[CV] Fitting {} folds for each of {} candidates, totalling"
                      " {} fits".format(n_splits, n_candidates, n_candidates * n_splits))

        base_estimator = clone(self.default_estimator)
        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.logger.level * 20,
            pre_dispatch=pre_dispatch
        )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.me,
                                  train, test, self.logger, parameters,
                                  candidate_untrainable_params,
                                  return_n_test_samples=True,
                                  return_times=True)
          for train, test in cv.split(X, y, groups)
          for parameters in candidate_params)

        out = np.vstack([o for o in out])
        test_accuracy = out[:, 0]
        test_precision = out[:, 1]
        test_recall = out[:, 2]
        test_f1 = out[:, 3]
        test_distance = out[:, 4]
        test_count = out[:, 5]
        test_count_pct = out[:, 6]
        test_raw_count = out[:, 7]
        test_raw_count_pct = out[:, 8]
        test_density = out[:, 9]
        test_raw_density = out[:, 10]
        test_sample_counts = out[:, 11]
        fit_time = out[:, 12]
        score_time = out[:, 13]

        results = dict()
        n_tot_candidates = n_candidates * untrainable_candidates
        tot_candidate_params = list(itertools.product(list(candidate_params), list(candidate_untrainable_params)))

        def _store(key_name, array, weights=None, splits=False, rank=False, error=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            array = np.array(array, dtype=np.float64).reshape(n_splits, n_tot_candidates).T
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s" % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                arr = array_means if error else -array_means
                results["rank_%s" % key_name] = np.asarray(rankdata(arr, method='min'), dtype=np.int32)

        # Computed the (weighted) mean and std for test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
        if self.iid:
            test_sample_counts = np.array(test_sample_counts[::n_tot_candidates], dtype=int)
        else:
            test_sample_counts = None

        _store('accuracy_score', test_accuracy, splits=True, rank=True, weights=test_sample_counts)
        _store('precision_score', test_precision, splits=True, rank=True, weights=test_sample_counts)
        _store('recall_score', test_recall, splits=True, rank=True, weights=test_sample_counts)
        _store('f1_score', test_f1, splits=True, rank=True, weights=test_sample_counts)
        _store('distance_mae', test_distance, splits=True, rank=True, weights=test_sample_counts, error=True)
        _store('count_mae', test_count, splits=True, rank=True, weights=test_sample_counts, error=True)
        _store('count_pct_mae', test_count_pct, splits=True, rank=True, weights=test_sample_counts, error=True)
        _store('raw_count_mae', test_raw_count, splits=True, rank=True, weights=test_sample_counts, error=True)
        _store('raw_count_pct_mae', test_raw_count_pct, splits=True, rank=True, weights=test_sample_counts, error=True)
        _store('density_mae', test_density, splits=True, rank=True, weights=test_sample_counts, error=True)
        _store('raw_density_mae', test_raw_density, splits=True, rank=True, weights=test_sample_counts, error=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)
        results['rank_custom'] = np.asarray(rankdata((results['rank_f1_score'] + results['rank_count_pct_mae']) / 2,
                                                     method='min'), dtype=np.int32)

        best_index = np.flatnonzero(results['rank_custom'] == 1)[0]  # first candidate with the best custom rank
        best_parameters = tot_candidate_params[best_index]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray, np.empty(n_tot_candidates, ), mask=True, dtype=object))
        for cand_i, params in enumerate(tot_candidate_params):
            params = merge_dicts(*params)
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = tot_candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            bp = best_parameters[0]
            bp.update(best_parameters[1])
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(**bp)
            best_estimator.fit(X, y)
            self.best_estimator_ = best_estimator
        return self
Example #10
    def _get_param_iterator(self):
        """ Return ParameterGrid instance for the given param_grid """
        return ParameterGrid(self.param_grid)
Example #11
def simulate(model,
             random_inputs,
             outputs,
             scenario_inputs=None,
             keep_random_inputs=False):
    '''Simulate model for one or more scenarios

    Parameters
    ----------
    model : object
        User defined object containing the appropriate methods and properties for computing outputs from inputs
    random_inputs : dict of str to sequence of random variates
        Keys are stochastic input variable names and values are sequences of $n$ random variates, where $n$ is the number of simulation replications
    outputs : list of str
        List of output variable names
    scenario_inputs : optional (default is None), dict of str to sequence
        Keys are deterministic input variable names and values are sequence of values for each scenario for this variable. Is consumed by
        scikit-learn ParameterGrid() function. See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html
    keep_random_inputs : optional (default is False), boolean
        If True, all the random input variates are included in the results dataframe

    Returns
    -------
    results : list of dict
        One dictionary per scenario (a single entry if `scenario_inputs` is None), each holding 'scenario_base_vals', 'scenario_num', 'scenario_vals' and an 'output' dict mapping each output name to its array of replication values
    '''

    # Clone the model
    model_clone = copy.deepcopy(model)

    # Update clone with random_inputs
    model_clone.update(random_inputs)

    # Store raw simulation input values if desired
    if keep_random_inputs:
        scenario_base_vals = vars(model_clone)
    else:
        scenario_base_vals = vars(model)

    # Initialize output counters and containers
    scenario_num = 0
    scenario_results = []

    # Check if multiple scenarios
    if scenario_inputs is not None:
        # Create parameter grid for scenario inputs
        sim_param_grid = list(ParameterGrid(scenario_inputs))

        # Scenario loop
        for params in sim_param_grid:
            model_clone.update(params)
            # Initialize scenario related outputs
            result = {}
            scenario_vals = copy.copy(params)
            result['scenario_base_vals'] = scenario_base_vals
            result['scenario_num'] = scenario_num
            result['scenario_vals'] = scenario_vals
            raw_output = {}

            # Output measure loop
            for output_name in outputs:
                output_array = getattr(model_clone, output_name)()
                raw_output[output_name] = output_array

            # Gather results for this scenario
            result['output'] = raw_output
            scenario_results.append(result)
            scenario_num += 1

        return scenario_results

    else:
        # Similar logic to above, but only a single scenario
        results = []
        result = {}

        result['scenario_base_vals'] = scenario_base_vals
        result['scenario_num'] = scenario_num
        result['scenario_vals'] = {}

        raw_output = {}
        for output_name in outputs:
            output_array = getattr(model_clone, output_name)()
            raw_output[output_name] = output_array

        result['output'] = raw_output
        results.append(result)

        return results
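A sketch reusing the hypothetical BookstoreModel from the Example #5 sketch: random demand replications plus two deterministic price scenarios (all numbers are illustrative):

import numpy as np

rng = np.random.default_rng(0)
random_inputs = {'demand': rng.normal(100, 10, size=1000)}  # 1000 replications
scenario_results = simulate(BookstoreModel(), random_inputs, ['revenue'],
                            scenario_inputs={'price': [8.0, 12.0]})
# Two entries, one per price scenario; each entry's output['revenue'] is an array of 1000 values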
Example #12
def find_hparam_regression(estimator,
                           X,
                           y,
                           param_grid,
                           cv,
                           scaler=None,
                           n_jobs=1):
    """
    Task: find the hyper-parameter from a set of parameters (param_grid)
          that performs best in a cross-validation setting for the given
          estimator.

    :param estimator: Estimator object, e.g. KernelRankSVC

    :param X: dictionary, (mol-id, system)-tuples as keys and molecular
              features as values:

              Example:
                {("M1", "S1"): feat_11, ...}

    :param y: dictionary, (mol-id, system)-tuples as keys and retention
              times as values

              Example:
                {("M1", "S1"): rt_11, ...}

    :param param_grid: dictionary, defining the grid-search space
        "C": Trade-of parameter for the SVM
        "gamma": width of the rbf/gaussian kernel
        ... etc. ...

        Example:
            {"C": [0.1, 1, 10], "gamma": [0.1, 0.25, 0.5, 1]}

    :param cv: cross-validation generator, see sklearn package, must be
               either a GroupKFold or GroupShuffleSplit object.

    :param scaler: scaler object, per feature scaler, e.g. MinMaxScaler

    :param n_jobs: int, number of jobs run in parallel. Parallelization
                   is performed over the cv-folds.

    :return: dictionary, containing combination of best parameters
                Example:
                    {"C": 1, "gamma": 0.25}

             dictionary, all parameter combinations with corresponding scores
                 Example:
                    [{"C": 1, "gamma": 0.25, "score": 0.98},
                     {"C": 1, "gamma": 0.50, "score": 0.94},
                     ...]

             scalar, number of pairs used to train the final model

             estimator object, fitted using the best parameters
    """
    if len(X) != len(y) or len(X.keys() - y.keys()) or len(y.keys() -
                                                           X.keys()):
        raise ValueError(
            "Keys-set for features and retention times must be equal.")

    if not isinstance(estimator, SVRPairwise):
        raise ValueError(
            "Currently parameters can only be estimated for the support vector regression "
            "class 'SVRPairwise'.")

    # Make a list of all combinations of parameters
    l_params = list(ParameterGrid(param_grid))
    param_scores = np.zeros((len(l_params), ))

    # Get all (mol-id, system)-tuples used for the parameter search
    keys = list(X.keys())

    mol_ids = list(zip(*keys))[0]
    cv_splits = cv.split(range(len(keys)), groups=mol_ids)

    # Precompute the training / test targets to save computation time as
    # we do not need to repeat this for several parameter settings.
    # cv_splits = cv.split (range (len (keys)))
    y_train_sets, y_test_sets = [], []
    X_train_sets, X_test_sets = [], []

    print("Get pairs for hparam estimation: ", end="", flush=True)
    for k_cv, (train_set, test_set) in enumerate(cv_splits):
        print("%d " % k_cv, end="", flush=True)

        # 0) Get keys (mol-id, system)-tuples, corresponding to the training
        #    and test sets.
        keys_train = [keys[idx] for idx in train_set]
        keys_test = [keys[idx] for idx in test_set]

        # Check for overlap of molecular ids, e.g. InChIs. Between training and test
        # molecular ids should not be shared, e.g. if they appear in different systems
        # at the same time.
        mol_ids_train = [mol_ids[idx] for idx in train_set]
        mol_ids_test = [mol_ids[idx] for idx in test_set]

        if set(mol_ids_train) & set(mol_ids_test):
            if isinstance(cv, GroupKFold) or isinstance(cv, GroupShuffleSplit):
                raise RuntimeError(
                    "As grouped cross-validation is used the training "
                    "and test molecules, i.e. mol_ids, are not allowed "
                    "to overlap. This can happen if molecular structures "
                    "are appearing in different systems. During the "
                    "learning of hyper-parameter the training set should "
                    "not contain any structure also in the test set.",
                    set(mol_ids_train) & set(mol_ids_test))
            else:
                print("Training and test keys overlaps.",
                      set(mol_ids_train) & set(mol_ids_test))

        # 1) Extract the target values from y (train and test) using the keys
        y_train_sets.append(np.array([y[key] for key in keys_train]))
        y_test_sets.append(np.array([y[key] for key in keys_test]))

        # 2) Extract the features from X (train and test) using the keys
        X_train_sets.append(np.array([X[key] for key in keys_train]))
        X_test_sets.append(np.array([X[key] for key in keys_test]))

    print("")

    for k_param, param in enumerate(l_params):
        fold_scores = Parallel(n_jobs=n_jobs, verbose=False)(
            delayed(_fit_and_score_regression)(
                param, clone(estimator), X_train_sets[k_cv], X_test_sets[k_cv],
                y_train_sets[k_cv], y_test_sets[k_cv], scaler)
            for k_cv in range(cv.get_n_splits()))

        param_scores[k_param] = np.mean(fold_scores)

    ## Fit model using the best parameters
    # Find the best params
    best_params = l_params[np.argmax(param_scores)].copy()

    # Fit the model using the best parameters
    best_estimator = clone(estimator)
    best_estimator.set_params(**_filter_params(best_params, best_estimator))

    X = np.array([X[key] for key in keys])
    y = np.array([y[key] for key in keys])

    if scaler is not None:
        X = scaler.transform(X)

    best_estimator.fit(X, y)

    # Combine the mean fold scores with the list of parameter sets
    for k_param, _ in enumerate(l_params):
        l_params[k_param]["score"] = param_scores[k_param]

    return best_params, l_params, -1, best_estimator
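A sketch of the input layout the docstring describes (keys, features and retention times below are purely illustrative; SVRPairwise is the project-specific estimator assumed by the function):

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

# Features and retention times keyed by (molecule-id, system) tuples
X = {('M1', 'S1'): np.array([0.1, 0.3]),
     ('M2', 'S1'): np.array([0.4, 0.2]),
     ('M1', 'S2'): np.array([0.1, 0.3])}
y = {('M1', 'S1'): 5.2, ('M2', 'S1'): 7.9, ('M1', 'S2'): 4.8}
cv = GroupShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
# best_params, scored_params, _, fitted = find_hparam_regression(
#     SVRPairwise(), X, y, {'C': [0.1, 1, 10]}, cv)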
Example #13
def find_hparan_ranksvm(estimator,
                        X,
                        y,
                        param_grid,
                        cv,
                        pair_params,
                        scaler=None,
                        n_jobs=1,
                        fold_score_aggregation="weighted_average",
                        all_pairs_as_test=True):
    """
    Task: find the hyper-parameter from a set of parameters (param_grid)
          that performs best in a cross-validation setting for the given
          estimator.

    :param estimator: Estimator object, e.g. KernelRankSVC

    :param X: dictionary, (mol-id, system)-tuples as keys and molecular
              features as values:

              Example:
                {("M1", "S1"): feat_11, ...}

    :param y: dictionary, (mol-id, system)-tuples as keys and retention
              times as values

              Example:
                {("M1", "S1"): rt_11, ...}

    :param param_grid: dictionary, defining the grid-search space
        "C": Trade-of parameter for the SVM
        "gamma": width of the rbf/gaussian kernel
        ... etc. ...

        Example:
            {"C": [0.1, 1, 10], "gamma": [0.1, 0.25, 0.5, 1]}

    :param cv: cross-validation generator, see sklearn package, must be
               either a GroupKFold or GroupShuffleSplit object.

    :param pair_params: dictionary, specifying parameters for the order graph:
        "ireverse": scalar, Should cross-system elution transitivity be included
            0: no, 1: yes
        "d_lower": scalar, minimum distance of two molecules in the elution order graph
                   to be considered as a pair.
        "d_upper": scalar, maximum distance of two molecules in the elution order graph
                   to be considered as a pair.
        "allow_overlap": scalar, Should overlap between the upper and lower sets
                         be allowed. Those overlaps originate from retention order
                         contradictions between the different systems.

    :param scaler: scaler object, per feature scaler, e.g. MinMaxScaler

    :param n_jobs: integer, number of jobs run in parallel. Parallelization is performed
        over the cv-folds. (default = 1)

    :param fold_score_aggregation: string, (default = "weighted_average")

    :param all_pairs_as_test: boolean, should all possible pairs (d_lower = 0, d_upper = np.inf)
        be used during the test. If 'False', the corresponding values are taken from the
        'pair_params' dictionary. (default = True)

    :return: dictionary, containing combination of best parameters
                Example:
                    {"C": 1, "gamma": 0.25}

             dictionary, all parameter combinations with corresponding scores
                 Example:
                    [{"C": 1, "gamma": 0.25, "score": 0.98},
                     {"C": 1, "gamma": 0.50, "score": 0.94},
                     ...]

             scalar, number of pairs used to train the final model

             estimator object, fitted using the best parameters
    """
    if not (isinstance(cv, GroupKFold) or isinstance(cv, GroupShuffleSplit)):
        raise ValueError("Cross-validation generator must be either of "
                         "class 'GroupKFold' or 'GroupShuffleSplit'. "
                         "Provided class is '%s'." % cv.__class__.__name__)

    if len(X) != len(y) or len(X.keys() - y.keys()) or len(y.keys() -
                                                           X.keys()):
        raise ValueError("Keys-set for features and retentions times must "
                         "be equal.")

    # Make a list of all combinations of parameters
    l_params = list(ParameterGrid(param_grid))
    param_scores = np.zeros((len(l_params), ))

    # Get all (mol-id, system)-tuples used for the parameter search
    keys = list(X.keys())

    if len(l_params) > 1:
        mol_ids = list(zip(*keys))[0]
        cv_splits = cv.split(range(len(keys)), groups=mol_ids)

        # Precompute the training / test pairs to save computation time as
        # we do not need to repeat this for several parameter settings.
        pairs_train_sets, pairs_test_sets = [], []
        X_train_sets, X_test_sets = [], []
        n_pairs_test_sets = []

        print("Get pairs for hparam estimation: ", end="", flush=True)
        for k_cv, (train_set, test_set) in enumerate(cv_splits):
            print("%d " % k_cv, end="", flush=True)

            # 0) Get keys (mol-id, system)-tuples, corresponding to the training
            #    and test sets.
            keys_train = [keys[idx] for idx in train_set]
            keys_test = [keys[idx] for idx in test_set]

            # Check for overlap of molecular ids, e.g. InChIs. Between training and test
            # molecular ids should not be shared, e.g. if they appear in different systems
            # at the same time.
            mol_ids_train = [mol_ids[idx] for idx in train_set]
            mol_ids_test = [mol_ids[idx] for idx in test_set]

            if set(mol_ids_train) & set(mol_ids_test):
                if isinstance(cv, GroupKFold) or isinstance(
                        cv, GroupShuffleSplit):
                    raise RuntimeError(
                        "As grouped cross-validation is used the training "
                        "and test molecules, i.e. mol_ids, are not allowed "
                        "to overlap. This can happen if molecular structures "
                        "are appearing in different systems. During the "
                        "learning of hyper-parameter the training set should "
                        "not contain any structure also in the test set.",
                        set(mol_ids_train) & set(mol_ids_test))
                else:
                    print("Training and test keys overlaps.",
                          set(mol_ids_train) & set(mol_ids_test))

            # 1) Extract the target values from y (train and test) using the keys
            y_train, y_test = OrderedDict(), OrderedDict()
            for key in keys_train:
                y_train[key] = y[key]
            for key in keys_test:
                y_test[key] = y[key]

            # 2) Calculate the pairs (train and test)
            cretention_train, cretention_test = retention_cls(), retention_cls()

            #   a) load 'lrows' in the retention_cls
            cretention_train.load_data_from_target(y_train)
            cretention_test.load_data_from_target(y_test)

            #   b) build the digraph
            cretention_train.make_digraph(ireverse=pair_params["ireverse"])
            cretention_test.make_digraph(ireverse=pair_params["ireverse"])

            #   c) find the upper and lower set
            cretention_train.dmolecules_inv = cretention_train.invert_dictionary(
                cretention_train.dmolecules)
            cretention_train.dcollections_inv = cretention_train.invert_dictionary(
                cretention_train.dcollections)
            cretention_test.dmolecules_inv = cretention_test.invert_dictionary(
                cretention_test.dmolecules)
            cretention_test.dcollections_inv = cretention_test.invert_dictionary(
                cretention_test.dcollections)

            #   d) get the pairs from the upper and lower sets
            pairs_train = get_pairs_from_order_graph(
                cretention_train,
                keys_train,
                allow_overlap=pair_params["allow_overlap"],
                n_jobs=n_jobs,
                d_lower=pair_params["d_lower"],
                d_upper=pair_params["d_upper"])
            pairs_train_sets.append(pairs_train)

            if all_pairs_as_test:
                pairs_test = get_pairs_from_order_graph(
                    cretention_test,
                    keys_test,
                    allow_overlap=pair_params["allow_overlap"],
                    n_jobs=n_jobs,
                    d_lower=0,
                    d_upper=np.inf)
            else:
                pairs_test = get_pairs_from_order_graph(
                    cretention_test,
                    keys_test,
                    allow_overlap=pair_params["allow_overlap"],
                    n_jobs=n_jobs,
                    d_lower=pair_params["d_lower"],
                    d_upper=pair_params["d_upper"])

            pairs_test_sets.append(pairs_test)
            n_pairs_test_sets.append(len(pairs_test))

            # 3) Extract the features from X (train and test) using the keys
            X_train_sets.append(np.array([X[key] for key in keys_train]))
            X_test_sets.append(np.array([X[key] for key in keys_test]))

        print("")

        for k_param, param in enumerate(l_params):
            # Calculate the absolute number of correctly classified pairs
            # for each fold.
            fold_scores = Parallel(n_jobs=n_jobs, verbose=False)(
                delayed(_fit_and_score_ranksvm)(param.copy(), clone(
                    estimator), X_train_sets[k_cv], X_test_sets[k_cv],
                                                pairs_train_sets[k_cv],
                                                pairs_test_sets[k_cv], scaler)
                for k_cv in range(cv.get_n_splits()))

            if fold_score_aggregation == "average":
                param_scores[k_param] = np.mean(fold_scores /
                                                np.array(n_pairs_test_sets))
            elif fold_score_aggregation == "weighted_average":
                param_scores[k_param] = np.sum(fold_scores) / np.sum(
                    n_pairs_test_sets)
            else:
                raise ValueError("Invalid fold-scoring aggregation: %s." %
                                 fold_score_aggregation)

    ## Fit model using the best parameters
    # Find the best params
    best_params = l_params[np.argmax(param_scores)].copy()

    # Fit the model using the best parameters
    best_estimator = clone(estimator)
    best_estimator.set_params(**_filter_params(best_params, best_estimator))

    # Build retention order graph
    cretention = retention_cls()
    cretention.load_data_from_target(y)
    cretention.make_digraph(ireverse=pair_params["ireverse"])
    cretention.dmolecules_inv = cretention.invert_dictionary(
        cretention.dmolecules)
    cretention.dcollections_inv = cretention.invert_dictionary(
        cretention.dcollections)

    pairs = get_pairs_from_order_graph(
        cretention,
        keys,
        allow_overlap=pair_params["allow_overlap"],
        n_jobs=n_jobs,
        d_lower=pair_params["d_lower"],
        d_upper=pair_params["d_upper"])
    n_pairs_train = len(pairs)
    X = np.array([X[key] for key in keys])

    if scaler is not None:
        X = scaler.transform(X)

    fit_params = {"FX": X, "pairs": pairs}

    best_estimator.fit(None, y=None, fit_params=fit_params)

    # Combine the mean fold scores with the list of parameter sets
    for k_param, _ in enumerate(l_params):
        l_params[k_param]["score"] = param_scores[k_param]

    return best_params, l_params, n_pairs_train, best_estimator, X, None