def check_oversamplers_classifiers(oversamplers, classifiers, random_state, n_runs):
    """Extract estimators and parameter grids."""
    # Create random states
    random_states = check_random_states(random_state, n_runs)

    # Create estimators and parameter grids
    estimators, param_grids = [], []
    for oversampler, classifier in product(oversamplers, classifiers):

        # Unpack oversamplers and classifiers
        ovs_name, ovs, ovs_param_grid = oversampler
        clf_name, clf, clf_param_grid = classifier
        if ovs is None:
            ovs = FunctionTransformer()

        # Create estimator
        name = f'{ovs_name}|{clf_name}'
        ovs_steps = ovs.steps if isinstance(ovs, Pipeline) else [(ovs_name, ovs)]
        clf_steps = clf.steps if isinstance(clf, Pipeline) else [(clf_name, clf)]
        steps = ovs_steps + clf_steps
        estimators.append((name, Pipeline(steps)))

        # Create parameter grid
        ovs_prefix = f'{name}' if isinstance(ovs, Pipeline) else f'{name}__{ovs_name}'
        ovs_param_grid = [{
            f'{ovs_prefix}__{param}': val for param, val in param_grid.items()
        } for param_grid in ParameterGrid(ovs_param_grid)]
        clf_prefix = f'{name}' if isinstance(clf, Pipeline) else f'{name}__{clf_name}'
        clf_param_grid = [{
            f'{clf_prefix}__{param}': val for param, val in param_grid.items()
        } for param_grid in ParameterGrid(clf_param_grid)]
        combinations = product(ovs_param_grid, clf_param_grid, random_states)
        for param_grid1, param_grid2, random_state in combinations:
            param_grid1.update(param_grid2)
            param_grid = {'est_name': [name]}
            for param in ovs.get_params().keys():
                if 'random_state' in param:
                    param_grid.update({f'{ovs_prefix}__{param}': [random_state]})
            for param in clf.get_params().keys():
                if 'random_state' in param:
                    param_grid.update({f'{clf_prefix}__{param}': [random_state]})
            param_grid.update({param: [val] for param, val in param_grid1.items()})
            param_grids.append(param_grid)

    return estimators, param_grids

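# Usage sketch (hedged): the function above relies on project helpers such as
# `check_random_states` and on sklearn/imblearn imports. The oversampler and
# classifier tuples, the grids, and the SMOTE/LogisticRegression choices below
# are illustrative assumptions, not part of the original code.
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

oversamplers = [
    ('none', None, {}),                               # baseline without oversampling
    ('smote', SMOTE(), {'k_neighbors': [3, 5]}),      # hypothetical grid
]
classifiers = [
    ('lr', LogisticRegression(), {'C': [0.1, 1.0]}),  # hypothetical grid
]

estimators, param_grids = check_oversamplers_classifiers(
    oversamplers, classifiers, random_state=0, n_runs=3)
# `estimators` holds ('smote|lr', Pipeline(...))-style tuples, while `param_grids`
# contains one single-value grid per parameter combination and random state.
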
def grid_search_cv(X, y, g, estimator, param_grid, folds, **kwargs):
    list_param_grid = list(ParameterGrid(param_grid))
    list_param_loss = []

    for param in list_param_grid:
        list_split_loss = []
        for split in folds:
            # Split the train and validation data
            _estimator = copy(estimator)
            X_test, y_test, g_test = [obj[split[1]] for obj in [X, y, g]]
            X_train, y_train, g_train = [obj[split[0]] for obj in [X, y, g]]

            _estimator.set_params(**param)
            _estimator.fit(X=X_train, y=y_train, g=g_train,
                           **{name: value[split[0]] for name, value in kwargs.items()})

            pred = _estimator.predict(X_test)
            tol = transformed_outcome_loss(pred, y_test, g_test)  # Minimize transformed outcome loss
            list_split_loss.append(tol)

        list_param_loss.append(np.mean(list_split_loss))

    return list_param_grid[list_param_loss.index(min(list_param_loss))]

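# Usage sketch (hedged): `grid_search_cv` expects `folds` to be a sequence of
# (train_indices, test_indices) pairs and a project-specific uplift estimator that
# accepts a treatment indicator `g` plus the helper `transformed_outcome_loss`.
# The synthetic data, KFold splitting and `my_uplift_model` below are assumptions
# made only for illustration.
import numpy as np
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
g = rng.integers(0, 2, size=200)     # treatment indicator
y = rng.normal(size=200)

folds = list(KFold(n_splits=5, shuffle=True, random_state=0).split(X))
best_params = grid_search_cv(X, y, g,
                             estimator=my_uplift_model,           # hypothetical estimator
                             param_grid={'max_depth': [2, 4, 8]}, # hypothetical grid
                             folds=folds)
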
def fit(self, X, y=None, *, groups=None, **fit_params):
    """Run the sequence of parameter searches and fit the estimator.

    :param X: Training data.
    :param y: Target values relative to X.
    :param groups: Group labels for the samples.
    :return: self
    """
    self.all_cv_results_ = {}
    self.all_best_estimator_ = {}
    self.all_best_score_ = {}
    self.all_best_params_ = {}
    self.all_best_index_ = {}
    self.all_scorer_ = {}
    self.all_n_splits_ = {}
    self.all_refit_time_ = {}
    self.all_multimetric_ = {}

    # NOTE: the parameter combinations cloned here are not used by the searches below.
    for params in ParameterGrid(self.params):
        estimator = clone(self.estimator).set_params(**params)
        result = GridSearchCV.fit

    def evaluate_candidates(searches):
        for name, search, params, *kwargs in searches:
            if len(kwargs) == 1:
                result = search(self.estimator, params, refit=True, **kwargs[0]).fit(X, y)
            else:
                result = search(self.estimator, params, refit=True).fit(X, y)

            # Save the attributes of the intermediate search results
            # TODO: Should we add a flag to just keep the results of the final
            #  optimization step? This would make the object smaller, but we could
            #  not check the plausibility of previous optimization steps.
            self.all_cv_results_[name] = result.cv_results_
            self.all_best_estimator_[name] = result.best_estimator_
            self.all_best_score_[name] = result.best_score_
            self.all_best_params_[name] = result.best_params_
            self.all_best_index_[name] = result.best_index_
            self.all_scorer_[name] = result.scorer_
            self.all_n_splits_[name] = result.n_splits_
            self.all_refit_time_[name] = result.refit_time_
            self.all_multimetric_[name] = result.multimetric_

            self.estimator = result.best_estimator_

    self._run_search(evaluate_candidates)

    return self

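# Context sketch (hedged): judging from `evaluate_candidates` above, a subclass is
# expected to provide `_run_search` and hand back a sequence of
# (name, search class, param grid[, search kwargs]) tuples, each refined search
# starting from the previous best estimator. The concrete grids and the use of
# GridSearchCV below are illustrative assumptions only.
from sklearn.model_selection import GridSearchCV

def _run_search(self, evaluate_candidates):
    evaluate_candidates([
        ('coarse', GridSearchCV, {'C': [0.01, 1, 100]}),
        ('fine', GridSearchCV, {'C': [0.5, 1, 2]}, {'cv': 5}),
    ])
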
def grid_search_cv_hurdle(X, y, g, estimator, param_grid_conversion, param_grid_regression,
                          folds, **kwargs):
    list_param_grid = list(
        itertools.product(list(ParameterGrid(param_grid_conversion)),
                          list(ParameterGrid(param_grid_regression))))
    list_param_loss = []

    for param in list_param_grid:
        list_split_loss = []
        for split in folds:
            # Split the train and validation data
            _estimator = copy(estimator)
            X_test, y_test, g_test = [obj[split[1]] for obj in [X, y, g]]
            X_train, y_train, g_train = [obj[split[0]] for obj in [X, y, g]]

            for model in [_estimator.treatment_group_model, _estimator.control_group_model]:
                model.conversion_classifier.set_params(**param[0])
                model.value_regressor.set_params(**param[1])

            _estimator.fit(X=X_train, y=y_train, g=g_train,
                           **{name: value[split[0]] for name, value in kwargs.items()})

            pred = _estimator.predict(X_test)
            tol = transformed_outcome_loss(pred, y_test, g_test)  # Minimize transformed outcome loss
            list_split_loss.append(tol)

        list_param_loss.append(np.mean(list_split_loss))

    return list_param_grid[list_param_loss.index(min(list_param_loss))]

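# Usage sketch (hedged): the hurdle estimator with `treatment_group_model` /
# `control_group_model`, each holding a `conversion_classifier` and a
# `value_regressor`, is project-specific; `my_hurdle_model` and the grids below
# are illustrative. `folds` is built as in the previous sketch.
best_conv_params, best_reg_params = grid_search_cv_hurdle(
    X, y, g, estimator=my_hurdle_model,               # hypothetical estimator
    param_grid_conversion={'C': [0.1, 1.0]},          # classifier grid
    param_grid_regression={'alpha': [0.01, 0.1]},     # regressor grid
    folds=folds)
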
def data_table(model, scenario_inputs, outputs):
    '''Create n-inputs by m-outputs data table.

    Parameters
    ----------
    model : object
        User defined object containing the appropriate methods and properties
        for computing outputs from inputs
    scenario_inputs : dict of str to sequence
        Keys are input variable names and values are sequences of values for
        each scenario for this variable. Is consumed by the scikit-learn
        ParameterGrid() function. See
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html
    outputs : list of str
        List of output variable names

    Returns
    -------
    results_df : pandas DataFrame
        Contains values of all outputs for every combination of scenario inputs
    '''

    # Clone the model using deepcopy
    model_clone = copy.deepcopy(model)

    # Create parameter grid
    dt_param_grid = list(ParameterGrid(scenario_inputs))

    # Create the table as a list of dictionaries
    results = []

    # Loop over the scenarios
    for params in dt_param_grid:
        # Update the model clone with scenario specific values
        model_clone.update(params)
        # Create a result dictionary based on a copy of the scenario inputs
        result = copy.copy(params)
        # Loop over the list of requested outputs
        for output in outputs:
            # Compute the output
            out_val = getattr(model_clone, output)()
            # Add the output to the result dictionary
            result[output] = out_val
        # Append the result dictionary to the results list
        results.append(result)

    # Convert the results list (of dictionaries) to a pandas DataFrame and return it
    results_df = pd.DataFrame(results)
    return results_df

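# Usage sketch (hedged): a minimal model object exposing the `update()` method and
# callable output attributes that `data_table` expects. The `ProfitModel` class and
# its numbers are made up for illustration.
class ProfitModel:
    def __init__(self, price=10.0, demand=100, unit_cost=6.0):
        self.price = price
        self.demand = demand
        self.unit_cost = unit_cost

    def update(self, params):
        # Overwrite attributes with the scenario-specific values
        for name, value in params.items():
            setattr(self, name, value)

    def profit(self):
        return (self.price - self.unit_cost) * self.demand

scenarios = {'price': [9.0, 10.0, 11.0], 'unit_cost': [5.0, 6.0]}
profit_table = data_table(ProfitModel(), scenarios, outputs=['profit'])
# -> one row per (price, unit_cost) combination with a 'profit' column
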
def check_param_grids(param_grids, est_names):
    """Check the parameter grids to use with parametrized estimators."""
    # Check the parameter grids
    flat_param_grids = [
        param_grid for param_grid in list(ParameterGrid(param_grids))
        if param_grid
    ]

    # Append existing estimators names
    param_grids = []
    for param_grid in flat_param_grids:

        # Get estimator name
        est_name = param_grid.pop('est_name', None)

        # Modify values
        param_grid = {param: [val] for param, val in param_grid.items()}

        # Check estimators prefixes
        params_prefixes = set(
            [param.split('__')[0] for param in param_grid.keys()])
        if not params_prefixes.issubset(est_names):
            raise ValueError(
                'Parameters prefixes are not subset of parameter `est_names`.')
        if len(params_prefixes) > 1:
            raise ValueError('Parameters prefixes are not unique.')
        if est_name is not None and len(params_prefixes.union([est_name])) > 1:
            raise ValueError(
                'Parameters prefixes and parameter `est_name` are not unique.')
        param_grid['est_name'] = ([est_name] if est_name is not None
                                  else list(params_prefixes))

        # Append parameter grid
        param_grids.append(param_grid)

    # Append missing estimators names
    current_est_names = set(
        [param_grid['est_name'][0] for param_grid in param_grids])
    missing_est_names = set(est_names).difference(current_est_names)
    for est_name in missing_est_names:
        param_grids.append({'est_name': [est_name]})

    return param_grids

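# Usage sketch (hedged): the grid values below are illustrative. With
# est_names = ['lr', 'dt'], a grid that only covers 'lr' is flattened into
# single-value grids, and a bare {'est_name': ['dt']} entry is appended for the
# estimator that was left without parameters.
checked_grids = check_param_grids(
    [{'est_name': ['lr'], 'lr__C': [0.1, 1.0]}], est_names=['lr', 'dt'])
# Expected shape of the result (ordering of entries may differ):
# [{'lr__C': [0.1], 'est_name': ['lr']},
#  {'lr__C': [1.0], 'est_name': ['lr']},
#  {'est_name': ['dt']}]
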
def fit(self, X, y=None, groups=None):
    """Run fit with all sets of parameters.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.
    """
    return self._fit(X, y, groups, ParameterGrid(self.param_grid))

def _run_search(self, evaluate_candidates):
    """Search all candidates in param_grid."""
    evaluate_candidates(ParameterGrid(self.param_grid))

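# Context sketch (hedged): in scikit-learn's BaseSearchCV (>= 0.20), `fit` calls
# `_run_search` with an `evaluate_candidates` callback, so the one-liner above is
# all a grid-based searcher needs. A minimal illustrative subclass (note that
# BaseSearchCV lives in a private module) might look like:
from sklearn.model_selection._search import BaseSearchCV
from sklearn.model_selection import ParameterGrid

class SimpleGridSearch(BaseSearchCV):
    def __init__(self, estimator, param_grid, **kwargs):
        super().__init__(estimator=estimator, **kwargs)
        self.param_grid = param_grid

    def _run_search(self, evaluate_candidates):
        # Evaluate every candidate in the (possibly list-of-dicts) grid
        evaluate_candidates(ParameterGrid(self.param_grid))
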
def fit(self, X, y=None, groups=None):
    estimator = self.default_estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)

    # Regenerate parameter iterable for each fit
    candidate_params = ParameterGrid(self.param_grid)
    n_candidates = len(candidate_params)
    candidate_untrainable_params = ParameterGrid(self.untrainable_param_grid)
    untrainable_candidates = len(candidate_untrainable_params)
    self.logger.i("[CV] Fitting {} folds for each of {} candidates, totalling"
                  " {} fits".format(n_splits, n_candidates, n_candidates * n_splits))

    base_estimator = clone(self.default_estimator)
    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.logger.level * 20,
        pre_dispatch=pre_dispatch
    )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.me,
                              train, test, self.logger, parameters,
                              candidate_untrainable_params,
                              return_n_test_samples=True,
                              return_times=True)
      for train, test in cv.split(X, y, groups)
      for parameters in candidate_params)

    out = np.vstack([o for o in out])
    test_accuracy = out[:, 0]
    test_precision = out[:, 1]
    test_recall = out[:, 2]
    test_f1 = out[:, 3]
    test_distance = out[:, 4]
    test_count = out[:, 5]
    test_count_pct = out[:, 6]
    test_raw_count = out[:, 7]
    test_raw_count_pct = out[:, 8]
    test_density = out[:, 9]
    test_raw_density = out[:, 10]
    test_sample_counts = out[:, 11]
    fit_time = out[:, 12]
    score_time = out[:, 13]

    results = dict()
    n_tot_candidates = n_candidates * untrainable_candidates
    tot_candidate_params = list(itertools.product(list(candidate_params),
                                                  list(candidate_untrainable_params)))

    def _store(key_name, array, weights=None, splits=False, rank=False, error=False):
        """A small helper to store the scores/times to the cv_results_"""
        # When iterated first by splits, then by parameters
        array = np.array(array, dtype=np.float64).reshape(n_splits, n_tot_candidates).T
        if splits:
            for split_i in range(n_splits):
                results["split%d_%s" % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis]) ** 2,
                                        axis=1, weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            arr = array_means if error else -array_means
            results["rank_%s" % key_name] = np.asarray(rankdata(arr, method='min'),
                                                       dtype=np.int32)

    # Compute the (weighted) mean and std for test scores alone
    # NOTE: test_sample_counts (weights) remain the same for all candidates
    if self.iid:
        test_sample_counts = np.array(test_sample_counts[::n_tot_candidates], dtype=int)
    else:
        test_sample_counts = None

    _store('accuracy_score', test_accuracy, splits=True, rank=True,
           weights=test_sample_counts)
    _store('precision_score', test_precision, splits=True, rank=True,
           weights=test_sample_counts)
    _store('recall_score', test_recall, splits=True, rank=True,
           weights=test_sample_counts)
    _store('f1_score', test_f1, splits=True, rank=True,
           weights=test_sample_counts)
    _store('distance_mae', test_distance, splits=True, rank=True,
           weights=test_sample_counts, error=True)
    _store('count_mae', test_count, splits=True, rank=True,
           weights=test_sample_counts, error=True)
    _store('count_pct_mae', test_count_pct, splits=True, rank=True,
           weights=test_sample_counts, error=True)
    _store('raw_count_mae', test_raw_count, splits=True, rank=True,
           weights=test_sample_counts, error=True)
    _store('raw_count_pct_mae', test_raw_count_pct, splits=True, rank=True,
           weights=test_sample_counts, error=True)
    _store('density_mae', test_density, splits=True, rank=True,
           weights=test_sample_counts, error=True)
    _store('raw_density_mae', test_raw_density, splits=True, rank=True,
           weights=test_sample_counts, error=True)
    _store('fit_time', fit_time)
    _store('score_time', score_time)

    results['rank_custom'] = np.asarray(
        rankdata((results['rank_f1_score'] + results['rank_count_pct_mae']) / 2,
                 method='min'), dtype=np.int32)

    best_index = np.flatnonzero(results['rank_custom'] == 1)[0]
    best_parameters = tot_candidate_params[best_index]

    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(partial(MaskedArray,
                                        np.empty(n_tot_candidates,),
                                        mask=True,
                                        dtype=object))
    for cand_i, params in enumerate(tot_candidate_params):
        params = merge_dicts(*params)
        for name, value in params.items():
            # An all masked empty array gets created for the key
            # `"param_%s" % name` at the first occurrence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)

    # Store a list of param dicts at the key 'params'
    results['params'] = tot_candidate_params

    self.cv_results_ = results
    self.best_index_ = best_index
    self.n_splits_ = n_splits

    if self.refit:
        bp = best_parameters[0]
        bp.update(best_parameters[1])
        # Fit the best estimator using the entire dataset
        # Clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**bp)
        best_estimator.fit(X, y)
        self.best_estimator_ = best_estimator

    return self

def _get_param_iterator(self):
    """Return ParameterGrid instance for the given param_grid."""
    return ParameterGrid(self.param_grid)

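# Context sketch (hedged): `_get_param_iterator` mirrors scikit-learn's GridSearchCV.
# ParameterGrid simply expands a dict (or list of dicts) of value lists into the
# Cartesian product of settings, e.g.:
from sklearn.model_selection import ParameterGrid

list(ParameterGrid({'C': [1, 10], 'gamma': [0.1]}))
# -> [{'C': 1, 'gamma': 0.1}, {'C': 10, 'gamma': 0.1}]
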
def simulate(model, random_inputs, outputs, scenario_inputs=None, keep_random_inputs=False):
    '''Simulate model for one or more scenarios

    Parameters
    ----------
    model : object
        User defined object containing the appropriate methods and properties
        for computing outputs from inputs
    random_inputs : dict of str to sequence of random variates
        Keys are stochastic input variable names and values are sequences of $n$
        random variates, where $n$ is the number of simulation replications
    outputs : list of str
        List of output variable names
    scenario_inputs : optional (default is None), dict of str to sequence
        Keys are deterministic input variable names and values are sequences of
        values for each scenario for this variable. Is consumed by the
        scikit-learn ParameterGrid() function. See
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html
    keep_random_inputs : optional (default is False), boolean
        If True, all the random input variates are included in the results

    Returns
    -------
    results : list of dict
        Simulation outputs for every replication. If `scenario_inputs` is not
        None, one entry per combination of scenario inputs.
    '''

    # Clone the model
    model_clone = copy.deepcopy(model)

    # Update clone with random_inputs
    model_clone.update(random_inputs)

    # Store raw simulation input values if desired
    if keep_random_inputs:
        scenario_base_vals = vars(model_clone)
    else:
        scenario_base_vals = vars(model)

    # Initialize output counters and containers
    scenario_num = 0
    scenario_results = []

    # Check if multiple scenarios
    if scenario_inputs is not None:
        # Create parameter grid for scenario inputs
        sim_param_grid = list(ParameterGrid(scenario_inputs))

        # Scenario loop
        for params in sim_param_grid:
            model_clone.update(params)
            # Initialize scenario related outputs
            result = {}
            scenario_vals = copy.copy(params)
            result['scenario_base_vals'] = scenario_base_vals
            result['scenario_num'] = scenario_num
            result['scenario_vals'] = scenario_vals
            raw_output = {}

            # Output measure loop
            for output_name in outputs:
                output_array = getattr(model_clone, output_name)()
                raw_output[output_name] = output_array

            # Gather results for this scenario
            result['output'] = raw_output
            scenario_results.append(result)
            scenario_num += 1

        return scenario_results

    else:
        # Similar logic to above, but only a single scenario
        results = []
        result = {}

        result['scenario_base_vals'] = scenario_base_vals
        result['scenario_num'] = scenario_num
        result['scenario_vals'] = {}

        raw_output = {}
        for output_name in outputs:
            output_array = getattr(model_clone, output_name)()
            raw_output[output_name] = output_array

        result['output'] = raw_output
        results.append(result)

        return results

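# Usage sketch (hedged): `simulate` expects a model whose `update()` accepts a dict
# and whose output methods operate on the stored (possibly vector-valued) inputs.
# The reuse of the ProfitModel-style object from the earlier sketch and the random
# demand below are illustrative assumptions.
import numpy as np

rng = np.random.default_rng(1)
random_inputs = {'demand': rng.normal(100, 10, size=1000)}   # 1000 replications
scenarios = {'price': [9.0, 10.0, 11.0]}

sim_results = simulate(ProfitModel(), random_inputs, outputs=['profit'],
                       scenario_inputs=scenarios)
# -> list with one dict per scenario; each dict's 'output' entry maps
#    'profit' to an array of 1000 replication values.
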
def find_hparam_regression(estimator, X, y, param_grid, cv, scaler=None, n_jobs=1):
    """
    Task: find the hyper-parameter from a set of parameters (param_grid) that
          performs best in a cross-validation setting for the given estimator.

    :param estimator: Estimator object, e.g. KernelRankSVC

    :param X: dictionary, (mol-id, system)-tuples as keys and molecular
        features as values:

        Example:
        {("M1", "S1"): feat_11, ...}

    :param y: dictionary, (mol-id, system)-tuples as keys and retention
        times as values

        Example:
        {("M1", "S1"): rt_11, ...}

    :param param_grid: dictionary, defining the grid-search space
        "C": Trade-off parameter for the SVM
        "gamma": width of the rbf/gaussian kernel
        ... etc. ...

        Example:
        {"C": [0.1, 1, 10], "gamma": [0.1, 0.25, 0.5, 1]}

    :param cv: cross-validation generator, see sklearn package, must be
        either a GroupKFold or GroupShuffleSplit object.

    :param scaler: scaler object, per feature scaler, e.g. MinMaxScaler

    :param n_jobs: int, number of jobs run in parallel. Parallelization is
        performed over the cv-folds.

    :return: dictionary, containing the combination of best parameters
        Example: {"C": 1, "gamma": 0.25}

        dictionary, all parameter combinations with corresponding scores
        Example: [{"C": 1, "gamma": 0.25, "score": 0.98},
                  {"C": 1, "gamma": 0.50, "score": 0.94}, ...]

        scalar, number of pairs used to train the final model

        estimator object, fitted using the best parameters
    """
    if len(X) != len(y) or len(X.keys() - y.keys()) or len(y.keys() - X.keys()):
        raise ValueError("Keys-set for features and retention times must be equal.")

    if not isinstance(estimator, SVRPairwise):
        raise ValueError("Currently parameters can only be estimated for the support "
                         "vector regression class 'SVRPairwise'.")

    # Make a list of all combinations of parameters
    l_params = list(ParameterGrid(param_grid))
    param_scores = np.zeros((len(l_params),))

    # Get all (mol-id, system)-tuples used for the parameter search
    keys = list(X.keys())
    mol_ids = list(zip(*keys))[0]
    cv_splits = cv.split(range(len(keys)), groups=mol_ids)

    # Precompute the training / test targets to save computation time as
    # we do not need to repeat this for several parameter settings.
    # cv_splits = cv.split(range(len(keys)))
    y_train_sets, y_test_sets = [], []
    X_train_sets, X_test_sets = [], []

    print("Get pairs for hparam estimation: ", end="", flush=True)
    for k_cv, (train_set, test_set) in enumerate(cv_splits):
        print("%d " % k_cv, end="", flush=True)

        # 0) Get keys (mol-id, system)-tuples, corresponding to the training
        #    and test sets.
        keys_train = [keys[idx] for idx in train_set]
        keys_test = [keys[idx] for idx in test_set]

        # Check for overlap of molecular ids, e.g. InChIs. Between training and test,
        # molecular ids should not be shared, e.g. if they appear in different systems
        # at the same time.
        mol_ids_train = [mol_ids[idx] for idx in train_set]
        mol_ids_test = [mol_ids[idx] for idx in test_set]

        if set(mol_ids_train) & set(mol_ids_test):
            if isinstance(cv, GroupKFold) or isinstance(cv, GroupShuffleSplit):
                raise RuntimeError(
                    "As grouped cross-validation is used the training "
                    "and test molecules, i.e. mol_ids, are not allowed "
                    "to overlap. This can happen if molecular structures "
                    "are appearing in different systems. During the "
                    "learning of hyper-parameter the training set should "
                    "not contain any structure also in the test set.",
                    set(mol_ids_train) & set(mol_ids_test))
            else:
                print("Training and test keys overlap.",
                      set(mol_ids_train) & set(mol_ids_test))

        # 1) Extract the target values from y (train and test) using the keys
        y_train_sets.append(np.array([y[key] for key in keys_train]))
        y_test_sets.append(np.array([y[key] for key in keys_test]))

        # 2) Extract the features from X (train and test) using the keys
        X_train_sets.append(np.array([X[key] for key in keys_train]))
        X_test_sets.append(np.array([X[key] for key in keys_test]))

    print("")

    for k_param, param in enumerate(l_params):
        fold_scores = Parallel(n_jobs=n_jobs, verbose=False)(
            delayed(_fit_and_score_regression)(
                param, clone(estimator), X_train_sets[k_cv], X_test_sets[k_cv],
                y_train_sets[k_cv], y_test_sets[k_cv], scaler)
            for k_cv in range(cv.get_n_splits()))

        param_scores[k_param] = np.mean(fold_scores)

    ## Fit model using the best parameters
    # Find the best params
    best_params = l_params[np.argmax(param_scores)].copy()

    # Fit the model using the best parameters
    best_estimator = clone(estimator)
    best_estimator.set_params(**_filter_params(best_params, best_estimator))

    X = np.array([X[key] for key in keys])
    y = np.array([y[key] for key in keys])

    if scaler is not None:
        X = scaler.transform(X)

    best_estimator.fit(X, y)

    # Combine the mean fold scores with the list of parameter sets
    for k_param, _ in enumerate(l_params):
        l_params[k_param]["score"] = param_scores[k_param]

    return best_params, l_params, -1, best_estimator

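# Usage sketch (hedged): the feature/target dictionaries, the SVRPairwise estimator
# and the helper `_fit_and_score_regression` come from the surrounding project; the
# keys, grid and GroupShuffleSplit configuration below are illustrative assumptions.
from sklearn.model_selection import GroupShuffleSplit

cv = GroupShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
best_params, all_params, _, fitted_svr = find_hparam_regression(
    estimator=SVRPairwise(),                  # project-specific estimator
    X=X_dict, y=y_dict,                       # {(mol_id, system): value} dicts
    param_grid={'C': [0.1, 1, 10], 'gamma': [0.1, 0.25]},
    cv=cv)
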
def find_hparan_ranksvm(estimator, X, y, param_grid, cv, pair_params, scaler=None,
                        n_jobs=1, fold_score_aggregation="weighted_average",
                        all_pairs_as_test=True):
    """
    Task: find the hyper-parameter from a set of parameters (param_grid) that
          performs best in a cross-validation setting for the given estimator.

    :param estimator: Estimator object, e.g. KernelRankSVC

    :param X: dictionary, (mol-id, system)-tuples as keys and molecular
        features as values:

        Example:
        {("M1", "S1"): feat_11, ...}

    :param y: dictionary, (mol-id, system)-tuples as keys and retention
        times as values

        Example:
        {("M1", "S1"): rt_11, ...}

    :param param_grid: dictionary, defining the grid-search space
        "C": Trade-off parameter for the SVM
        "gamma": width of the rbf/gaussian kernel
        ... etc. ...

        Example:
        {"C": [0.1, 1, 10], "gamma": [0.1, 0.25, 0.5, 1]}

    :param cv: cross-validation generator, see sklearn package, must be
        either a GroupKFold or GroupShuffleSplit object.

    :param pair_params: dictionary, specifying parameters for the order graph:
        "ireverse": scalar, should cross-system elution transitivity be included
            0: no, 1: yes
        "d_lower": scalar, minimum distance of two molecules in the elution order
            graph to be considered as a pair.
        "d_upper": scalar, maximum distance of two molecules in the elution order
            graph to be considered as a pair.
        "allow_overlap": scalar, should overlap between the upper and lower sets
            be allowed. Those overlaps originate from retention order
            contradictions between the different systems.

    :param scaler: scaler object, per feature scaler, e.g. MinMaxScaler

    :param n_jobs: integer, number of jobs run in parallel. Parallelization is
        performed over the cv-folds. (default = 1)

    :param fold_score_aggregation: string, (default = "weighted_average")

    :param all_pairs_as_test: boolean, should all possible pairs
        (d_lower = 0, d_upper = np.inf) be used during the test. If 'False', the
        corresponding values are taken from the 'pair_params' dictionary.
        (default = True)

    :return: dictionary, containing the combination of best parameters
        Example: {"C": 1, "gamma": 0.25}

        dictionary, all parameter combinations with corresponding scores
        Example: [{"C": 1, "gamma": 0.25, "score": 0.98},
                  {"C": 1, "gamma": 0.50, "score": 0.94}, ...]

        scalar, number of pairs used to train the final model

        estimator object, fitted using the best parameters
    """
    if not (isinstance(cv, GroupKFold) or isinstance(cv, GroupShuffleSplit)):
        raise ValueError("Cross-validation generator must be either of "
                         "class 'GroupKFold' or 'GroupShuffleSplit'. "
                         "Provided class is '%s'." % cv.__class__.__name__)

    if len(X) != len(y) or len(X.keys() - y.keys()) or len(y.keys() - X.keys()):
        raise ValueError("Keys-set for features and retention times must be equal.")

    # Make a list of all combinations of parameters
    l_params = list(ParameterGrid(param_grid))
    param_scores = np.zeros((len(l_params),))

    # Get all (mol-id, system)-tuples used for the parameter search
    keys = list(X.keys())

    if len(l_params) > 1:
        mol_ids = list(zip(*keys))[0]
        cv_splits = cv.split(range(len(keys)), groups=mol_ids)

        # Precompute the training / test pairs to save computation time as
        # we do not need to repeat this for several parameter settings.
        pairs_train_sets, pairs_test_sets = [], []
        X_train_sets, X_test_sets = [], []
        n_pairs_test_sets = []

        print("Get pairs for hparam estimation: ", end="", flush=True)
        for k_cv, (train_set, test_set) in enumerate(cv_splits):
            print("%d " % k_cv, end="", flush=True)

            # 0) Get keys (mol-id, system)-tuples, corresponding to the training
            #    and test sets.
            keys_train = [keys[idx] for idx in train_set]
            keys_test = [keys[idx] for idx in test_set]

            # Check for overlap of molecular ids, e.g. InChIs. Between training and test,
            # molecular ids should not be shared, e.g. if they appear in different systems
            # at the same time.
            mol_ids_train = [mol_ids[idx] for idx in train_set]
            mol_ids_test = [mol_ids[idx] for idx in test_set]

            if set(mol_ids_train) & set(mol_ids_test):
                if isinstance(cv, GroupKFold) or isinstance(cv, GroupShuffleSplit):
                    raise RuntimeError(
                        "As grouped cross-validation is used the training "
                        "and test molecules, i.e. mol_ids, are not allowed "
                        "to overlap. This can happen if molecular structures "
                        "are appearing in different systems. During the "
                        "learning of hyper-parameter the training set should "
                        "not contain any structure also in the test set.",
                        set(mol_ids_train) & set(mol_ids_test))
                else:
                    print("Training and test keys overlap.",
                          set(mol_ids_train) & set(mol_ids_test))

            # 1) Extract the target values from y (train and test) using the keys
            y_train, y_test = OrderedDict(), OrderedDict()
            for key in keys_train:
                y_train[key] = y[key]
            for key in keys_test:
                y_test[key] = y[key]

            # 2) Calculate the pairs (train and test)
            cretention_train, cretention_test = retention_cls(), retention_cls()

            # a) load 'lrows' in the retention_cls
            cretention_train.load_data_from_target(y_train)
            cretention_test.load_data_from_target(y_test)

            # b) build the digraph
            cretention_train.make_digraph(ireverse=pair_params["ireverse"])
            cretention_test.make_digraph(ireverse=pair_params["ireverse"])

            # c) find the upper and lower set
            cretention_train.dmolecules_inv = cretention_train.invert_dictionary(
                cretention_train.dmolecules)
            cretention_train.dcollections_inv = cretention_train.invert_dictionary(
                cretention_train.dcollections)
            cretention_test.dmolecules_inv = cretention_test.invert_dictionary(
                cretention_test.dmolecules)
            cretention_test.dcollections_inv = cretention_test.invert_dictionary(
                cretention_test.dcollections)

            # d) get the pairs from the upper and lower sets
            pairs_train = get_pairs_from_order_graph(
                cretention_train, keys_train,
                allow_overlap=pair_params["allow_overlap"], n_jobs=n_jobs,
                d_lower=pair_params["d_lower"], d_upper=pair_params["d_upper"])
            pairs_train_sets.append(pairs_train)

            if all_pairs_as_test:
                pairs_test = get_pairs_from_order_graph(
                    cretention_test, keys_test,
                    allow_overlap=pair_params["allow_overlap"], n_jobs=n_jobs,
                    d_lower=0, d_upper=np.inf)
            else:
                pairs_test = get_pairs_from_order_graph(
                    cretention_test, keys_test,
                    allow_overlap=pair_params["allow_overlap"], n_jobs=n_jobs,
                    d_lower=pair_params["d_lower"], d_upper=pair_params["d_upper"])
            pairs_test_sets.append(pairs_test)
            n_pairs_test_sets.append(len(pairs_test))

            # 3) Extract the features from X (train and test) using the keys
            X_train_sets.append(np.array([X[key] for key in keys_train]))
            X_test_sets.append(np.array([X[key] for key in keys_test]))

        print("")

        for k_param, param in enumerate(l_params):
            # Calculate the absolute number of correctly classified pairs
            # for each fold.
            fold_scores = Parallel(n_jobs=n_jobs, verbose=False)(
                delayed(_fit_and_score_ranksvm)(
                    param.copy(), clone(estimator), X_train_sets[k_cv],
                    X_test_sets[k_cv], pairs_train_sets[k_cv],
                    pairs_test_sets[k_cv], scaler)
                for k_cv in range(cv.get_n_splits()))

            if fold_score_aggregation == "average":
                param_scores[k_param] = np.mean(fold_scores / np.array(n_pairs_test_sets))
            elif fold_score_aggregation == "weighted_average":
                param_scores[k_param] = np.sum(fold_scores) / np.sum(n_pairs_test_sets)
            else:
                raise ValueError("Invalid fold-scoring aggregation: %s."
                                 % fold_score_aggregation)

    ## Fit model using the best parameters
    # Find the best params
    best_params = l_params[np.argmax(param_scores)].copy()

    # Fit the model using the best parameters
    best_estimator = clone(estimator)
    best_estimator.set_params(**_filter_params(best_params, best_estimator))

    # Build retention order graph
    cretention = retention_cls()
    cretention.load_data_from_target(y)
    cretention.make_digraph(ireverse=pair_params["ireverse"])
    cretention.dmolecules_inv = cretention.invert_dictionary(cretention.dmolecules)
    cretention.dcollections_inv = cretention.invert_dictionary(cretention.dcollections)

    pairs = get_pairs_from_order_graph(
        cretention, keys, allow_overlap=pair_params["allow_overlap"], n_jobs=n_jobs,
        d_lower=pair_params["d_lower"], d_upper=pair_params["d_upper"])
    n_pairs_train = len(pairs)

    X = np.array([X[key] for key in keys])
    if scaler is not None:
        X = scaler.transform(X)

    fit_params = {"FX": X, "pairs": pairs}
    best_estimator.fit(None, y=None, fit_params=fit_params)

    # Combine the mean fold scores with the list of parameter sets
    for k_param, _ in enumerate(l_params):
        l_params[k_param]["score"] = param_scores[k_param]

    return best_params, l_params, n_pairs_train, best_estimator, X, None

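# Usage sketch (hedged): as above, KernelRankSVC, retention_cls and the pair helpers
# are project-specific; the pair_params values, data dicts and GroupKFold setup are
# illustrative assumptions only.
from sklearn.model_selection import GroupKFold

pair_params = {'ireverse': 1, 'd_lower': 0, 'd_upper': 16, 'allow_overlap': 1}
best_params, all_params, n_pairs_train, ranksvm, FX, _ = find_hparan_ranksvm(
    estimator=KernelRankSVC(),                # project-specific estimator
    X=X_dict, y=y_dict,                       # {(mol_id, system): value} dicts
    param_grid={'C': [0.1, 1, 10]},
    cv=GroupKFold(n_splits=3),
    pair_params=pair_params,
    n_jobs=4)
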