Example 1
        for i_split, (train_set,
                      _) in enumerate(rd_split.split(range(len(rts)))):
            print("Process split: %d/%d." %
                  (i_split + 1, rd_split.get_n_splits()))

            n_spectra.append(len(train_set))

            # Shuffle split does not preserve the order of the examples when sub-setting.
            train_set = np.sort(train_set)

            rts_train = rts[train_set]
            wtx_train = wtx[train_set]

            pairs, _ = get_pairs_single_system(rts_train,
                                               d_lower=0,
                                               d_upper=np.inf,
                                               return_rt_differences=True)

            # Calculate the pairwise accuracy
            score = 0.0
            for i, j in pairs:
                if wtx_train[i] < wtx_train[j]:
                    score += 1.0
            if len(pairs) > 0:
                score /= len(pairs)

            print("Kendall tau=%f, Spearmanr=%f, pairwise acc=%f" %
                  (sp.stats.kendalltau(wtx_train, rts_train)[0],
                   sp.stats.spearmanr(wtx_train, rts_train)[0], score))

            # 3) Perform the reranking of the candidates
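The loop above simply counts the fraction of correctly ordered pairs. When all pairs are used (d_lower=0, d_upper=np.inf) and there are no ties, this pairwise accuracy relates to Kendall's tau as acc = (tau + 1) / 2. A minimal, self-contained check of that relation (the retention times and scores below are made up for illustration and are not from the original data):

import numpy as np
import scipy as sp
import scipy.stats
from itertools import combinations

rts = np.array([2.0, 4.0, 6.0, 8.0, 10.0])   # hypothetical retention times
wtx = np.array([0.1, 0.5, 0.3, 0.9, 1.2])    # hypothetical predicted order scores

# All pairs (i, j) with rts[i] < rts[j], i.e., i elutes before j.
pairs = [(i, j) for i, j in combinations(range(len(rts)), 2) if rts[i] < rts[j]]
acc = np.mean([wtx[i] < wtx[j] for i, j in pairs])

tau = sp.stats.kendalltau(wtx, rts)[0]
assert np.isclose(acc, (tau + 1) / 2)         # holds when all pairs are used and there are no ties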
Example 2
    def test_equal_to_simple_function_in_single_system_case(self):
        cretention = retention_cls()

        # ----------------------------------------------
        d_target = OrderedDict([(("M1", "A"), 10), (("M2", "A"), 4),
                                (("M3", "A"), 6), (("M4", "A"), 8),
                                (("M5", "A"), 2)])
        keys = list(d_target.keys())

        cretention.load_data_from_target(d_target)
        cretention.make_digraph()
        cretention.dmolecules_inv = cretention.invert_dictionary(
            cretention.dmolecules)
        cretention.dcollections_inv = cretention.invert_dictionary(
            cretention.dcollections)

        d_pairs_ref = {
            0: [],
            1: [(4, 1), (1, 2), (2, 3), (3, 0)],
            2: [(4, 1), (1, 2), (2, 3), (3, 0), (4, 2), (1, 3), (2, 0)],
            3: [(4, 1), (1, 2), (2, 3), (3, 0), (4, 2), (1, 3), (2, 0), (4, 3),
                (1, 0)],
            4: [(4, 1), (1, 2), (2, 3), (3, 0), (4, 2), (1, 3), (2, 0), (4, 3),
                (1, 0), (4, 0)]
        }

        for d in d_pairs_ref.keys():
            pairs_og = get_pairs_from_order_graph(cretention,
                                                  keys,
                                                  allow_overlap=True,
                                                  d_lower=0,
                                                  d_upper=d)
            pairs = get_pairs_single_system(list(d_target.values()),
                                            d_lower=0,
                                            d_upper=d)

            self.assertEqual(len(pairs_og), len(d_pairs_ref[d]))
            self.assertEqual(len(pairs), len(d_pairs_ref[d]))

            for pair in d_pairs_ref[d]:
                self.assertIn(pair, pairs_og)
                self.assertIn(pair, pairs)

        # ----------------------------------------------
        d_target = OrderedDict([(("M1", "A"), 10), (("M2", "A"), 4),
                                (("M3", "A"), 6), (("M4", "A"), 8),
                                (("M5", "A"), 2)])
        keys = list(d_target.keys())

        cretention.load_data_from_target(d_target)
        cretention.make_digraph()
        cretention.dmolecules_inv = cretention.invert_dictionary(
            cretention.dmolecules)
        cretention.dcollections_inv = cretention.invert_dictionary(
            cretention.dcollections)

        d_pairs_ref = {
            5: [],
            4: [(4, 0)],
            3: [(4, 0), (4, 3), (1, 0)],
            2: [(4, 0), (4, 3), (1, 0), (4, 2), (1, 3), (2, 0)],
            1: [(4, 0), (4, 3), (1, 0), (4, 2), (1, 3), (2, 0), (4, 1), (1, 2),
                (2, 3), (3, 0)],
            0: [(4, 0), (4, 3), (1, 0), (4, 2), (1, 3), (2, 0), (4, 1), (1, 2),
                (2, 3), (3, 0)]
        }

        for d in d_pairs_ref.keys():
            pairs_og = get_pairs_from_order_graph(cretention,
                                                  keys,
                                                  allow_overlap=True,
                                                  d_lower=d,
                                                  d_upper=np.inf)
            pairs = get_pairs_single_system(list(d_target.values()),
                                            d_lower=d,
                                            d_upper=np.inf)

            self.assertEqual(len(pairs_og), len(d_pairs_ref[d]))
            self.assertEqual(len(pairs), len(d_pairs_ref[d]))

            for pair in d_pairs_ref[d]:
                self.assertIn(pair, pairs_og)
                self.assertIn(pair, pairs)
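Judging from the reference pairs in this test, get_pairs_single_system appears to return a pair (i, j) whenever example i elutes before example j and their distance in the elution order (rank distance, not retention-time difference) lies between d_lower and d_upper, both inclusive. A rough reference sketch of that behaviour, inferred from the test above rather than taken from the library:

import numpy as np

def pairs_by_rank_distance(rts, d_lower=0, d_upper=np.inf):
    # Rank of each example in the elution order (0 = earliest eluting).
    order = np.argsort(rts)
    rank = np.empty(len(rts), dtype=int)
    rank[order] = np.arange(len(rts))

    pairs = []
    for i in range(len(rts)):
        for j in range(len(rts)):
            if rts[i] >= rts[j]:
                continue
            d = rank[j] - rank[i]
            if d_lower <= d <= d_upper:
                pairs.append((i, j))
    return pairs

# Yields the same set of pairs as the d_upper=1 reference above,
# {(4, 1), (1, 2), (2, 3), (3, 0)}, for the retention times 10, 4, 6, 8, 2.
print(pairs_by_rank_distance([10, 4, 6, 8, 2], d_lower=0, d_upper=1))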
Example 3
def evaluate_on_target_systems(target_systems,
                               training_systems,
                               predictor,
                               pair_params,
                               kernel_params,
                               opt_params,
                               input_dir,
                               estimator,
                               feature_type,
                               n_jobs=1,
                               perc_for_training=100):
    """
    Task: Evaluate rank-correlation, accuracy, etc. by learning an order predictor using the given
          set of training systems and prediction on the given set of target systems.

          For the evaluation we use either a repeated random split of the target system's data
          (if the target set contains fewer than 75 examples) or a cross-validation (otherwise).
          The hyper-parameters of the order predictor are optimized using a nested cross-validation.
          The corresponding routines can be found in the file 'model_selection_cls.py'.

          If desired (excl_mol_by_struct_only == True), molecules from the test set are removed
          from the training set based on their molecular structure, e.g. by comparing their
          InChIs, _even_ if these structures have been measured with a system other than the
          target system, i.e., another chromatographic system.

          See also the paper for details on the evaluation strategy.

    :param target_systems: list of strings, containing the target systems

    :param training_systems: list of strings, containing the training systems

    :param predictor: list of strings, containing the predictors / molecular features used for
        model construction.

    :param pair_params: dictionary, containing the parameters used for the creation of
        the RankSVM learning pairs, e.g. minimum and maximum order distance.

    :param kernel_params: dictionary, containing the parameters for the kernels and
        generally for handling the input features / predictors. See definition of the
        dictionary in the __main__ of file 'evaluation_scenario_cls.py'.

    :param opt_params: dictionary, containing the parameters controlling the hyper-parameter
        optimization, number of cross-validation splits, etc. See definition of the
        dictionary in the __main__ of file 'evaluation_scenario_cls.py'.

    :param input_dir: string, directory containing the input data, e.g., fingerprints and retention
        times.

    :param estimator: string, order predictor to use: either "ranksvm" or "svr".

    :param feature_type: string, feature type that is used for the RankSVM. Currently
        only 'difference' features are supported, i.e., \phi_j - \phi_i is used for
        the decision. If the estimator is not RankSVM but, e.g., Support Vector Regression,
        then this parameter can be set to None and is ignored.

    :param n_jobs: integer, number of jobs used for the hyper-parameter estimation. The maximum
        number of jobs used is the number of inner splits (cross-validation or random splits).

    :param perc_for_training: scalar, percentage of the target system's data that is
        used for training, e.g., selected by simple random sub-sampling. This value
        only affects the training process if the target system is in the set of
        training systems.

    :return: tuple of pandas.DataFrame

        1) mapped_values: predicted order scores for each target system
            - corresponds to: w^T \phi_i in the RankSVM case
            - corresponds to: the predicted retention time, in the SVR case
        2) correlations: rank correlations of the order scores for each target system
        3) accuracies: pairwise prediction accuracies for each target system
        4) simple_statistics: number of training and test examples, etc.
        5) grid_search_results: hyper-parameter scores for the different grid-parameters
        6) grid_search_best_params: hyper-parameter scores for the best grid-parameters

        NOTE: The returned results (except mapped_values and grid search results) are averages
              across the different random splits / cross-validation folds and repetitions.
    """

    # Variables related to the number of random / CV splits, for the outer
    # (*_shuffle, *_cv) and the inner, nested folds (*_nshuffle, *_ncv).
    n_splits_shuffle = opt_params["n_splits_shuffle"]
    n_splits_nshuffle = opt_params["n_splits_nshuffle"]
    n_splits_cv = opt_params["n_splits_cv"]
    n_splits_ncv = opt_params["n_splits_ncv"]
    n_rep = opt_params["n_rep"]

    # Should molecules be excluded from the training set if their structure
    # appears in the test set, _even if_ they have been measured with a system
    # other than the (current) target system?
    excl_mol_by_struct_only = opt_params["excl_mol_by_struct_only"]

    # Currently only 'slack_type == "on_pairs"' is supported.
    slack_type = opt_params["slack_type"]
    if slack_type != "on_pairs":
        raise ValueError("Invalid slack type: %s" % slack_type)

    # Should all possible pairs be used for the (inner) test split during the
    # parameter estimation, regardless of the settings for 'd_upper' and
    # 'd_lower'?
    all_pairs_for_test = opt_params["all_pairs_for_test"]

    if estimator not in ["ranksvm", "svr"]:
        raise ValueError("Invalid estimator: %s" % estimator)

    # RankSVM and SVR regularization parameter
    param_grid = {"C": opt_params["C"]}

    if estimator == "svr":
        # error-tube width of the SVR
        param_grid["epsilon"] = opt_params["epsilon"]

    # Molecule kernel
    if kernel_params["kernel"] == "linear":
        kernel = "linear"
    elif kernel_params["kernel"] in ["rbf", "gaussian"]:
        param_grid["gamma"] = kernel_params["gamma"]
        kernel = "rbf"
    elif kernel_params["kernel"] == "tanimoto":
        if estimator in ["ranksvm"]:
            kernel = tanimoto_kernel
        elif estimator in ["svr"]:
            kernel = tanimoto_kernel_mat
    elif kernel_params["kernel"] == "minmax":
        if estimator in ["ranksvm"]:
            kernel = minmax_kernel
        elif estimator in ["svr"]:
            kernel = minmax_kernel_mat
    else:
        raise ValueError("Invalid kernel: %s." % kernel_params["kernel"])

    if isinstance(target_systems, str):
        target_systems = [target_systems]
    if isinstance(training_systems, str):
        training_systems = [training_systems]
    all_systems = list(set(target_systems).union(training_systems))

    assert isinstance(target_systems, list) and isinstance(
        training_systems, list)

    n_target_systems = len(target_systems)
    n_training_systems = len(training_systems)

    print("Target systems (# = %d): %s" %
          (n_target_systems, ",".join(target_systems)))
    print("Training systems (# = %d): %s" %
          (n_training_systems, ",".join(training_systems)))

    ## Load the target and training systems into dictionaries using (molecule, system) keys
    ## and retention times or molecular features, respectively, as values

    # If we use molecular descriptors, we need to scale the data, e.g. to [0, 1].
    if kernel_params["scaler"] == "noscaling":
        scaler = None
    elif kernel_params["scaler"] == "minmax":
        scaler = MinMaxScaler()
    elif kernel_params["scaler"] == "std":
        scaler = StandardScaler()
    elif kernel_params["scaler"] == "l2norm":
        scaler = Normalizer()
    else:
        raise ValueError("Invalid scaler for the molecular features: %s" %
                         kernel_params["scaler"])

    # Handle MACCS count fingerprints
    if predictor[0] == "maccsCount_f2dcf0b3":
        predictor_c = ["maccs"]
        predictor_fn = "fps_maccs_count.csv"
    else:
        predictor_c = predictor
        predictor_fn = None

    d_rts, d_features, d_system_index = OrderedDict(), OrderedDict(
    ), OrderedDict()
    for k_sys, system in enumerate(all_systems):
        rts, data = load_data(input_dir,
                              system=system,
                              predictor=predictor_c,
                              pred_fn=predictor_fn)

        # Use (mol-id, system)-tuple as key
        keys = list(zip(rts.inchi.values, [system] * rts.shape[0]))

        # Values: retention time, features
        rts = rts.rt.values.reshape(-1, 1)
        data = data.drop("inchi", axis=1).values

        if kernel_params["poly_feature_exp"]:
            # If we use binary fingerprints, we can include some
            # interactions, e.g. x_1x_2, ...
            data = PolynomialFeatures(interaction_only=True,
                                      include_bias=False).fit_transform(data)

        # Make ordered dictionaries
        d_rts[system], d_features[system] = OrderedDict(), OrderedDict()

        for i, key in enumerate(keys):
            d_rts[system][key] = rts[i, 0]
            d_features[system][key] = data[i, :]

        # Dictionary containing a unique numeric identifier for each system
        d_system_index[system] = k_sys

        if scaler is not None:
            if getattr(scaler, "partial_fit", None) is not None:
                # 'partial_fit' allows us to learn the parameters of the scaler
                # online. (great stuff :))
                scaler.partial_fit(data)
            else:
                # We have a scaler at hand that does not allow online fitting.
                # This probably means the scaler performs the desired scaling
                # for each example independently, e.g.
                # sklearn.preprocessing.Normalizer.
                pass

    for system in target_systems:
        print("Target set '%s' contains %d examples." %
              (system, len(d_rts[system])))

    # Collect all the data that is available for training.
    d_rts_training = join_dicts(d_rts, training_systems)
    d_features_training = join_dicts(d_features, training_systems)

    # (mol-id, system)-tuples used in the training set
    l_keys_training = list(d_features_training.keys())

    # Data frames storing the evaluation measures
    mapped_values = {
        target_system: DataFrame()
        for target_system in target_systems
    }
    accuracies, correlations, simple_statistics = DataFrame(), DataFrame(
    ), DataFrame()
    grid_search_results, grid_search_best_params = DataFrame(), DataFrame()

    for idx_system, target_system in enumerate(target_systems):
        print("Process target system: %s (%d/%d)." %
              (target_system, idx_system + 1, len(target_systems)))

        # (mol-id, system)-tuples in the target set
        l_keys_target = list(d_features[target_system].keys())

        for i_rep in range(n_rep):
            print("Repetition: %d/%d" % (i_rep + 1, n_rep))

            # Get a random subset of the training data
            l_keys_training_sub = sample_perc_from_list(l_keys_training,
                                                        tsystem=target_system,
                                                        perc=perc_for_training,
                                                        random_state=747 *
                                                        i_rep)
            print("Training set contains %d (%f%%) examples." %
                  (len(l_keys_training_sub),
                   100 * len(l_keys_training_sub) / len(l_keys_training)))
            for training_system in training_systems:
                n_train_sys_sub = sum(
                    np.array(list(zip(
                        *l_keys_training_sub))[1]) == training_system)
                n_train_sys = sum(
                    np.array(list(zip(
                        *l_keys_training))[1]) == training_system)
                print("\tSystem %s contributes %d (%f%%) examples." %
                      (training_system, n_train_sys_sub,
                       100 * n_train_sys_sub / n_train_sys))

            # Check whether the target system has any overlap with the training systems
            print("Outer validation split strategy: ", end="", flush=True)

            l_molids_training = list(zip(*l_keys_training_sub))[0]
            l_molids_target = list(zip(*l_keys_target))[0]

            if (excl_mol_by_struct_only and (len(set(l_molids_training) & set(l_molids_target)) == 0)) or \
                    (not excl_mol_by_struct_only and (len(set(l_keys_training_sub) & set(l_keys_target)) == 0)):

                print(
                    "Predefined split:\n"
                    "\tTraining and target do not share molecular structures "
                    "(excl_mol_by_struct_only=%d)" % excl_mol_by_struct_only)
                cv_outer = PredefinedSplit(np.zeros(len(l_keys_target)))

            else:
                # Determine strategy for training / test splits
                if len(l_keys_target) < 75:
                    print("ShuffleSplit")
                    train_size = 0.75
                    cv_outer = ShuffleSplit(n_splits=n_splits_shuffle,
                                            train_size=train_size,
                                            test_size=(1 - train_size),
                                            random_state=320 * i_rep)
                else:
                    print("KFold")
                    cv_outer = KFold(n_splits=n_splits_cv,
                                     shuffle=True,
                                     random_state=320 * i_rep)

            # Performance evaluation using cross-validation / random splits
            for i_fold, (_,
                         test_set) in enumerate(cv_outer.split(l_keys_target)):
                print("Outer fold: %d/%d" %
                      (i_fold + 1, cv_outer.get_n_splits()))

                # (mol-id, system)-tuples in the test subset of the target set
                l_keys_target_test = [l_keys_target[idx] for idx in test_set]

                # Remove test subset of the target set from the training set.
                # NOTE: The training set might contain the whole target set.
                l_molids_target_test = list(zip(*l_keys_target_test))[0]
                if excl_mol_by_struct_only:
                    l_keys_training_train = [
                        key for key in l_keys_training_sub
                        if key[0] not in l_molids_target_test
                    ]
                else:
                    l_keys_training_train = [
                        key for key in l_keys_training_sub
                        if key not in l_keys_target_test
                    ]

                if isinstance(cv_outer, PredefinedSplit):
                    print("Shuffle pre-defined split.")

                    rs_old = np.random.get_state()
                    np.random.seed(320 * i_fold)

                    # If we use the pre-defined split we need to shuffle ourselves.
                    # That way we prevent bias during the hyper-parameter estimation.
                    np.random.shuffle(
                        l_keys_training_train)  # Shuffling is done in place

                    np.random.set_state(rs_old)

                l_molids_training_train = list(zip(*l_keys_training_train))[0]

                if excl_mol_by_struct_only:
                    assert (len(
                        set(l_molids_target_test)
                        & set(l_molids_training_train)) == 0)
                else:
                    assert (len(
                        set(l_keys_target_test)
                        & set(l_keys_training_train)) == 0)

                # Determine strategy for training / test splits (inner)
                print("Inner (h-param) validation split strategy: ",
                      end="",
                      flush=True)
                if len(l_keys_training_train) < 75:
                    print("GroupShuffleSplit")
                    train_size = 0.75
                    cv_inner = GroupShuffleSplit(n_splits=n_splits_nshuffle,
                                                 train_size=train_size,
                                                 test_size=(1 - train_size),
                                                 random_state=350 * i_fold *
                                                 i_rep)
                else:
                    print("GroupKFold")
                    cv_inner = GroupKFold(n_splits=n_splits_ncv)

                # Train the rankSVM: Find optimal set of hyper-parameters
                od_rts_training_train, od_features_training_train = OrderedDict(
                ), OrderedDict()
                for key in l_keys_training_train:
                    od_rts_training_train[key] = d_rts_training[key]
                    od_features_training_train[key] = d_features_training[key]

                start_time = time.time()

                if estimator == "ranksvm":
                    best_params, cv_results, n_train_pairs, ranking_model, _, _ = find_hparan_ranksvm(
                        estimator=KernelRankSVC(kernel=kernel,
                                                slack_type=slack_type,
                                                random_state=319 * i_fold *
                                                i_rep),
                        fold_score_aggregation="weighted_average",
                        X=od_features_training_train,
                        y=od_rts_training_train,
                        param_grid=param_grid,
                        cv=cv_inner,
                        pair_params=pair_params,
                        n_jobs=n_jobs,
                        scaler=scaler,
                        all_pairs_as_test=all_pairs_for_test)
                elif estimator == "svr":
                    best_params, cv_results, n_train_pairs, ranking_model = find_hparam_regression(
                        estimator=SVRPairwise(kernel=kernel),
                        X=od_features_training_train,
                        y=od_rts_training_train,
                        param_grid=param_grid,
                        cv=cv_inner,
                        n_jobs=n_jobs,
                        scaler=scaler)
                else:
                    raise ValueError("Invalid estimator: %s" % estimator)

                rtime_gcv = time.time() - start_time
                print("[find_hparam_*] %.3fsec" % rtime_gcv)

                # Store the grid-search statistics for further analyses
                grid_search_results_tmp = DataFrame(cv_results)
                grid_search_results_tmp["target_system"] = target_system
                grid_search_results_tmp["training_systems"] = ";".join(
                    training_systems)
                grid_search_results = grid_search_results.append(
                    grid_search_results_tmp)

                grid_search_best_params_tmp = DataFrame([best_params])
                grid_search_best_params_tmp["target_system"] = target_system
                grid_search_best_params_tmp["training_systems"] = ";".join(
                    training_systems)
                grid_search_best_params = grid_search_best_params.append(
                    grid_search_best_params_tmp)

                print(grid_search_best_params_tmp)

                ## Do prediction for the test set
                # Calculate: w' * \phi(x_i), for all molecules i
                X_test, rts_test = [], []

                for key in l_keys_target_test:
                    rts_test.append(d_rts[target_system][key])
                    X_test.append(d_features[target_system][key])

                rts_test = np.array(rts_test).reshape(-1, 1)
                X_test = np.array(X_test)

                if scaler is not None:
                    X_test = scaler.transform(X_test)

                if estimator == "ranksvm":
                    Y_pred_test = ranking_model.predict(X_test, X_test)
                elif estimator == "svr":
                    Y_pred_test = ranking_model.predict(X_test)
                else:
                    raise ValueError("Invalid estimator: %s" % estimator)

                wTx = ranking_model.map_values(X_test)

                mapped_values[target_system] = pd.concat([
                    mapped_values[target_system],
                    DataFrame({
                        "mapped_value": wTx,
                        "true_rt": rts_test.flatten(),
                        "inchi": l_molids_target_test
                    })
                ],
                                                         ignore_index=True)

                correlations = correlations.append(
                    {
                        "rank_corr": sp.stats.kendalltau(wTx, rts_test)[0],
                        "spear_corr": sp.stats.spearmanr(wTx, rts_test)[0],
                        "target_system": target_system,
                        "training_system": ";".join(training_systems)
                    },
                    ignore_index=True)

                n_train_mol = len(set(l_molids_training_train))
                n_test_mol = len(set(l_molids_target_test))
                n_shared_mol = len(
                    set(l_molids_target_test) & (set(l_molids_training_train)))
                p_shared_mol = float(n_shared_mol) / n_test_mol

                # Predict: x_i > x_j or x_i < x_j for all molecule pairs (i, j)
                with Timer("Get prediction score"):
                    for d_lower, d_upper in itertools.product(
                        [0] + list(range(1, 15, 2)),
                            2**np.array([0, 1, 2, 3, 4, 5, 6, np.inf])):
                        if d_lower > d_upper:
                            continue

                        pairs_test = get_pairs_single_system(rts_test,
                                                             d_lower=d_lower,
                                                             d_upper=d_upper)

                        accuracies = accuracies.append(
                            {
                                "score_w":
                                ranking_model.score_using_prediction(
                                    Y_pred_test, pairs_test, normalize=False),
                                "score":
                                ranking_model.score_using_prediction(
                                    Y_pred_test, pairs_test),
                                "n_pairs_test":
                                len(pairs_test),
                                "target_system":
                                target_system,
                                "training_system":
                                ";".join(training_systems),
                                "d_lower":
                                d_lower,
                                "d_upper":
                                d_upper,
                                "i_rep":
                                i_rep
                            },
                            ignore_index=True)

                        # Write out how many molecular structures are shared between the target and training systems
                        n_test_pairs = len(pairs_test)
                        simple_statistics = simple_statistics.append(
                            {
                                "n_shared_mol": n_shared_mol,
                                "p_shared_mol": p_shared_mol,
                                "n_train_mol": n_train_mol,
                                "n_test_mol": n_test_mol,
                                "n_train_pairs": n_train_pairs,
                                "n_test_pairs": n_test_pairs,
                                "grid_search_time": rtime_gcv,
                                "target_system": target_system,
                                "training_systems": ";".join(training_systems),
                                "d_lower": d_lower,
                                "d_upper": d_upper
                            },
                            ignore_index=True)

    # Average the mapped values over the repetitions
    for target_system in target_systems:
        mapped_values[target_system]["mapped_value_std"] = mapped_values[
            target_system]["mapped_value"]
        mapped_values[target_system] = mapped_values[target_system].groupby(
            ["inchi"], as_index=False).agg({
                "mapped_value": np.mean,
                "mapped_value_std": np.std,
                "true_rt": np.unique
            })

    # Aggregate the rows in 'correlations' to get the mean- and std-values across the folds.
    correlations["rank_corr_std"] = correlations["rank_corr"]
    correlations["spear_corr_std"] = correlations["spear_corr"]
    correlations = correlations.groupby(["target_system", "training_system"],
                                        as_index=False).agg({
                                            "rank_corr":
                                            np.mean,
                                            "rank_corr_std":
                                            np.std,
                                            "spear_corr":
                                            np.mean,
                                            "spear_corr_std":
                                            np.std
                                        })

    # Aggregate the rows in 'accuracies' to get the expected pairwise accuracy
    accuracies = accuracies.groupby(
        ["target_system", "training_system", "d_lower", "d_upper", "i_rep"],
        as_index=False).agg({
            "score_w": np.sum,
            "n_pairs_test": np.sum,
            "score": np.mean
        })
    accuracies["score_w"] = accuracies["score_w"] / accuracies["n_pairs_test"]
    accuracies.drop(["i_rep", "n_pairs_test"], axis=1, inplace=True)

    # Calculate expected accuracy across the repetitions
    accuracies["score_w_std"] = accuracies["score_w"]
    accuracies["score_std"] = accuracies["score"]
    accuracies = accuracies.groupby(
        ["target_system", "training_system", "d_lower", "d_upper"],
        as_index=False).agg({
            "score_w": np.mean,
            "score_w_std": np.std,
            "score": np.mean,
            "score_std": np.std
        })

    # Aggregate the simple statistics
    simple_statistics["n_shared_mol_std"] = simple_statistics["n_shared_mol"]
    simple_statistics["p_shared_mol_std"] = simple_statistics["p_shared_mol"]
    simple_statistics["n_train_mol_std"] = simple_statistics["n_train_mol"]
    simple_statistics["n_test_mol_std"] = simple_statistics["n_test_mol"]
    simple_statistics["n_train_pairs_std"] = simple_statistics["n_train_pairs"]
    simple_statistics["n_test_pairs_std"] = simple_statistics["n_test_pairs"]
    simple_statistics["grid_search_time_std"] = simple_statistics[
        "n_test_pairs"]

    simple_statistics = simple_statistics.groupby(
        ["target_system", "training_systems", "d_lower", "d_upper"],
        as_index=False).agg({
            "n_shared_mol": np.mean,
            "p_shared_mol": np.mean,
            "n_train_mol": np.mean,
            "n_test_mol": np.mean,
            "n_train_pairs": np.mean,
            "n_test_pairs": np.mean,
            "grid_search_time": np.mean,
            "n_shared_mol_std": np.std,
            "p_shared_mol_std": np.std,
            "n_train_mol_std": np.std,
            "n_test_mol_std": np.std,
            "n_train_pairs_std": np.std,
            "n_test_pairs_std": np.std,
            "grid_search_time_std": np.std
        })

    return mapped_values, correlations, accuracies, simple_statistics, grid_search_results, grid_search_best_params
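The parameter dictionaries expected by evaluate_on_target_systems are only described in the docstring above. The following hypothetical call sketches plausible values for the keys the function body actually reads; the system names, input directory and grid values are placeholders, not taken from the original evaluation scripts:

opt_params = {
    "n_splits_shuffle": 10, "n_splits_nshuffle": 10,
    "n_splits_cv": 10, "n_splits_ncv": 10, "n_rep": 3,
    "excl_mol_by_struct_only": True,
    "slack_type": "on_pairs",
    "all_pairs_for_test": True,
    "C": [0.1, 1, 10],            # RankSVM / SVR regularization grid
    "epsilon": [0.025, 0.1]       # only read when estimator == "svr"
}
kernel_params = {"kernel": "tanimoto", "scaler": "noscaling", "poly_feature_exp": False}
pair_params = {"allow_overlap": True, "d_lower": 0, "d_upper": 16, "ireverse": True}

results = evaluate_on_target_systems(
    target_systems="SYSTEM_A",                      # placeholder system names
    training_systems=["SYSTEM_A", "SYSTEM_B"],
    predictor=["maccs"],
    pair_params=pair_params,
    kernel_params=kernel_params,
    opt_params=opt_params,
    input_dir="./data",                             # placeholder input directory
    estimator="ranksvm",
    feature_type="difference",
    n_jobs=4)

mapped_values, correlations, accuracies, simple_statistics, _, _ = results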
def single_dataset(X,
                   target,
                   kernel,
                   convergence_criteria="alpha_change_max",
                   t_0=0.5,
                   tol=0.001,
                   slack_type="on_pairs",
                   fig_fn=None,
                   C=1,
                   step_size_algorithm="diminishing_2"):
    fig, axes = plt.subplots(2, 3)
    fig.suptitle(
        dict2str(
            {
                "convergence_criteria": convergence_criteria,
                "step_size_algorithm": step_size_algorithm,
                "slack_type": slack_type
            },
            sep=" ; "))
    if fig_fn is not None:
        fig.set_size_inches(14, 8)

    if X.shape[1] == 2:
        visualize_dataset2(X[:, 0],
                           X[:, 1],
                           target,
                           axes[0, 0],
                           title="Dataset: %s" % type)
    else:
        # Do some low dimensional embedding, e.g. given a set of fingerprints.
        pass

    # Get training and test split
    train_set, test_set = list(
        ShuffleSplit(n_splits=1,
                     train_size=0.75,
                     test_size=0.25,
                     random_state=666).split(X))[0]

    pairs_train = get_pairs_single_system(target[train_set],
                                          d_lower=0,
                                          d_upper=16)
    pairs_train_full = get_pairs_single_system(target[train_set],
                                               d_lower=0,
                                               d_upper=np.inf)
    pairs_test = get_pairs_single_system(target[test_set],
                                         d_lower=0,
                                         d_upper=np.inf)

    alpha = np.zeros((len(pairs_train), 1))
    t_0_ = t_0
    k = 1
    max_iter = 25
    n_steps = 80
    f_0s = np.array([])  # primal objective values during optimization
    gs = np.array([])  # dual objective values during optimization
    dgs = np.array([])  # duality gap
    rdgs = np.array([])
    score_train = []
    score_test = []
    rank_corr_train = []
    rank_corr_test = []
    x_space = []

    for ii in range(n_steps):
        print("%d: " % ii, end="", flush=True)
        ranksvm = KernelRankSVC(C=C,
                                debug=1,
                                kernel=kernel,
                                slack_type=slack_type,
                                t_0=t_0_,
                                convergence_criteria=convergence_criteria,
                                max_iter=k + (max_iter - 1),
                                step_size_algorithm=step_size_algorithm,
                                random_state=101,
                                degree=2,
                                gamma=0.5,
                                tol=tol,
                                verbose=True)

        # Set stepsize to latest stepsize
        ranksvm.t_0 = t_0_
        ranksvm.fit(None,
                    None,
                    fit_params={
                        "FX": X[train_set],
                        "pairs": pairs_train,
                        "alpha_init": alpha,
                        "k_init": k
                    })
        # Store the alpha as initial value for the next round
        alpha = ranksvm.alpha.reshape((-1, 1))

        # Get last stepsize
        print("Stepsize: t_0 = %f, t_conv = %f; Iteration: k = %d" %
              (t_0, ranksvm._t_convergence, ranksvm._k_convergence))

        if step_size_algorithm == "diminishing_2":
            t_0_ = ranksvm._get_step_size_diminishing_2(ranksvm._t_convergence)
        if ranksvm._obj_has_converged:
            break
        else:
            k = ranksvm._k_convergence + 1

        # Get internal optimization results
        f_0s = np.concatenate((f_0s, np.array(ranksvm.f_0s).flatten()))
        gs = np.concatenate((gs, np.array(ranksvm.gs).flatten()))
        dgs = np.concatenate((dgs, np.array(ranksvm.dgs).flatten()))
        rdgs = np.concatenate((rdgs, np.array(ranksvm.rdgs).flatten()))

        # Get pairwise score and rank_correlation (train set)
        score_train.append(ranksvm.score(X[train_set], pairs_train_full))
        rank_corr_train.append(
            sp.stats.kendalltau(ranksvm.map_values(X[train_set]),
                                target[train_set])[0])

        # Get pairwise score and rank_correlation (test set)
        score_test.append(ranksvm.score(X[test_set], pairs_test))
        rank_corr_test.append(
            sp.stats.kendalltau(ranksvm.map_values(X[test_set]),
                                target[test_set])[0])

        x_space.append(ranksvm._k_convergence)

    print("Iterations: %d" % ranksvm._k_convergence)

    # Run RankSVM optimization until convergence
    ranksvm = KernelRankSVC(C=C,
                            verbose=True,
                            kernel=kernel,
                            slack_type=slack_type,
                            t_0=t_0,
                            convergence_criteria=convergence_criteria,
                            step_size_algorithm=step_size_algorithm,
                            degree=2,
                            gamma=0.5,
                            tol=tol,
                            max_iter=max_iter * n_steps,
                            random_state=101)
    ranksvm.fit(None,
                None,
                fit_params={
                    "FX": X[train_set],
                    "pairs": pairs_train
                })
    visualize_ranksvm(target[test_set],
                      ranksvm.map_values(X[test_set]),
                      axes[0, 1],
                      title="Iter: %d (max = %d)" %
                      (ranksvm._k_convergence, ranksvm.max_iter))

    rc_train_line = axes[0, 2].plot(x_space,
                                    rank_corr_train,
                                    color="blue",
                                    linestyle="-")
    rc_test_line = axes[0, 2].plot(x_space,
                                   rank_corr_test,
                                   color="red",
                                   linestyle="-")
    axes[0, 2].set_xlabel("Iteration")
    axes[0, 2].set_ylabel("Rank-correlation")
    axes[0, 2].set_title("C = %.3f" % ranksvm.C)
    axes[0, 2].grid(True)
    axes[0, 2].legend((rc_train_line[0], rc_test_line[0]), ("Train", "Test"))

    sc_train_line = axes[1, 0].plot(x_space,
                                    score_train,
                                    color="blue",
                                    linestyle="-")
    sc_test_line = axes[1, 0].plot(x_space,
                                   score_test,
                                   color="red",
                                   linestyle="-")
    axes[1, 0].set_xlabel("Iteration")
    axes[1, 0].set_ylabel("Pairwise accuracy")
    axes[1, 0].set_title("C = %.3f" % ranksvm.C)
    axes[1, 0].grid(True)
    axes[1, 0].legend((sc_train_line[0], sc_test_line[0]), ("Train", "Test"))

    # Primal and dual objective
    f_0_line = axes[1, 1].semilogy(f_0s, color="blue", linestyle="-")
    g_line = axes[1, 1].semilogy(gs, color="red", linestyle="-")
    axes[1, 1].grid(True)
    axes[1, 1].set_xlabel("Iteration")
    axes[1, 1].set_ylabel("Objective value")
    axes[1, 1].legend((f_0_line[0], g_line[0]), (
        "Primal",
        "Dual",
    ),
                      title="Objectives")

    # Duality gap
    dgs_line = axes[1, 2].semilogy(dgs, "green", linestyle="-")
    rdgs_line = axes[1, 2].semilogy(rdgs, "red", linestyle="-")
    axes[1, 2].set_xlabel("Iteration")
    axes[1, 2].set_ylabel("Duality gap")
    axes[1, 2].grid(True)
    axes[1, 2].legend((dgs_line[0], rdgs_line[0]), (
        "Absolute",
        "Relative",
    ),
                      title="Objectives")

    if fig_fn is not None:
        plt.savefig(fig_fn)
    else:
        plt.show()
def compare_datasets2(fig_fn=None):
    fig, axes = plt.subplots(3, 2)

    if fig_fn is not None:
        fig.set_size_inches(12, 8)

    # for k, type in enumerate (["linear", "quadratic", "open_circle"]):
    for k, type in enumerate(["quadratic"]):
        print("Type: %s" % type)

        X, target, d_X, d_target = create_artificial_dataset2(type=type, n=150)
        keys = list(d_X.keys())

        visualize_dataset2(X[:, 0],
                           X[:, 1],
                           target,
                           axes[k, 0],
                           title="Dataset: %s" % type)

        target_pred = np.zeros(len(X))
        mean_score = 0.0

        param_grid = {"C": [1]}

        cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
        for k_cv, (train_set, test_set) in enumerate(cv.split(keys)):
            print("Fold %d / %d" % (k_cv + 1, cv.get_n_splits()))

            keys_train = [keys[idx] for idx in train_set]
            keys_test = [keys[idx] for idx in test_set]

            d_X_train, d_target_train = OrderedDict(), OrderedDict()
            for key in keys_train:
                d_X_train[key] = d_X[key]
                d_target_train[key] = d_target[key]

            # d_X_train = {key: value for key, value in d_X.items() if key in keys_train}
            # d_target_train = {key: value for key, value in d_target.items() if key in keys_train}

            if type == "linear":
                ranksvm_kernel = KernelRankSVC(
                    verbose=False,
                    kernel="linear",
                    feature_type="difference",
                    slack_type="on_pairs",
                    step_size_algorithm="diminishing_2",
                    convergence_criteria="alpha_change_norm")
            elif type == "quadratic":
                ranksvm_kernel = KernelRankSVC(verbose=False,
                                               kernel="poly",
                                               feature_type="difference",
                                               slack_type="on_pairs")
                param_grid["degree"] = [2]
            elif type == "open_circle":
                ranksvm_kernel = KernelRankSVC(verbose=False,
                                               kernel="rbf",
                                               feature_type="difference",
                                               slack_type="on_pairs")
                param_grid["gamma"] = [3]
            else:
                raise ValueError("Invalid test data type: %s" % type)

            cv_inner = GroupKFold(n_splits=3)
            best_params, param_scores, n_pairs_train, best_estimator, _, _ = find_hparan_ranksvm(
                ranksvm_kernel,
                d_X_train,
                d_target_train,
                cv=cv_inner,
                param_grid=param_grid,
                pair_params={
                    "allow_overlap": True,
                    "d_upper": 4,
                    "d_lower": 0,
                    "ireverse": True
                },
                n_jobs=1)
            print(best_params)

            X_test = np.array([d_X[key] for key in keys_test])
            target_test = np.array([d_target[key] for key in keys_test])
            pairs_test = get_pairs_single_system(target_test,
                                                 d_lower=0,
                                                 d_upper=np.inf)

            target_pred[test_set] += best_estimator.map_values(X_test)
            score = best_estimator.score(X_test, pairs_test)
            print(score)
            mean_score += score

        target_pred /= cv.get_n_splits()
        mean_score /= cv.get_n_splits()

        print(mean_score)

        visualize_ranksvm([d_target[key] for key in keys], target_pred,
                          axes[k, 1])

    if fig_fn is not None:
        plt.tight_layout()
        plt.savefig(fig_fn)
    else:
        plt.show()
def compare_datasets3(fig_fn=None):
    fig, axes = plt.subplots(3, 2)
    fig_conv, axes_conv = plt.subplots(3, 3)

    if fig_fn is not None:
        fig.set_size_inches(12, 8)
        fig_conv.set_size_inches(12, 8)

    # for k, type in enumerate (["linear", "quadratic", "open_circle"]):
    for k, type in enumerate(["linear"]):
        print("Type: %s" % type)

        X, target, d_X, d_target = create_artificial_dataset2(type=type, n=400)
        keys = list(d_X.keys())

        visualize_dataset2(X[:, 0],
                           X[:, 1],
                           target,
                           axes[k, 0],
                           title="Dataset: %s" % type)

        param_grid = {"C": [1]}

        train_set, test_set = list(
            ShuffleSplit(n_splits=1, train_size=0.75,
                         test_size=0.25).split(keys))[0]
        keys_train = [keys[idx] for idx in train_set]
        keys_test = [keys[idx] for idx in test_set]

        d_X_train, d_target_train = OrderedDict(), OrderedDict()
        for key in keys_train:
            d_X_train[key] = d_X[key]
            d_target_train[key] = d_target[key]

        if type == "linear":
            ranksvm_kernel = KernelRankSVC(verbose=True,
                                           debug=2,
                                           kernel="linear",
                                           feature_type="difference",
                                           slack_type="on_pairs",
                                           step_size_algorithm="diminishing",
                                           convergence_criteria="gs_change")
        elif type == "quadratic":
            ranksvm_kernel = KernelRankSVC(verbose=True,
                                           debug=2,
                                           kernel="poly",
                                           feature_type="difference",
                                           slack_type="on_examples")
            param_grid["degree"] = [2]
        elif type == "open_circle":
            ranksvm_kernel = KernelRankSVC(verbose=True,
                                           debug=2,
                                           kernel="rbf",
                                           feature_type="difference",
                                           slack_type="on_examples")
            param_grid["gamma"] = [3]
        else:
            raise ValueError("Invalid test data type: %s" % type)

        best_params, param_scores, n_pairs_train, best_estimator, _, _ = find_hparan_ranksvm(
            ranksvm_kernel,
            d_X_train,
            d_target_train,
            cv=None,
            param_grid=param_grid,
            pair_params={
                "allow_overlap": True,
                "d_upper": 8,
                "d_lower": 0,
                "ireverse": True
            },
            n_jobs=1,
            scaler=None)
        print("Best params:", best_params)

        X_test = np.array([d_X[key] for key in keys_test])
        target_test = np.array([d_target[key] for key in keys_test])
        pairs_test = get_pairs_single_system(target_test,
                                             d_lower=0,
                                             d_upper=np.inf)

        target_pred = best_estimator.map_values(X_test)
        print("Score: %f" % best_estimator.score(X_test, pairs_test))

        visualize_ranksvm(target_test, target_pred, axes[k, 1])

        inspect_convergence(best_estimator,
                            np.array(list(d_target_train.values())),
                            axes_conv[k, :])

    if fig_fn is not None:
        plt.tight_layout()
        plt.savefig(fig_fn)
    else:
        plt.show()
def compare_datasets(fig_fn=None):
    fig, axes = plt.subplots(3, 4)

    if fig_fn is not None:
        fig.set_size_inches(12, 8)

    for k, type in enumerate(["linear", "quadratic", "open_circle"]):
        print("Type: %s" % type)

        X, target = create_artificial_dataset(type=type,
                                              n=50,
                                              random_state=1001)

        # print (X[range(10)])
        # pairs = get_pairs ({"X": X, "target": target})
        # X_diff, y_clf = get_pairwise_features2 (X, pairs, balance_classes = True)
        #
        # # Visualize dataset and feature space
        # visualize_dataset ({"X": X, "target": target}, {"X_diff": X_diff, "y_clf": y_clf}, axes[k])

        # Train a linear rankSVM
        # target_pred_linear = np.zeros (len (X))
        # target_pred_kernel = np.zeros (len (X))
        mean_score = 0.0

        # cv = KFold (n_splits = 10, random_state = 646, shuffle = True)
        cv = PredefinedSplit(np.zeros(len(X)))
        # for k_cv, (_, test_set) in enumerate (cv.split (X)):
        for i_rep in range(10):
            # print ("Fold %d / %d" % (k_cv + 1, cv.n_splits))

            # print (train_set[range(10)], test_set[range(10)])
            #
            train_set = test_set = range(len(X))

            pairs_train = get_pairs_single_system(target[train_set],
                                                  d_lower=0,
                                                  d_upper=4)
            pairs_test = get_pairs_single_system(target[test_set],
                                                 d_lower=0,
                                                 d_upper=np.inf)

            # ranksvm_linear = linear_rank_svm (verbose = False)
            # ranksvm_linear.train (X[train_set], pairs_train, n_jobs = 2, C = [1])
            #
            # target_pred_linear[test_set] = ranksvm_linear.map_values (X[test_set])[0]

            ranksvm_kernel = KernelRankSVC(C=0.1,
                                           verbose=False,
                                           kernel="precomputed",
                                           feature_type="difference")

            if type == "linear":
                KX_train = linear_kernel(X[train_set], X[train_set])
                KX_train_test = linear_kernel(X[train_set], X[test_set])
            elif type == "quadratic":
                KX_train = polynomial_kernel(X[train_set],
                                             X[train_set],
                                             degree=2)
                KX_train_test = polynomial_kernel(X[train_set],
                                                  X[test_set],
                                                  degree=2)
            elif type == "open_circle":
                KX_train = rbf_kernel(X[train_set], X[train_set], gamma=3)
                KX_train_test = rbf_kernel(X[train_set], X[test_set], gamma=3)
            else:
                raise ValueError("Invalid test data type: %s" % type)

            ranksvm_kernel.fit(np.arange(KX_train.shape[0]),
                               None,
                               fit_params={
                                   "KX": KX_train,
                                   "pairs": pairs_train
                               })
            score = ranksvm_kernel.score(KX_train_test, pairs_test)
            mean_score += score

            print(score)
            # target_pred_kernel[test_set] = ranksvm_kernel.map_values (KX_train_test)

        print(mean_score / 10)
        # visualize_ranksvm (target[:, 0], target_pred_linear, axes[k, 2])
        # visualize_ranksvm (target[:, 0], target_pred_kernel, axes[k, 3])

    if fig_fn is not None:
        plt.tight_layout()
        plt.savefig(fig_fn)
    else:
        plt.show()
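In compare_datasets the RankSVM is used with kernel="precomputed", so the caller has to supply the kernel matrices itself: a square train-by-train matrix for fitting and a train-by-test matrix for scoring, since the decision values are formed from kernel evaluations against the training examples. A small, illustrative shape check (the data below is random and not part of the original experiments):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

X_train = np.random.RandomState(0).rand(20, 2)
X_test = np.random.RandomState(1).rand(5, 2)

KX_train = rbf_kernel(X_train, X_train, gamma=3)        # shape (20, 20), used for fitting
KX_train_test = rbf_kernel(X_train, X_test, gamma=3)    # shape (20, 5), used for scoring / mapping
print(KX_train.shape, KX_train_test.shape)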