Example No. 1
    def test_multiple_system_dataset_default_d_range(self):
        # ----------------------------------------------
        targets = Labels([6, 2, 3, 4, 1, 1, 10, 5, 12],
                         ["A", "A", "A", "A", "A", "B", "B", "B", "B"])
        n_samples_A = 5
        n_samples_B = 4
        d_pairs_ref = {
            np.inf: [(1, 4), (1, 2), (2, 3), (0, 3), (5, 7), (6, 7), (6, 8),
                     (2, 4), (1, 3), (0, 2), (5, 6), (7, 8), (3, 4), (0, 1),
                     (5, 8), (0, 4)]
        }
        d_signs_ref = {
            np.inf: [1, -1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1]
        }

        for d in d_pairs_ref.keys():
            pairs, signs, _ = get_pairs_multiple_datasets(targets)

            self.assertEqual(len(d_pairs_ref[d]), len(pairs))
            self.assertEqual(len(d_signs_ref[d]), len(signs))
            for pair, sign in zip(d_pairs_ref[d], d_signs_ref[d]):
                self.assertIn(pair, pairs)
                self.assertEqual(sign, signs[pairs.index(pair)])

        pairs, _, pdss = get_pairs_multiple_datasets(targets)
        max_n_pairs_A_ref = int((n_samples_A**2 - n_samples_A) / 2)
        max_n_pairs_B_ref = int((n_samples_B**2 - n_samples_B) / 2)
        self.assertEqual(max_n_pairs_A_ref + max_n_pairs_B_ref, len(pairs))
        self.assertEqual(["A"] * max_n_pairs_A_ref + ["B"] * max_n_pairs_B_ref,
                         pdss)

        # ----------------------------------------------
        targets = Labels([1, 2, 3, 4, 1, 2, 1.5],
                         ["A", "A", "A", "A", "B", "B", "B"])
        n_samples_A = 4
        n_samples_B = 3
        d_pairs_ref = {
            1: [(0, 3), (0, 2), (1, 3), (1, 2), (0, 1), (2, 3), (4, 5), (4, 6),
                (5, 6)]
        }
        d_signs_ref = {1: [-1, -1, -1, -1, -1, -1, -1, -1, 1]}

        for d in d_pairs_ref.keys():
            pairs, signs, _ = get_pairs_multiple_datasets(targets)

            self.assertEqual(len(d_pairs_ref[d]), len(pairs))
            self.assertEqual(len(d_signs_ref[d]), len(signs))
            for pair, sign in zip(d_pairs_ref[d], d_signs_ref[d]):
                self.assertIn(pair, pairs)
                self.assertEqual(sign, signs[pairs.index(pair)])

        pairs, _, pdss = get_pairs_multiple_datasets(targets)
        max_n_pairs_A_ref = int((n_samples_A**2 - n_samples_A) / 2)
        max_n_pairs_B_ref = int((n_samples_B**2 - n_samples_B) / 2)
        self.assertEqual(max_n_pairs_A_ref + max_n_pairs_B_ref, len(pairs))
        self.assertEqual(["A"] * max_n_pairs_A_ref + ["B"] * max_n_pairs_B_ref,
                         pdss)
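
The length assertions at the end of this test follow from a simple counting argument: pairs are only formed within a dataset, so a dataset with n samples contributes at most n * (n - 1) / 2 pairs, and the per-dataset blocks are concatenated in order. A minimal standalone sketch of that bookkeeping (the helper max_pairs_per_dataset is ours, for illustration only):

    from collections import Counter

    def max_pairs_per_dataset(dataset_ids):
        # Upper bound on the number of within-dataset pairs per dataset.
        return {ds: n * (n - 1) // 2 for ds, n in Counter(dataset_ids).items()}

    # First test case above: 5 samples in "A", 4 in "B" -> 10 + 6 = 16 pairs.
    assert max_pairs_per_dataset(["A"] * 5 + ["B"] * 4) == {"A": 10, "B": 6}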
Example No. 2
    def test_single_system_dataset_default_d_range(self):
        # ----------------------------------------------
        targets = Labels([10, 4, 6, 8, 2], ["A", "A", "A", "A", "A"])
        n_samples = len(targets)
        d_pairs_ref = {
            np.inf: [(1, 4), (1, 2), (2, 3), (0, 3), (2, 4), (1, 3), (0, 2),
                     (3, 4), (0, 1), (0, 4)]
        }
        d_signs_ref = {np.inf: [1, -1, -1, 1, 1, -1, 1, 1, 1, 1]}

        for d in d_pairs_ref.keys():
            pairs, signs, pdss = get_pairs_multiple_datasets(targets)

            self.assertEqual(len(d_pairs_ref[d]), len(pairs))
            self.assertEqual(len(d_signs_ref[d]), len(signs))
            self.assertTrue(all([pds == "A" for pds in pdss]))
            for pair, sign in zip(d_pairs_ref[d], d_signs_ref[d]):
                self.assertIn(pair, pairs)
                self.assertEqual(sign, signs[pairs.index(pair)])

        pairs, _, _ = get_pairs_multiple_datasets(targets)
        self.assertEqual((n_samples**2 - n_samples) / 2, len(pairs))

        # ----------------------------------------------
        d_pairs_ref = {
            1: [(0, 4), (3, 4), (0, 1), (2, 4), (1, 3), (0, 2), (1, 4), (1, 2),
                (2, 3), (0, 3)]
        }
        d_signs_ref = {1: [1, 1, 1, 1, -1, 1, 1, -1, -1, 1]}

        for d in d_pairs_ref.keys():
            pairs, signs, pdss = get_pairs_multiple_datasets(targets)

            self.assertEqual(len(d_pairs_ref[d]), len(pairs))
            self.assertEqual(len(d_signs_ref[d]), len(signs))
            self.assertTrue(all([pds == "A" for pds in pdss]))
            for pair, sign in zip(d_pairs_ref[d], d_signs_ref[d]):
                self.assertIn(pair, pairs)
                self.assertEqual(sign, signs[pairs.index(pair)])

        pairs, _, _ = get_pairs_multiple_datasets(targets)
        self.assertEqual((n_samples**2 - n_samples) / 2, len(pairs))
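
The reference signs in both sub-cases are consistent with the convention that a pair (i, j) gets sign +1 if t_i > t_j and -1 otherwise (tied targets produce no pair at all; see the border cases below). A minimal check of that reading on the targets used here (pair_sign is an illustration helper, not part of the library):

    targets = [10, 4, 6, 8, 2]

    def pair_sign(i, j, t=targets):
        # Sign convention inferred from the reference data above.
        return 1 if t[i] > t[j] else -1

    assert pair_sign(1, 4) == 1   # 4 > 2
    assert pair_sign(1, 2) == -1  # 4 < 6
    assert pair_sign(0, 3) == 1   # 10 > 8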
Example No. 3
    def test_bordercases(self):
        # ----------------------------------------------
        pairs, signs, pdss = get_pairs_multiple_datasets(
            Labels([1, 2, 3], [1, 2, 3]))

        self.assertEqual(0, len(pairs))
        self.assertEqual(0, len(signs))
        self.assertEqual(0, len(pdss))

        # ----------------------------------------------
        pairs, signs, pdss = get_pairs_multiple_datasets(
            Labels([1, 2, 3], ["A", "B", "C"]))

        self.assertEqual(0, len(pairs))
        self.assertEqual(0, len(signs))
        self.assertEqual(0, len(pdss))

        # ----------------------------------------------
        pairs, signs, pdss = get_pairs_multiple_datasets(
            Labels([2, 3, 2, 3], [1, 1, 2, 2]))

        self.assertEqual(2, len(pairs))
        self.assertEqual(2, len(signs))
        self.assertEqual(2, len(pdss))
        self.assertEqual([(0, 1), (2, 3)], pairs)
        self.assertEqual([-1, -1], signs)
        self.assertEqual([1, 2], pdss)

        # ----------------------------------------------
        pairs, signs, pdss = get_pairs_multiple_datasets(
            Labels([2, 3, 2, 3], ["B", "B", "A", "A"]))

        self.assertEqual(2, len(pairs))
        self.assertEqual(2, len(signs))
        self.assertEqual(2, len(pdss))
        self.assertEqual([(0, 1), (2, 3)], pairs)
        self.assertEqual([-1, -1], signs)
        self.assertEqual(["B", "A"], pdss)

        # ----------------------------------------------
        pairs, signs, pdss = get_pairs_multiple_datasets(
            Labels([2, 3, 1, 3, 3], [1, 1, 1, 2, 2]))

        self.assertEqual(3, len(pairs))
        self.assertEqual(3, len(signs))
        self.assertEqual(3, len(pdss))
        self.assertEqual([(0, 1), (0, 2), (1, 2)], pairs)
        self.assertEqual([-1, 1, 1], signs)
        self.assertEqual([1, 1, 1], pdss)

        # ----------------------------------------------
        pairs, signs, pdss = get_pairs_multiple_datasets(
            Labels([2, 3, 1, 3, 3], ["A", "A", "A", "B", "B"]))

        self.assertEqual(3, len(pairs))
        self.assertEqual(3, len(signs))
        self.assertEqual(3, len(pdss))
        self.assertEqual([(0, 1), (0, 2), (1, 2)], pairs)
        self.assertEqual([-1, 1, 1], signs)
        self.assertEqual(["A", "A", "A"], pdss)
Example No. 4
    def fit(self, X: np.ndarray, y: Labels, groups=None) -> RANKSVM_T:
        """
        Estimate the parameters of the dual ranking SVM with scaled margin.
        A conditional gradient descent (Frank-Wolfe) algorithm is used to
        find the optimal alpha vector.

        :param X: array-like, shape = (n_samples, n_features) or (n_samples, n_samples)
            Object features or object similarities (kernel). If self.kernel == "precomputed",
            X is interpreted as a symmetric kernel matrix; otherwise it is interpreted
            as a feature matrix and the kernel is calculated on the fly.

        :param y: list of tuples, length = n_samples, the target values, e.g. retention
            times, for all molecules measured with a set of datasets.

            Example:
            [..., (rts_i, ds_i), (rts_j, ds_j), (rts_k, ds_k), ...]

            rts_i ... retention time of measurement i
            ds_i ... identifier of the dataset of measurement i
        """
        rs = check_random_state(self.random_state)

        if self.debug:
            if self.kernel == "precomputed":
                raise ValueError(
                    "Precomputed kernels cannot be provided in the debug mode."
                )

            # Hold out 15% of the data as a validation set
            X, X_val, y, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.15,
                                                  random_state=rs)

            self.debug_data_ = {
                "train_score": [],
                "val_score": [],
                "primal_obj": [],
                "dual_obj": [],
                "duality_gap": [],
                "step_size": [],
                "step": [],
                "alpha": [],
                "norm_s_minus_alpha": [],
                "n_nonzero_s": [],
                "convergence_criteria": "max_iter"
            }
        else:
            X_val, y_val = None, None

        # Handle training data and calculate kernels if needed
        if self.kernel == "precomputed":
            if X.shape[0] != X.shape[1]:
                raise ValueError(
                    "Precomputed kernel matrix must be squared: You provided KX.shape = (%d, %d)."
                    % (X.shape[0], X.shape[1]))
            self.KX_train_ = X
        else:
            self.X_train_ = X
            self.KX_train_ = self._get_kernel(self.X_train_)

        # Generate training pairs
        select_random_pairs = False
        pair_params = {"d_upper": np.inf, "d_lower": 1}
        if self.pair_generation == "eccb":
            pair_params["d_upper"] = 16
        elif self.pair_generation == "random":
            select_random_pairs = True

        self.pairs_train_, self.py_train_, self.pdss_train_ = get_pairs_multiple_datasets(
            y, d_lower=pair_params["d_lower"], d_upper=pair_params["d_upper"])

        if select_random_pairs:
            _idc = rs.choice(range(len(self.pairs_train_)),
                             size=self._get_p_perc(len(self.pairs_train_), 5),
                             replace=False)
            self.pairs_train_ = [self.pairs_train_[idx] for idx in _idc]
            self.py_train_ = [self.py_train_[idx] for idx in _idc]
            self.pdss_train_ = [self.pdss_train_[idx] for idx in _idc]

        if self.pairwise_features == "difference":
            self.A_ = self._build_A_matrix(self.pairs_train_, self.py_train_,
                                           self.KX_train_.shape[0])
        elif self.pairwise_features == "exterior_product":
            self.P_0_, self.P_1_ = self._build_P_matrices(
                self.pairs_train_, self.KX_train_.shape[0])
        else:
            raise ValueError(
                "Invalid pairwise feature: '%s'. Choices are 'difference' or 'exterior_product'"
                % self.pairwise_features)

        # Initialize alpha: all dual variables are set to C
        self.alpha_ = np.full(len(self.pairs_train_),
                              fill_value=self.C)  # shape = (n_pairs_train, )

        k = 0
        converged = False
        while k < self.max_iter:
            s, grad = self._solve_sub_problem(
                self.alpha_)  # feasible update direction

            if (k == 0) and (self.conv_criteria == "rel_duality_gap_decay"):
                duality_gap_0 = self._get_duality_gap(s, self.alpha_, grad)

            # Get the step-size
            if self.step_size == "diminishing":
                tau, duality_gap = self._get_step_size_diminishing(k), None
            elif self.step_size == "linesearch":
                tau, duality_gap = self._get_step_size_linesearch(self.alpha_,
                                                                  s,
                                                                  grad=grad)
            else:
                raise ValueError(
                    "Invalid step-size method: '%s'. Choices are 'diminishing' and 'linesearch'."
                    % self.step_size)

            if tau <= 0:
                msg = "k = %d, %s step-size <= 0 (tau = %.5f)." % (
                    k, self.step_size, tau)
                print("Converged:", msg)
                converged = True

            if ((k % 5 == 0) and (k <= 100)) or ((k % 10 == 0) and (k > 100)):
                if self.conv_criteria == "rel_duality_gap_decay":
                    if duality_gap is None:
                        duality_gap = self._get_duality_gap(
                            s, self.alpha_, grad)

                    if (duality_gap /
                            duality_gap_0) <= self.duality_gap_threshold:
                        msg = "k = %d, Relative duality gap (to gap_0) below threshold: gap / gap_0 = %.5f <= %.5f" \
                              % (k, duality_gap / duality_gap_0, self.duality_gap_threshold)
                        converged = True

                if self.debug:
                    prim, dual, prim_dual_gap = self._evaluate_primal_and_dual_objective(
                        self.alpha_)

                    # Validation and training scores
                    train_score = self.score(self.KX_train_,
                                             y,
                                             X_is_kernel_input=True)
                    self.debug_data_["train_score"].append(train_score)
                    self.debug_data_["val_score"].append(
                        self.score(X_val, y_val))

                    # Objective values
                    self.debug_data_["primal_obj"].append(prim)
                    self.debug_data_["dual_obj"].append(dual)
                    self.debug_data_["duality_gap"].append(prim_dual_gap)

                    # General information about the convergence
                    self.debug_data_["step"].append(k)
                    self.debug_data_["step_size"].append(tau)
                    self.debug_data_["alpha"].append(self.alpha_)
                    self.debug_data_["norm_s_minus_alpha"].append(
                        np.linalg.norm(s - self.alpha_))
                    self.debug_data_["n_nonzero_s"].append(np.sum(s > 0))

            if converged:
                if self.debug:
                    print("Converged:", msg)
                    self.debug_data_["convergence_criteria"] = msg
                break

            self.alpha_ = self.alpha_ + tau * (
                s - self.alpha_)  # update alpha^{(k)} --> alpha^{(k + 1)}

            self._assert_is_feasible(self.alpha_)

            k += 1

        # Threshold the dual variables to the border values (0 and C) if they are very close to them.
        self.alpha_ = self._bound_alpha(self.alpha_, self.alpha_threshold, 0,
                                        self.C)
        self._assert_is_feasible(self.alpha_)

        if self.debug:
            # After thresholding alpha, add one more debug entry, as the values might have changed slightly.
            prim, dual, prim_dual_gap = self._evaluate_primal_and_dual_objective(
                self.alpha_)

            # Validation and training scores
            train_score = self.score(self.KX_train_, y, X_is_kernel_input=True)
            self.debug_data_["train_score"].append(train_score)
            self.debug_data_["val_score"].append(self.score(X_val, y_val))

            # Objective values
            self.debug_data_["primal_obj"].append(prim)
            self.debug_data_["dual_obj"].append(dual)
            self.debug_data_["duality_gap"].append(prim_dual_gap)

            # General information about the convergence
            self.debug_data_["step"].append(k + 1)
            self.debug_data_["step_size"].append(
                self.debug_data_["step_size"][-1])
            self.debug_data_["alpha"].append(self.alpha_)
            self.debug_data_["norm_s_minus_alpha"].append(
                self.debug_data_["norm_s_minus_alpha"][-1])
            self.debug_data_["n_nonzero_s"].append(
                self.debug_data_["n_nonzero_s"][-1])

            self.debug_data_ = {
                key: np.array(value)
                for key, value in self.debug_data_.items()
            }

        # Only store information related to the support vectors
        is_sv = (self.alpha_ > 0)
        if self.pairwise_features == "difference":
            self.A_ = self.A_[is_sv]
        elif self.pairwise_features == "exterior_product":
            self.P_0_ = self.P_0_[is_sv]
            self.P_1_ = self.P_1_[is_sv]
        self.alpha_ = self.alpha_[is_sv]
        self.pairs_train_ = [
            self.pairs_train_[idx] for idx, _is_sv in enumerate(is_sv)
            if _is_sv
        ]
        self.py_train_ = [
            self.py_train_[idx] for idx, _is_sv in enumerate(is_sv) if _is_sv
        ]
        self.pdss_train_ = [
            self.pdss_train_[idx] for idx, _is_sv in enumerate(is_sv) if _is_sv
        ]

        self.k_ = k

        return self
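
The optimization loop in fit is a standard conditional gradient (Frank-Wolfe) scheme: solve a linear sub-problem to obtain a feasible direction s, choose a step size tau, and move alpha towards s until the duality gap is small. A generic skeleton of that scheme, with the problem-specific pieces (grad_fn, solve_linear_subproblem, the step-size rule) as stand-ins rather than the actual RankSVM sub-routines:

    import numpy as np

    def frank_wolfe(alpha0, grad_fn, solve_linear_subproblem, max_iter=1000, tol=1e-8):
        alpha = alpha0
        for k in range(max_iter):
            grad = grad_fn(alpha)
            s = solve_linear_subproblem(grad)   # best feasible direction
            gap = np.dot(grad, s - alpha)       # Frank-Wolfe duality gap (>= 0)
            if gap <= tol:
                break                           # alpha is (near-)optimal
            tau = 2.0 / (k + 2.0)               # classic diminishing step size;
                                                # fit above also offers a line search
            alpha = alpha + tau * (s - alpha)   # alpha^{(k)} -> alpha^{(k + 1)}
        return alpha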