示例#1
0
    def compute_distances(self, x1, x2=None):
        """
        Return Euclidean distances between the rows of `x1` and the rows of
        `x2`; if `x2` is `None`, between all pairs of rows of `x1`.

        The method
        - extracts the continuous columns through `continuous_columns` and
          uses `row_norms` and `_safe_sparse_dot` to compute the squared
          distance as x^2 - 2xy + y^2 (the trick from sklearn);
        - calls a function in Cython that recomputes the distances between
          pairs of rows that yielded nan;
        - calls a function in Cython that adds the contributions of discrete
          columns;
        - when only `x1` is given, calls `_distance.lower_to_symmetric` on
          the matrix;
        - takes the square root with `_interruptible_sqrt`.

        Progress is reported through `self.callback`, which is split into
        steps by `StepwiseCallbacks`.
        """
        # One sub-callback is requested per stage, in the order the stages run.
        callbacks = StepwiseCallbacks(self.callback, [20, 10, 50, 5, 15])
        two_tables = x2 is not None

        if self.continuous.any():
            data1, data2 = self.continuous_columns(
                x1, x2, self.means, np.sqrt(2 * self.vars))

            # adapted from sklearn.metric.euclidean_distances
            sq_norms1 = row_norms(data1, squared=True)[:, np.newaxis]
            if two_tables:
                sq_norms2 = row_norms(data2, squared=True)[np.newaxis, :]
            else:
                sq_norms2 = sq_norms1.T
            distances = _safe_sparse_dot(
                data1, data2.T, dense_output=True, callback=callbacks.next())
            # Built in place: -2xy, then + x^2, then + y^2
            distances *= -2
            distances += sq_norms1
            distances += sq_norms2
            # Clamp negative values to zero; nans are fixed below
            with np.errstate(invalid="ignore"):
                np.maximum(distances, 0, out=distances)
            if not two_tables:
                # Stride of (row length + 1) walks the main diagonal
                distances.flat[::distances.shape[0] + 1] = 0.0
            if self.normalize:
                fix_rows = _distance.fix_euclidean_rows_normalized
            else:
                fix_rows = _distance.fix_euclidean_rows
            fix_rows(distances, data1, data2, self.means, self.vars,
                     self.dist_missing2_cont, two_tables, callbacks.next())
        else:
            # No continuous columns: start from an all-zero matrix
            n_cols = x2.shape[0] if two_tables else x1.shape[0]
            distances = np.zeros((x1.shape[0], n_cols))

        if np.any(self.discrete):
            data1, data2 = self.discrete_columns(x1, x2)
            _distance.euclidean_rows_discrete(
                distances, data1, data2, self.dist_missing_disc,
                self.dist_missing2_disc, two_tables, callbacks.next())

        if not two_tables:
            _distance.lower_to_symmetric(distances, callbacks.next())
        return _interruptible_sqrt(distances, callback=callbacks.next())
示例#2
0
    def compute_distances(self, x1, x2=None):
        """
        Return Euclidean distances between the rows of `x1` and the rows of
        `x2`; if `x2` is `None`, between all pairs of rows of `x1`.

        The method
        - extracts the continuous columns through `continuous_columns` and
          uses `row_norms` and `safe_sparse_dot` to compute the squared
          distance as x^2 - 2xy + y^2 (the trick from sklearn);
        - calls a function in Cython that recomputes the distances between
          pairs of rows that yielded nan;
        - calls a function in Cython that adds the contributions of discrete
          columns;
        - when only `x1` is given, calls `_distance.lower_to_symmetric` on
          the matrix;
        - returns the element-wise square root.
        """
        two_tables = x2 is not None

        if self.continuous.any():
            data1, data2 = self.continuous_columns(
                x1, x2, self.means, np.sqrt(2 * self.vars))

            # adapted from sklearn.metric.euclidean_distances
            sq_norms1 = row_norms(data1, squared=True)[:, np.newaxis]
            if two_tables:
                sq_norms2 = row_norms(data2, squared=True)[np.newaxis, :]
            else:
                sq_norms2 = sq_norms1.T
            distances = safe_sparse_dot(data1, data2.T, dense_output=True)
            # Built in place: -2xy, then + x^2, then + y^2
            distances *= -2
            distances += sq_norms1
            distances += sq_norms2
            # Clamp negative values to zero; nans are fixed below
            with np.errstate(invalid="ignore"):
                np.maximum(distances, 0, out=distances)
            if not two_tables:
                # Stride of (row length + 1) walks the main diagonal
                distances.flat[::distances.shape[0] + 1] = 0.0
            if self.normalize:
                fix_rows = _distance.fix_euclidean_rows_normalized
            else:
                fix_rows = _distance.fix_euclidean_rows
            fix_rows(distances, data1, data2, self.means, self.vars,
                     self.dist_missing2_cont, two_tables)
        else:
            # No continuous columns: start from an all-zero matrix
            n_cols = x2.shape[0] if two_tables else x1.shape[0]
            distances = np.zeros((x1.shape[0], n_cols))

        if np.any(self.discrete):
            data1, data2 = self.discrete_columns(x1, x2)
            _distance.euclidean_rows_discrete(
                distances, data1, data2, self.dist_missing_disc,
                self.dist_missing2_disc, two_tables)

        if not two_tables:
            _distance.lower_to_symmetric(distances)
        return np.sqrt(distances)
示例#3
0
    def compute_distances(self, x1, x2=None):
        """
        Return Manhattan distances between the rows of `x1` and the rows of
        `x2`; if `x2` is `None` (the default), between all pairs of rows
        of `x1`.

        The method
        - extracts the continuous columns through `continuous_columns` and
          computes distances ignoring the possibility of nans;
        - recomputes the distances between pairs of rows that yielded nans;
        - adds the contributions of discrete columns using the same function
          as the Euclidean distance;
        - when only `x1` is given, calls `_distance.lower_to_symmetric` on
          the matrix.

        Progress is reported through `self.callback`, which is split into
        steps by `StepwiseCallbacks`.
        """
        # One sub-callback is requested per stage, in the order the stages run.
        callbacks = StepwiseCallbacks(self.callback, [5, 5, 60, 30])
        two_tables = x2 is not None

        if self.continuous.any():
            data1, data2 = self.continuous_columns(
                x1, x2, self.medians, 2 * self.mads)
            distances = _distance.manhattan_rows_cont(
                data1, data2, two_tables, callbacks.next())
            if self.normalize:
                _distance.fix_manhattan_rows_normalized(
                    distances, data1, data2, two_tables, callbacks.next())
            else:
                _distance.fix_manhattan_rows(
                    distances, data1, data2, self.medians, self.mads,
                    self.dist_missing2_cont, two_tables, callbacks.next())
        else:
            # No continuous columns: start from an all-zero matrix
            n_cols = x2.shape[0] if two_tables else x1.shape[0]
            distances = np.zeros((x1.shape[0], n_cols))

        if np.any(self.discrete):
            data1, data2 = self.discrete_columns(x1, x2)
            # For discrete attributes, Euclidean is same as Manhattan
            _distance.euclidean_rows_discrete(
                distances, data1, data2, self.dist_missing_disc,
                self.dist_missing2_disc, two_tables, callbacks.next())

        if not two_tables:
            _distance.lower_to_symmetric(distances, callbacks.next())
        return distances
示例#4
0
    def compute_distances(self, x1, x2=None):
        """
        Return Manhattan distances between the rows of `x1` and the rows of
        `x2`; if `x2` is `None` (the default), between all pairs of rows
        of `x1`.

        The method
        - extracts the continuous columns through `continuous_columns` and
          computes distances ignoring the possibility of nans;
        - recomputes the distances between pairs of rows that yielded nans;
        - adds the contributions of discrete columns using the same function
          as the Euclidean distance;
        - when only `x1` is given, calls `_distance.lower_to_symmetric` on
          the matrix.
        """
        two_tables = x2 is not None

        if self.continuous.any():
            data1, data2 = self.continuous_columns(
                x1, x2, self.medians, 2 * self.mads)
            distances = _distance.manhattan_rows_cont(
                data1, data2, two_tables)
            if self.normalize:
                _distance.fix_manhattan_rows_normalized(
                    distances, data1, data2, two_tables)
            else:
                _distance.fix_manhattan_rows(
                    distances, data1, data2, self.medians, self.mads,
                    self.dist_missing2_cont, two_tables)
        else:
            # No continuous columns: start from an all-zero matrix
            n_cols = x2.shape[0] if two_tables else x1.shape[0]
            distances = np.zeros((x1.shape[0], n_cols))

        if np.any(self.discrete):
            data1, data2 = self.discrete_columns(x1, x2)
            # For discrete attributes, Euclidean is same as Manhattan
            _distance.euclidean_rows_discrete(
                distances, data1, data2, self.dist_missing_disc,
                self.dist_missing2_disc, two_tables)

        if not two_tables:
            _distance.lower_to_symmetric(distances)
        return distances