def test_weighted_distance(self):
    """Check weighted_euclidean() against euclidean() on pre-scaled inputs.

    Each coordinate of self.x1 / self.x2 is multiplied by sqrt(weight) and
    the plain euclidean distance of the scaled vectors is used as the
    reference value for weighted_euclidean(self.x1, self.x2, weights).
    """
    test_weights = [2, 2, 1, 1]
    scaled_x1 = [sqrt(w) * v for w, v in zip(test_weights, self.x1)]
    scaled_x2 = [sqrt(w) * v for w, v in zip(test_weights, self.x2)]

    ref_res = euclidean(scaled_x1, scaled_x2)
    test_res = weighted_euclidean(self.x1, self.x2, test_weights)

    # The reference and the tested value are computed along different
    # floating-point paths, so compare with a tolerance rather than exactly.
    self.assertAlmostEqual(test_res, ref_res)
def test_unweighted_distnace(self):
    """Test the unweighted (or equally weighted) distance calculation,
    which is effectively the same as the standard euclidean distance.
    """
    ref_res = euclidean(self.x1, self.x2)
    test_res = weighted_euclidean(self.x1, self.x2)

    # Both sides are floats produced by different implementations; compare
    # with a tolerance so harmless rounding differences do not fail the test.
    self.assertAlmostEqual(test_res, ref_res)
def fitted_dist_func(self, x, y):
    """ Return the distance between x and y under the fitted weights

    Parameters:
    ----------
    x, y: the two observations; passed unchanged to weighted_euclidean()

    Returns:
    --------
    the value of weighted_euclidean(x, y, self._transform_matrix)

    Raises:
    -------
    ValueError: if self._transform_matrix is None, i.e. there are no
        weights to compute the distance with
    """
    weights = self._transform_matrix
    if weights is None:
        # Fail loudly instead of hitting an unbound local / returning None.
        raise ValueError(
            "no transform matrix available: fit the model before "
            "calling fitted_dist_func()")
    return weighted_euclidean(x, y, weights)
def _fit(self, X, S, D=None):
    """ Fit the model with given information: X, S, D

    Fit the learning distance metrics:
    (1) if only S is given, all pairs of items in X but not in S are
        considered as in D;
    (2) if both S and D are given, items in X that are in neither S nor D
        are removed from the fitting process.

    Parameters:
    ----------
    X: table of observations, shape (n_sample, n_features). The
        observation IDs are read from X.index, and X must also provide
        .shape and .ix (presumably a pandas.DataFrame - confirm with
        the callers)
    S: {vector-like, list} a list of tuples, each defining a pair of
        observation IDs known to be similar
    D: {vector-like, list} a list of tuples, each defining a pair of
        observation IDs known to be different (optional)

    Returns:
    --------
    _transform_matrix: the optimised weights (one initial weight per
        feature is given to the optimiser) after vec_normalized()
    _ratio: float, objective value at the optimum divided by the
        objective value at the initial weights

    Also stores the two returned values on self and sets self._dist_func
    to a weighted_euclidean() closure over the newly fitted weights.
    """
    try:
        ids = X.index.tolist()
    except ValueError:
        print("Oops! No 'ID' column is found !")
        # Nothing below can run without the ids; re-raise instead of
        # falling through to a NameError on `ids`.
        raise

    _, n_features = X.shape

    bnds = [(0, None)] * n_features  # every weight is bounded below by 0
    init = [1] * n_features  # initial weights

    if D is None:
        # No explicit "different" pairs: use every pair not listed in S.
        D = get_exclusive_pairs(list(combinations(ids, 2)), S)
    else:
        # D is provided: keep only the observations that appear in S or D.
        covered_items = get_unique_items(S, D)
        keep_items = [find_index(i, ids) for i in ids if i in covered_items]
        # NOTE(review): .ix is a deprecated pandas indexer, and the row
        # positions computed below still come from the unfiltered `ids`
        # while X has just been filtered - confirm the helpers index X
        # consistently.
        X = X.ix[keep_items, :]

    # Convert ids in D and S into row index, in order to provide them to
    # a set of two distance functions, squared_sum_grouped_dist() and
    # sum_grouped_dist()
    S_idx = [(find_index(a, ids), find_index(b, ids)) for (a, b) in S]
    D_idx = [(find_index(a, ids), find_index(b, ids)) for (a, b) in D]

    def objective_func(w):
        # Value for the similar pairs minus value for the different pairs.
        a = squared_sum_grouped_dist(S_idx, X, w) * 1.0
        b = sum_grouped_dist(D_idx, X, w) * 1.0
        return a - b

    if self._is_debug:
        # Single-argument print calls give the same output on Python 2
        # and Python 3.
        try:
            print("Examples of S: %s %s" % (S[:5], len(S)))
            print("Examples of D: %s %s" % (D[:5], len(D)))
            print("Examples of X: %s %s" % (X[:5, :], X.shape))
        except Exception:
            # Slicing failed for one of the inputs: print them in full.
            print("Examples of S: %s %s" % (S, len(S)))
            print("Examples of D: %s %s" % (D, len(D)))
            print("Examples of X: %s %s" % (X, X.shape))

    start_time = time.time()
    fitted = minimize(objective_func, init, method="L-BFGS-B",
                      bounds=bnds,
                      options={'maxiter': 10, 'disp': True})
    duration = time.time() - start_time
    if self._report_excution_time:
        print("--- %.2f seconds ---" % duration)

    # Store the new weights first and let the distance function close over
    # them, so it never uses the weights from before this fit.
    weights = vec_normalized(fitted['x'])
    self._transform_matrix = weights
    # optimized value vs. value of initial setting
    self._ratio = fitted['fun'] / objective_func(init)
    self._dist_func = lambda x, y: weighted_euclidean(x, y, weights)

    return (self._transform_matrix, self._ratio)
def _fit(self, X, S, D=None):
    """ Fit the model with given information: X, S, D

    Fit the learning distance metrics:
    (1) if only S is given, all pairs of items in X but not in S are
        considered as in D;
    (2) if both S and D are given, items in X that are in neither S nor D
        are removed from the fitting process.

    Parameters:
    ----------
    X: table of observations, shape (n_sample, n_features). The
        observation IDs are read from X.index, and X must also provide
        .shape and .ix (presumably a pandas.DataFrame - confirm with
        the callers)
    S: {vector-like, list} a list of tuples, each defining a pair of
        observation IDs known to be similar
    D: {vector-like, list} a list of tuples, each defining a pair of
        observation IDs known to be different (optional)

    Returns:
    --------
    _transform_matrix: the optimised weights (one initial weight per
        feature is given to the optimiser) after vec_normalized()
    _ratio: float, objective value at the optimum divided by the
        objective value at the initial weights

    Also stores the two returned values on self and sets self._dist_func
    to a weighted_euclidean() closure over the newly fitted weights.
    """
    try:
        ids = X.index.tolist()
    except ValueError:
        print("Oops! No 'ID' column is found !")
        # Nothing below can run without the ids; re-raise instead of
        # falling through to a NameError on `ids`.
        raise

    _, n_features = X.shape

    bnds = [(0, None)] * n_features  # every weight is bounded below by 0
    init = [1] * n_features  # initial weights

    if D is None:
        # No explicit "different" pairs: use every pair not listed in S.
        D = get_exclusive_pairs(list(combinations(ids, 2)), S)
    else:
        # D is provided: keep only the observations that appear in S or D.
        covered_items = get_unique_items(S, D)
        keep_items = [find_index(i, ids) for i in ids if i in covered_items]
        # NOTE(review): .ix is a deprecated pandas indexer, and the row
        # positions computed below still come from the unfiltered `ids`
        # while X has just been filtered - confirm the helpers index X
        # consistently.
        X = X.ix[keep_items, :]

    # Convert ids in D and S into row index, in order to provide them to
    # a set of two distance functions, squared_sum_grouped_dist() and
    # sum_grouped_dist()
    S_idx = [(find_index(a, ids), find_index(b, ids)) for (a, b) in S]
    D_idx = [(find_index(a, ids), find_index(b, ids)) for (a, b) in D]

    def objective_func(w):
        # Value for the similar pairs minus value for the different pairs.
        a = squared_sum_grouped_dist(S_idx, X, w) * 1.0
        b = sum_grouped_dist(D_idx, X, w) * 1.0
        return a - b

    if self._is_debug:
        # Single-argument print calls give the same output on Python 2
        # and Python 3.
        try:
            print("Examples of S: %s %s" % (S[:5], len(S)))
            print("Examples of D: %s %s" % (D[:5], len(D)))
            print("Examples of X: %s %s" % (X[:5, :], X.shape))
        except Exception:
            # Slicing failed for one of the inputs: print them in full.
            print("Examples of S: %s %s" % (S, len(S)))
            print("Examples of D: %s %s" % (D, len(D)))
            print("Examples of X: %s %s" % (X, X.shape))

    start_time = time.time()
    fitted = minimize(objective_func, init, method="L-BFGS-B",
                      bounds=bnds,
                      options={
                          'maxiter': 10,
                          'disp': True
                      })
    duration = time.time() - start_time
    if self._report_excution_time:
        print("--- %.2f seconds ---" % duration)

    # Store the new weights first and let the distance function close over
    # them, so it never uses the weights from before this fit.
    weights = vec_normalized(fitted['x'])
    self._transform_matrix = weights
    # optimized value vs. value of initial setting
    self._ratio = fitted['fun'] / objective_func(init)
    self._dist_func = lambda x, y: weighted_euclidean(x, y, weights)

    return (self._transform_matrix, self._ratio)