def root_mean_square_error(y_real, y_pred):
    """
    Computes the root mean squared difference (RMSE) between
    predicted and actual ratings for users.

    Parameters
    ----------
    y_real : array-like
        The real ratings.
    y_pred : array-like
        The predicted ratings.

    Returns
    -------
    Positive floating point value: the best value is 0.0.
    Returns the root mean squared error.
    """
    y_real, y_pred = check_arrays(y_real, y_pred)
    return np.sqrt(np.sum((y_pred - y_real) ** 2) / y_real.shape[0])
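# The metric functions in this module delegate input validation to a
# `check_arrays` helper that is not shown here. A minimal stand-in, assuming
# it only needs to coerce inputs to ndarrays and verify matching first
# dimensions (keyword options such as sparse_format are accepted but
# ignored), might look like this:
import numpy as np

def check_arrays(*arrays, **kwargs):
    """Hypothetical simplification: convert inputs to ndarrays and check
    that their first dimensions agree."""
    checked = [np.asanyarray(a) for a in arrays]
    sizes = set(a.shape[0] for a in checked)
    if len(sizes) > 1:
        raise ValueError(
            "Found arrays with inconsistent first dimensions: %r"
            % sorted(sizes))
    return checked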
def mean_absolute_error(y_real, y_pred):
    """
    Computes the average absolute difference (MAE) between
    predicted and actual ratings for users.

    Parameters
    ----------
    y_real : array-like
        The real ratings.
    y_pred : array-like
        The predicted ratings.

    Returns
    -------
    Positive floating point value: the best value is 0.0.
    Returns the mean absolute error.
    """
    y_real, y_pred = check_arrays(y_real, y_pred)
    return np.sum(np.abs(y_pred - y_real)) / y_real.size
def normalized_mean_absolute_error(y_real, y_pred, max_rating, min_rating):
    """
    Computes the normalized average absolute difference (NMAE) between
    predicted and actual ratings for users.

    Parameters
    ----------
    y_real : array-like
        The real ratings.
    y_pred : array-like
        The predicted ratings.
    max_rating : float
        The maximum rating of the model.
    min_rating : float
        The minimum rating of the model.

    Returns
    -------
    Positive floating point value: the best value is 0.0.
    Returns the normalized mean absolute error.
    """
    y_real, y_pred = check_arrays(y_real, y_pred)
    mae = mean_absolute_error(y_real, y_pred)
    return mae / (max_rating - min_rating)
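# Usage sketch for the three rating metrics above; the 4-item sample and the
# 1-5 rating scale are illustrative assumptions, not fixtures of this module.
import numpy as np

y_real = np.array([4.0, 3.0, 5.0, 2.0])
y_pred = np.array([3.5, 3.0, 4.0, 2.5])

print(root_mean_square_error(y_real, y_pred))  # ~0.612
print(mean_absolute_error(y_real, y_pred))     # 0.5
print(normalized_mean_absolute_error(y_real, y_pred,
                                     max_rating=5.0, min_rating=1.0))  # 0.125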
def lcss_dist(X, Y, delta, epsilon):
    """Compute the LCSS distance between X and Y using dynamic programming.

    :param X (array): time series feature array denoted by X
    :param Y (array): time series feature array denoted by Y
    :param delta (int): time sample matching threshold
    :param epsilon (float): amplitude matching threshold
    :returns: distance between X and Y with the best alignment
    :Reference: M. Vlachos et al., "Discovering Similar Multidimensional
        Trajectories", 2002.
    """
    X, Y = check_arrays(X, Y)
    return _lcss_dist(X, Y, delta, epsilon)
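# `_lcss_dist` is not defined in this module. The sketch below is a
# hypothetical reconstruction of the standard LCSS dynamic program from the
# cited Vlachos et al. paper, assuming time series are stored as
# (n_feature, n_samples) arrays with features in rows (the layout dtw_dist
# below relies on); the actual helper may differ.
import numpy as np

def _lcss_dist(X, Y, delta, epsilon):
    """Hypothetical sketch: two samples match when every feature differs by
    less than epsilon and their time indices differ by at most delta."""
    n, m = X.shape[1], Y.shape[1]
    # lcss[i, j] holds the LCSS length of X[:, :i] and Y[:, :j].
    lcss = np.zeros((n + 1, m + 1))
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if (abs(i - j) <= delta
                    and np.all(np.abs(X[:, i - 1] - Y[:, j - 1]) < epsilon)):
                lcss[i, j] = lcss[i - 1, j - 1] + 1
            else:
                lcss[i, j] = max(lcss[i - 1, j], lcss[i, j - 1])
    # Turn the similarity into a distance in [0, 1].
    return 1.0 - lcss[n, m] / min(n, m)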
def edr_dist(X, Y, epsilon):
    """Compute the EDR distance between X and Y using dynamic programming.

    :param X (array): time series feature array denoted by X
    :param Y (array): time series feature array denoted by Y
    :param epsilon (float): matching threshold
    :returns: distance between X and Y with the best alignment
    :Reference: L. Chen et al., "Robust and Fast Similarity Search for
        Moving Object Trajectories", 2005.
    """
    X, Y = check_arrays(X, Y)
    X = standardization(X)
    Y = standardization(Y)
    return _edr_dist(X, Y, epsilon)
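# Neither `standardization` nor `_edr_dist` is defined in this module. The
# sketches below are hypothetical reconstructions under the same
# (n_feature, n_samples) layout assumption: Chen et al. z-normalize the
# series first, then compute an edit distance in which a matching pair of
# samples costs 0 and every other edit costs 1.
import numpy as np

def standardization(X):
    """Hypothetical sketch: z-normalize each feature (row) of X."""
    return (X - X.mean(axis=1, keepdims=True)) / X.std(axis=1, keepdims=True)

def _edr_dist(X, Y, epsilon):
    """Hypothetical sketch of the EDR dynamic program."""
    n, m = X.shape[1], Y.shape[1]
    d = np.zeros((n + 1, m + 1))
    d[:, 0] = np.arange(n + 1)  # delete every sample of an X prefix
    d[0, :] = np.arange(m + 1)  # insert every sample of a Y prefix
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            match = np.all(np.abs(X[:, i - 1] - Y[:, j - 1]) <= epsilon)
            d[i, j] = min(d[i - 1, j - 1] + (0 if match else 1),  # substitute
                          d[i - 1, j] + 1,                        # delete
                          d[i, j - 1] + 1)                        # insert
    return d[n, m]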
def dtw_dist(X, Y, w=np.inf, mode="dependent"):
    """Compute the multidimensional DTW distance between X and Y using
    dynamic programming.

    :param X (array): time series feature array denoted by X
    :param Y (array): time series feature array denoted by Y
    :param w (int): window size (default=Inf)
    :param mode (string): "dependent" or "independent" (default="dependent")
    :returns: distance between X and Y with the best alignment
    :Reference: https://www.cs.unm.edu/~mueen/DTW.pdf
    """
    X, Y = check_arrays(X, Y)
    if mode == "dependent":
        dist = _dtw_dist(X, Y, w)
    elif mode == "independent":
        n_feature = X.shape[0]
        dist = 0
        for i in range(n_feature):
            # Independent DTW sums the per-feature DTW distances.
            dist += _dtw_dist(X[[i], :], Y[[i], :], w)
    else:
        raise ValueError(
            'The mode must be either "dependent" or "independent".')
    return dist
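# `_dtw_dist` is likewise not shown. A sketch of dependent (multivariate)
# DTW under the same layout assumption, with w interpreted as the half-width
# of a Sakoe-Chiba band (w=inf disables the constraint), is:
import numpy as np

def _dtw_dist(X, Y, w):
    """Hypothetical sketch: DTW over column vectors with Euclidean step cost
    and an optional warping-window constraint."""
    n, m = X.shape[1], Y.shape[1]
    w = max(w, abs(n - m))  # the band must at least cover the length gap
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        if np.isinf(w):
            j_lo, j_hi = 1, m
        else:
            j_lo, j_hi = max(1, i - int(w)), min(m, i + int(w))
        for j in range(j_lo, j_hi + 1):
            cost = np.linalg.norm(X[:, i - 1] - Y[:, j - 1])
            D[i, j] = cost + min(D[i - 1, j],       # insertion
                                 D[i, j - 1],       # deletion
                                 D[i - 1, j - 1])   # match
    return D[n, m]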
def precision_recall_fscore(y_real, y_pred, beta=1.0):
    """Compute precisions, recalls, and f-measures for recommender systems.

    The precision is the ratio :math:`tp / (tp + fp)` where tp is the number
    of true positives and fp the number of false positives. In recommender
    systems, this is the fraction of recommended items that are relevant.

    The recall is the ratio :math:`tp / (tp + fn)` where tp is the number of
    true positives and fn the number of false negatives. In recommender
    systems, this is the fraction of relevant items that are recommended.

    The F_beta score can be interpreted as a weighted harmonic mean of the
    precision and recall, where an F_beta score reaches its best value at 1
    and worst score at 0. The F_beta score weights recall beta times as much
    as precision; beta = 1.0 means recall and precision are equally
    important.

    Parameters
    ----------
    y_real : array, shape = [n_samples]
        true recommended items
    y_pred : array, shape = [n_samples]
        predicted recommended items
    beta : float, 1.0 by default
        the strength of recall versus precision in the f-score

    Returns
    -------
    precision: array, shape = [n_users], dtype = np.double
    recall: array, shape = [n_users], dtype = np.double
    fscore: array, shape = [n_users], dtype = np.double

    References
    ----------
    http://en.wikipedia.org/wiki/Precision_and_recall
    """
    y_real, y_pred = check_arrays(y_real, y_pred)
    assert beta > 0

    n_users = y_real.shape[0]
    precision = np.zeros(n_users, dtype=np.double)
    recall = np.zeros(n_users, dtype=np.double)
    fscore = np.zeros(n_users, dtype=np.double)

    try:
        # oddly, we may get an "invalid" rather than a "divide" error here
        old_err_settings = np.seterr(divide='ignore', invalid='ignore')

        for i, y_items_pred in enumerate(y_pred):
            intersection_size = np.intersect1d(y_items_pred, y_real[i]).size
            # precision = tp / (tp + fp): hits over the number of predictions
            precision[i] = (intersection_size / float(len(y_items_pred))) \
                if len(y_items_pred) else 0.0
            # recall = tp / (tp + fn): hits over the number of real items
            recall[i] = (intersection_size / float(len(y_real[i]))) \
                if len(y_real[i]) else 0.0

        # handle division by 0.0 in precision and recall
        precision[np.isnan(precision)] = 0.0
        recall[np.isnan(recall)] = 0.0

        # fbeta score
        beta2 = beta ** 2
        fscore = (1 + beta2) * (precision * recall) \
            / (beta2 * precision + recall)

        # handle division by 0.0 in fscore
        fscore[(precision + recall) == 0.0] = 0.0
    finally:
        np.seterr(**old_err_settings)

    return precision, recall, fscore
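# Usage sketch: two users, each with equal-length lists of relevant and
# recommended item ids (illustrative data).
import numpy as np

y_real = np.array([[1, 2, 3], [4, 5, 6]])  # items each user actually liked
y_pred = np.array([[1, 2, 9], [7, 8, 9]])  # items recommended to each user

precision, recall, fscore = precision_recall_fscore(y_real, y_pred)
print(precision)  # [ 0.667  0.   ]: 2 of 3 (resp. 0 of 3) recommendations hit
print(recall)     # [ 0.667  0.   ]
print(fscore)     # [ 0.667  0.   ]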
def smacof(similarities, metric=True, n_components=2, init=None, n_init=8,
           n_jobs=1, max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """
    Computes multidimensional scaling using the SMACOF (Scaling by Majorizing
    a Complicated Function) algorithm.

    The SMACOF algorithm is a multidimensional scaling algorithm: it
    minimizes an objective function, the *stress*, using a majorization
    technique. The Stress Majorization, also known as the Guttman Transform,
    guarantees a monotone convergence of stress, and is more powerful than
    traditional techniques such as gradient descent.

    The SMACOF algorithm for metric MDS can be summarized by the following
    steps:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress.
    3. Compute the Guttman Transform.
    4. Iterate 2 and 3 until convergence.

    The nonmetric algorithm adds a monotonic regression step before computing
    the stress.

    Parameters
    ----------
    similarities : symmetric ndarray, shape (n_samples, n_samples)
        similarities between the points

    metric : boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm

    n_components : int, optional, default: 2
        number of dimensions in which to immerse the similarities,
        overridden if an initial array is provided

    init : {None or ndarray of shape (n_samples, n_components)}, optional
        if None, randomly chooses the initial configuration
        if ndarray, initializes the SMACOF algorithm with this array

    n_init : int, optional, default: 8
        Number of times the SMACOF algorithm will be run with different
        initializations. The final result will be the best output of the
        n_init consecutive runs in terms of stress.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose : int, optional, default: 0
        level of verbosity

    eps : float, optional, default: 1e-3
        relative tolerance w.r.t stress to declare convergence

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is given,
        it fixes the seed. Defaults to the global numpy random number
        generator.

    Returns
    -------
    X : ndarray (n_samples, n_components)
        Coordinates of the n_samples points in an n_components-space

    stress : float
        The final value of the stress (sum of squared distances between the
        disparities and the distances for all constrained points)

    Notes
    -----
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29 (1964)
    """
    similarities, = check_arrays(similarities, sparse_format='dense')
    random_state = check_random_state(random_state)

    if hasattr(init, '__array__'):
        init = np.asarray(init).copy()
        if n_init != 1:
            warnings.warn(
                'Explicit initial positions passed: '
                'performing only one init of the MDS instead of %d' % n_init)
            n_init = 1

    best_pos, best_stress = None, None
    if n_jobs == 1:
        for it in range(n_init):
            pos, stress = _smacof_single(similarities, metric=metric,
                                         n_components=n_components,
                                         init=init, max_iter=max_iter,
                                         verbose=verbose, eps=eps,
                                         random_state=random_state)
            if best_stress is None or stress < best_stress:
                best_stress = stress
                best_pos = pos.copy()
    else:
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
            delayed(_smacof_single)(
                similarities, metric=metric, n_components=n_components,
                init=init, max_iter=max_iter, verbose=verbose, eps=eps,
                random_state=seed)
            for seed in seeds)
        positions, stress = zip(*results)
        best = np.argmin(stress)
        best_stress = stress[best]
        best_pos = positions[best]

    return best_pos, best_stress
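# Usage sketch: embed four points in the plane from a small symmetric
# dissimilarity matrix. The data are illustrative, and the call assumes the
# `_smacof_single` helper that smacof delegates to is available in this
# module.
import numpy as np

similarities = np.array([[0.0, 1.0, 2.0, 1.0],
                         [1.0, 0.0, 1.0, 2.0],
                         [2.0, 1.0, 0.0, 1.0],
                         [1.0, 2.0, 1.0, 0.0]])

pos, stress = smacof(similarities, n_components=2, n_init=4, random_state=0)
print(pos.shape)  # (4, 2)
print(stress)     # stress of the best of the 4 runs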