def calcular_similitud(self, pair_key, lines):
    """Compute similarity scores for one item pair from its co-ratings.

    Accumulates the sums, squared sums and cross-products of the ratings
    given to item X and item Y, then hands those aggregates to the
    module-level similarity helpers (``correlation``,
    ``regularized_correlation`` and ``cosine``).

    NOTE(review): no rescaling to [0, 1] is performed in this method; if
    the downstream numerical sort relies on that range, it must come from
    the helpers -- confirm.

    Args:
        pair_key: Two-element sequence holding the names of item X and
            item Y.
        lines: Iterable of ``(rating_x, rating_y)`` pairs; presumably one
            entry per user who rated both items.

    Yields:
        A single ``((item_x_name, item_y_name), (correlation, cosine,
        regularized_correlation, jaccard, n))`` record, where ``n`` is the
        number of co-ratings consumed and ``jaccard`` is always ``0.0``
        in this variant.
    """
    name_x, name_y = pair_key

    total_x = 0.0
    total_y = 0.0
    squares_x = 0.0
    squares_y = 0.0
    products = 0.0
    count = 0

    # Single pass only: ``lines`` may be a one-shot iterator.
    for count, (rating_x, rating_y) in enumerate(lines, start=1):
        squares_x += rating_x * rating_x
        squares_y += rating_y * rating_y
        products += rating_x * rating_y
        total_x += rating_x
        total_y += rating_y

    pearson = correlation(
        count, products, total_x, total_y, squares_x, squares_y
    )
    regularized = regularized_correlation(
        count,
        products,
        total_x,
        total_y,
        squares_x,
        squares_y,
        PRIOR_COUNT,
        PRIOR_CORRELATION,
    )
    cosine_score = cosine(products, sqrt(squares_x), sqrt(squares_y))
    jaccard_score = 0.0  # not computed in this variant

    scores = (pearson, cosine_score, regularized, jaccard_score, count)
    yield (name_x, name_y), scores
def calculate_similarity(self, pair_key, lines):
    """Aggregate the co-ratings of an item pair and score their similarity.

    Walks once over the co-rating pairs, keeping running totals of each
    item's ratings, their squares and their cross-product, plus the number
    of co-ratings.  The totals are then passed to the module-level helpers
    ``correlation``, ``regularized_correlation`` and ``cosine``.

    NOTE(review): this method does not rescale anything to [0, 1]; if the
    later numerical sort depends on that range, the helpers must provide
    it -- confirm.

    Sample downstream records supplied by the original author (layout
    presumably ``item_x,item_y  similarity,n`` -- verify)::

        19,21   0.4,2
        21,19   0.4,2
        19,70   0.6,1
        70,19   0.6,1
        21,70   0.1,1
        70,21   0.1,1

    Args:
        pair_key: Two-element sequence with the names of item x and item y.
        lines: Iterable of ``(rating_x, rating_y)`` pairs; presumably one
            per user who rated both items.

    Yields:
        One ``((item_x_name, item_y_name), (correlation, cosine,
        regularized_correlation, jaccard, n))`` record; ``jaccard`` is
        fixed at ``0.0`` here and ``n`` is the co-rating count.
    """
    first_name, second_name = pair_key

    sx = sy = 0.0    # running sums of the ratings
    sxx = syy = 0.0  # running sums of the squared ratings
    sxy = 0.0        # running sum of the cross-products
    pairs_seen = 0

    for x_rating, y_rating in lines:
        pairs_seen += 1
        sx += x_rating
        sy += y_rating
        sxx += x_rating * x_rating
        syy += y_rating * y_rating
        sxy += x_rating * y_rating

    corr_value = correlation(pairs_seen, sxy, sx, sy, sxx, syy)
    reg_corr_value = regularized_correlation(
        pairs_seen, sxy, sx, sy, sxx, syy, PRIOR_COUNT, PRIOR_CORRELATION
    )
    norm_x = sqrt(sxx)
    norm_y = sqrt(syy)
    cos_value = cosine(sxy, norm_x, norm_y)

    yield (
        (first_name, second_name),
        (corr_value, cos_value, reg_corr_value, 0.0, pairs_seen),
    )
def calculate_similarity(self, pair_key, lines):
    """Score the similarity of two items from their paired ratings.

    Consumes ``lines`` once, accumulating per-item rating sums, squared
    sums and the cross-product sum together with the number of pairs, and
    forwards those aggregates to the module-level ``correlation``,
    ``regularized_correlation`` and ``cosine`` helpers.

    Args:
        pair_key: Two-element sequence with the names of the two items.
        lines: Iterable of ``(rating_x, rating_y)`` pairs; presumably one
            per user who rated both items -- verify against the caller.

    Yields:
        One ``((item_x_name, item_y_name), (correlation, cosine,
        regularized_correlation, jaccard, n))`` record.  ``jaccard`` is a
        constant ``0.0`` in this variant; ``n`` is the number of pairs
        consumed.
    """
    left_item, right_item = pair_key

    acc_x = 0.0
    acc_y = 0.0
    acc_xx = 0.0
    acc_yy = 0.0
    acc_xy = 0.0
    n_pairs = 0

    # ``enumerate`` tracks the pair count; n_pairs stays 0 for empty input.
    for n_pairs, (val_x, val_y) in enumerate(lines, start=1):
        acc_xx += val_x * val_x
        acc_yy += val_y * val_y
        acc_xy += val_x * val_y
        acc_y += val_y
        acc_x += val_x

    moments = (n_pairs, acc_xy, acc_x, acc_y, acc_xx, acc_yy)
    plain_corr = correlation(*moments)
    shrunk_corr = regularized_correlation(
        *moments, PRIOR_COUNT, PRIOR_CORRELATION
    )
    cos_score = cosine(acc_xy, sqrt(acc_xx), sqrt(acc_yy))
    jaccard_placeholder = 0.0

    yield (left_item, right_item), (
        plain_corr,
        cos_score,
        shrunk_corr,
        jaccard_placeholder,
        n_pairs,
    )
def calculate_similarity(self, pair_key, lines):
    """Score the similarity of two items, including a Jaccard measure.

    Consumes ``lines`` once, accumulating per-item rating sums, squared
    sums and the cross-product sum together with the number of co-ratings.
    The aggregates are passed to the module-level ``correlation``,
    ``regularized_correlation``, ``cosine`` and ``jaccard`` helpers.

    Args:
        pair_key: Two-element sequence with the names of item x and item y.
        lines: Iterable of ``(rating_x, rating_y, nx_count, ny_count)``
            records.  ``nx_count`` / ``ny_count`` are presumably the total
            number of ratings received by item x / item y -- verify
            against the step that emits them.

    Yields:
        One ``((item_x_name, item_y_name), (correlation, cosine,
        regularized_correlation, jaccard, n))`` record, where ``n`` is the
        number of records consumed.
    """
    item_xname, item_yname = pair_key

    sum_xx = sum_xy = sum_yy = sum_x = sum_y = 0.0
    n = n_x = n_y = 0

    for item_x, item_y, nx_count, ny_count in lines:
        sum_xx += item_x * item_x
        sum_yy += item_y * item_y
        sum_xy += item_x * item_y
        sum_x += item_x
        sum_y += item_y
        n += 1
        # Each count goes to its own item: x with x, y with y.  The values
        # from the last record win; they are presumably identical on every
        # record of a given pair.
        n_x = int(nx_count)
        n_y = int(ny_count)

    corr_sim = correlation(n, sum_xy, sum_x, sum_y, sum_xx, sum_yy)
    reg_corr_sim = regularized_correlation(
        n, sum_xy, sum_x, sum_y, sum_xx, sum_yy,
        PRIOR_COUNT, PRIOR_CORRELATION,
    )
    cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))
    jaccard_sim = jaccard(n, n_x, n_y)

    yield (item_xname, item_yname), (
        corr_sim, cos_sim, reg_corr_sim, jaccard_sim, n,
    )