def calculate_similarity(user_feature, article_feature):
    """Return the user/article cosine similarity as an integer score.

    The similarity is taken from entry ``[0, 0]`` of
    ``_cosine_similarity(user_feature, article_feature)``, multiplied by
    1000 and truncated with ``int``.

    Returns 0 when ``user_feature`` is None or when the similarity result
    is falsy.
    """
    # NOTE: user_feature is a matrix and does not support truth testing,
    # so it is compared against None explicitly.
    if user_feature is not None:
        similarity = _cosine_similarity(user_feature, article_feature)
        if similarity:
            return int(similarity[0, 0] * 1000)
    return 0
def batch_calculate_similarity(user_feature, article_matrix):
    """Return one similarity value per row of ``article_matrix`` as a list.

    When ``user_feature`` is given, the result is the first row of
    ``_cosine_similarity(user_feature, article_matrix)`` converted with
    ``tolist()``.

    When ``user_feature`` is None, no similarity is computed and the row
    indices ``0 .. n-1`` of ``article_matrix`` are returned instead.
    NOTE(review): these are placeholder values, not real similarities --
    confirm that callers expect this.
    """
    # NOTE: user_feature is a matrix and does not support truth testing,
    # so it is compared against None explicitly.
    if user_feature is None:
        # ``xrange`` only exists on Python 2; ``list(range(...))`` works on
        # both major versions and keeps the return type a list, matching
        # the branch below.
        return list(range(article_matrix.shape[0]))
    sims_matrix = _cosine_similarity(user_feature, article_matrix)
    return sims_matrix[0,].tolist()
def cosine_similarity(x, y):
    """Return entry ``[0, 0]`` of ``_cosine_similarity(x, y)``.

    Convenience wrapper for callers that want a single value rather than
    the full result. Any further entries of the result are ignored.
    """
    pairwise = _cosine_similarity(x, y)
    return pairwise[0, 0]
def _neighbour_adjustment(curr_user, user_rest, feature_cols, neighbour_stars,
                          rest_average, min_similarity=0.5):
    """Return the similarity-weighted mean deviation of neighbour ratings.

    Args:
        curr_user: row of the users table for the user being scored.
        user_rest: rows of the users table for the other reviewers of the
            restaurant.
        feature_cols: columns of the users table used as the feature vector
            for the cosine similarity.
        neighbour_stars: ratings of the other reviewers, indexed like
            ``user_rest``.
        rest_average: average rating of the restaurant.
        min_similarity: similarities not strictly above this value are
            replaced by 0 and therefore carry no weight.

    Returns:
        ``sum(w * (stars - rest_average)) / sum(w)`` over the neighbours,
        or 0 when every weight is 0.
    """
    similarities = _cosine_similarity(
        curr_user[feature_cols].values.reshape(1, -1),
        user_rest[feature_cols])
    weights = _pd.Series(data=similarities[0], index=user_rest.index)
    weights = weights.where(weights > min_similarity, 0)

    denominator = weights.sum()
    if denominator == 0:
        # No neighbour passed the threshold. Dividing would give 0 / 0 = NaN;
        # return 0 instead, the same value the caller uses when there are
        # no neighbours at all.
        return 0
    numerator = (weights * (neighbour_stars - rest_average)).sum()
    return numerator / denominator


def sub_set_coll_scores(review_set, review_hist, users, restaurants):
    """Fill the collaborative score columns of ``review_set`` in place.

    For every row of ``review_set`` three scores are written to the columns
    ``coll_score``, ``coll_score_bin`` and ``coll_score_real``. Each score
    is the row's ``cuisine_av_hist*`` value plus a similarity-weighted
    adjustment computed from the other users who reviewed the same
    restaurant (reviews taken from both ``review_set`` and ``review_hist``).
    When the restaurant has no other reviewers, or none of them is similar
    enough, the adjustment is 0.

    Relies on the module-level names ``rev_cols``, ``aggregate``,
    ``cols_std``, ``cols_bin`` and ``cols_real``, which are defined outside
    this function.

    Args:
        review_set: reviews to score; needs the columns ``user_id``,
            ``business_id``, ``cuisine_av_hist``, ``cuisine_av_hist_bin``
            and ``cuisine_av_hist_real``. Modified in place.
        review_hist: additional reviews, filtered by ``business_id``.
        users: user table, looked up by user id through ``.loc``.
        restaurants: restaurant table, looked up by business id; needs the
            columns ``average_stars``, ``average_stars_bin`` and
            ``average_stars_real``.

    Returns:
        None. Progress is printed every 1000 rows.
    """
    tot = review_set.shape[0]
    rest_id = None
    out_cols = ['coll_score', 'coll_score_bin', 'coll_score_real']

    for count, (rid, row) in enumerate(review_set.iterrows(), start=1):
        user_id = row['user_id']
        curr_user = users.loc[user_id]
        old_rest_id = rest_id
        rest_id = row['business_id']

        if old_rest_id != rest_id:
            # The per-restaurant frames are rebuilt only when the restaurant
            # differs from the previous row.
            # NOTE(review): presumably review_set is ordered by business_id
            # so that this cache is effective -- confirm against the caller.
            review_rest_new = review_set.loc[
                review_set.business_id == rest_id, rev_cols]
            review_rest_old = review_hist.loc[
                review_hist.business_id == rest_id, rev_cols]
            tmp_review_rest = _pd.concat([review_rest_new, review_rest_old])
            tmp_review_rest = tmp_review_rest.groupby('user_id').apply(aggregate)
            tmp_user_rest = users.loc[users.index.isin(tmp_review_rest.index)]

        # Neighbours are all reviewers of the restaurant except the current
        # user.
        review_rest = tmp_review_rest.drop(user_id)
        user_rest = tmp_user_rest.drop(user_id)
        assert review_rest.shape[0] == user_rest.shape[0], "different shapes: " + str(review_rest.shape) + " vs " + str(user_rest.shape)

        a_u = row['cuisine_av_hist']
        a_u_bin = row['cuisine_av_hist_bin']
        a_u_real = row['cuisine_av_hist_real']

        if user_rest.empty:
            res = 0
            res_bin = 0
            res_real = 0
        else:
            a_r = restaurants.loc[rest_id, 'average_stars']
            res = _neighbour_adjustment(
                curr_user, user_rest, cols_std, review_rest['stars'], a_r)

            a_r_bin = restaurants.loc[rest_id, 'average_stars_bin']
            # Only the "bin" variant replaces missing neighbour ratings with
            # the restaurant average.
            res_bin = _neighbour_adjustment(
                curr_user, user_rest, cols_bin,
                review_rest['stars_bin'].fillna(a_r_bin), a_r_bin)

            a_r_real = restaurants.loc[rest_id, 'average_stars_real']
            res_real = _neighbour_adjustment(
                curr_user, user_rest, cols_real,
                review_rest['stars_real'], a_r_real)

        vals = [a_u + res, a_u_bin + res_bin, a_u_real + res_real]
        review_set.loc[rid, out_cols] = vals

        if count % 1000 == 0:
            percent = (count / tot) * 100
            print("process {4}\t- row {0}/{1}\t- {2:.3f}%\t- {3}"
                  .format(count, tot, percent, _time.asctime(), _os.getpid()))