def gather_candidate_products(self, n_recommendations):
    product_ids_set = set()
    if not config.is_anonymous(self.session_context.user_id):
        for strength, template_id in self.session_context.user_templates:
            template_user_activities = self.session_context.recent_activities_by_template_user.get(template_id, [])
            product_ids = {act["external_product_id"] for act in template_user_activities}
            product_ids_set |= product_ids
    return {self.get_suffix(): product_ids_set}
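# A minimal standalone sketch of the candidate-gathering pattern above, using plain dicts
# in place of a real session context. The data shapes ({template_id: [activity dicts]})
# mirror the method's assumptions; the sample values are hypothetical.
def _gather_candidates_sketch(user_templates, recent_activities_by_template_user):
    product_ids = set()
    for _strength, template_id in user_templates:
        activities = recent_activities_by_template_user.get(template_id, [])
        product_ids |= {act["external_product_id"] for act in activities}
    return product_ids

# Example: two template users with one overlapping product.
# _gather_candidates_sketch(
#     [(0.9, "u2"), (0.7, "u3")],
#     {"u2": [{"external_product_id": "p1"}],
#      "u3": [{"external_product_id": "p1"}, {"external_product_id": "p2"}]})
# --> {'p1', 'p2'}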
def update_summaries(session_context, activity):
    """ Performs the following updates:

        1) If the informed activity is the first popularity-defining activity for that
           (user, product) pair, increments the popularity of the given product, and sets
           the popularity flag of that (user, product) pair to True in the activities_summary.
        2) Resets the impressions count for that (user, product) pair.
        3) Updates the activities summary for that (user, product) pair.

        :param session_context: The session context.
        :param activity: The activity being processed, i.e., the trigger to the updates.
    """
    user = activity["external_user_id"]
    is_anonymous = config.is_anonymous(user)
    product = activity["external_product_id"]
    act_type = activity["activity"]
    act_rating = session_context.rating_by_activity.get(act_type)
    date = activity["created_at"]

    log.info("Processing summaries [user=%s, product=%s, type=%s]..." % (user, product, act_type))
    start = time()

    u_p_activity_summary = _get_current_user_product_summary(session_context, user, product, anonymous=is_anonymous)

    # Product popularity (if need be). The None check guards against unsupported activity types.
    should_increment_popularity = (act_rating is not None and
                                   act_rating >= session_context.min_rating_recommendable_from_user)
    if u_p_activity_summary is not None and u_p_activity_summary["contributed_for_popularity"]:
        should_increment_popularity = False
    log.info("Updating product {0} popularity...".format(product))
    session_context.data_proxy.update_product_popularity(product, date, should_increment_popularity)

    # Impressions summary
    log.info("Resetting impressions for user/product pair ({0}, {1})...".format(user, product))
    session_context.data_proxy.reset_impression_summary(user, product, anonymous=is_anonymous)

    # Activities summary
    log.info("Updating activities summary for user/product pair ({0}, {1})...".format(user, product))
    session_context.data_proxy.save_activity_summary(activity,
                                                     set_popularity_flag=should_increment_popularity,
                                                     anonymous=is_anonymous)

    log.info("---Done processing summaries [user=%s, product=%s, type=%s] (took %.6f seconds)"
             % (user, product, act_type, time() - start))
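# The popularity-update rule above boils down to a small predicate: a product's popularity
# is incremented only by the *first* popularity-defining activity of a (user, product) pair.
# A sketch under the same assumptions (integer ratings; `summary` is the stored activities
# summary dict for the pair, or None when the pair has no history):
def _should_increment_popularity_sketch(act_rating, min_recommendable_rating, summary):
    if act_rating is None or act_rating < min_recommendable_rating:
        return False  # the activity is not popularity-defining
    if summary is not None and summary.get("contributed_for_popularity"):
        return False  # this (user, product) pair has already contributed once
    return True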
def test_user_user_strengths_incremental_with_new_impressions_random(self):
    """ Tests whether the user x user strengths generated on a step-by-step basis
        match exactly those created from scratch.

        This test saves several random activities in a row, checking whether all
        strengths were correctly updated.
    """
    if not tests.INCLUDE_RANDOM_TESTS:
        return

    all_users = [u for u in self.db_proxy.fetch_all_user_ids()]
    all_products = [p for p in self.db_proxy.fetch_all_product_ids()]

    for i in range(100):
        user = random.choice(all_users)
        is_anonymous = config.is_anonymous(user)
        print("user: %s" % user)

        # Saves a couple of impressions for the chosen user
        date = pytz.utc.localize(dateutil.parser.parse("1988-11-06 6:00:00")) + dt.timedelta(seconds=2 * i)
        product1 = random.choice(all_products)
        product2 = random.choice(all_products)
        self.db_proxy.increment_impression_summary(user_id=user, product_id=product1,
                                                   date=date, anonymous=is_anonymous)
        self.db_proxy.increment_impression_summary(user_id=user, product_id=product2,
                                                   date=date, anonymous=is_anonymous)
        print("impressions --> %s, %s" % (product1, product2))

        ut.generate_templates(self.session_context)  # it is important to regenerate from scratch
                                                     # (with all new impressions)

        # Saves one activity for that same user
        product3 = random.choice(all_products)
        activity_type = random.choice(self.session_context.supported_activities)
        date = pytz.utc.localize(dateutil.parser.parse("1988-11-06 6:00:00")) + dt.timedelta(seconds=2 * i + 1)
        activity = {"external_user_id": user,
                    "external_product_id": product3,
                    "activity": activity_type,
                    "created_at": date}
        print("activity --> " + str(activity))

        ut.update_templates(self.session_context, activity)
        tasks.update_summaries(self.session_context, activity)

        self.compare_incremental_vs_from_scratch(
            target_users=[user] if self.session_context.impressions_enabled else None)
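# Note on the timestamps above: the 2*i / 2*i + 1 second offsets interleave impressions and
# activities on a strictly increasing timeline, so the activity of iteration i always
# postdates the impressions saved in that same iteration:
#   i=0: impressions at +0s, activity at +1s
#   i=1: impressions at +2s, activity at +3s, and so on.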
def process_impression(session_context, impression):
    user = impression["external_user_id"]
    is_anonymous = config.is_anonymous(user)
    product = impression["external_product_id"]
    date = impression["created_at"]

    log.info("Processing impression [user=%s, product=%s]..." % (user, product))
    start = time()

    try:
        session_context.data_proxy.increment_impression_summary(user, product, date, anonymous=is_anonymous)
        log.info("---Done processing impression [user=%s, product=%s] (took %.6f seconds)"
                 % (user, product, time() - start))
    except Exception as ex:
        # str(ex) is safer than ex.args[0], which raises IndexError for argless exceptions
        log.error("Error while processing impression [user=%s, product=%s], message=%s, stack_trace=%s"
                  % (user, product, str(ex), traceback.format_exc()))
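# For reference, a well-formed `impression` document, as consumed above, might look like
# the following (field values are hypothetical):
#
#     impression = {"external_user_id": "u_tec_1",
#                   "external_product_id": "p_tec_42",
#                   "created_at": pytz.utc.localize(dateutil.parser.parse("2015-03-01 12:00:00"))}
#     process_impression(session_context, impression)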
def gather_recommendation_scores(self, candidate_product_ids_by_algorithm, n_recommendations):
    scores_by_recommendation_candidate = {}
    if not config.is_anonymous(self.session_context.user_id):
        candidates = self.pick_candidate_products(candidate_product_ids_by_algorithm)
        for strength, template_id in self.session_context.user_templates:
            template_user_activities = self.session_context.recent_activities_by_template_user.get(template_id, [])
            products = {act["external_product_id"] for act in template_user_activities
                        if self.session_context.rating_by_activity[act["activity"]] >=
                        self.session_context.min_rating_recommendable_from_user}
            for product_id in products:
                if product_id in candidates:
                    score = scores_by_recommendation_candidate.get(product_id, [[0.0], product_id])
                    score_increment = self.calculate_score(strength, product_id, template_id)
                    if score_increment != 0:
                        score[0][0] += score_increment
                        scores_by_recommendation_candidate[product_id] = score
    return scores_by_recommendation_candidate.values()
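# The accumulation above builds, for each candidate, a [[score], product_id] cell whose
# first element aggregates contributions from every template user that consumed the
# candidate. A minimal sketch of that fold, with the template strength itself standing in
# for calculate_score (an assumption for illustration only):
def _accumulate_scores_sketch(strengths_and_products, candidates):
    scores = {}
    for strength, product_id in strengths_and_products:
        if product_id in candidates:
            score = scores.get(product_id, [[0.0], product_id])
            score[0][0] += strength  # stand-in for calculate_score(...)
            scores[product_id] = score
    return list(scores.values())

# _accumulate_scores_sketch([(0.9, "p1"), (0.7, "p1"), (0.5, "p2")], {"p1"})
# --> [[[1.6], 'p1']]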
def update_collaborative_filtering_strengths(session_context, activity):
    """ Updates user-user strengths and product-product strengths based on the informed activity.

        :param session_context: The session context.
        :param activity: The activity which triggered the updates.
    """
    user = activity["external_user_id"]
    is_anonymous = config.is_anonymous(user)
    if is_anonymous:
        return  # we do NOT want anonymous users to influence collaborative filtering strengths!

    product = activity["external_product_id"]
    act_type = activity["activity"]

    log.info("Processing strengths [user=%s, product=%s, type=%s]..." % (user, product, act_type))
    start = time()

    u_p_activity_summary = _get_current_user_product_summary(session_context, user, product, anonymous=False)

    first_impression_date = None
    if session_context.impressions_enabled:
        product_user_impressions_summary = session_context.data_proxy.fetch_impressions_summary(
            product_ids=[product],
            user_ids=[user],
            group_by_product=True,
            anonymous=False).get(product, {}).get(user, (0, None))
        first_impression_date = product_user_impressions_summary[1]

    log.info("Updating user-user strengths affected by user/product pair ({0}, {1})...".format(user, product))
    ut.update_templates(session_context, activity, u_p_activity_summary, first_impression_date,
                        should_lookup_activities_summary=False,
                        should_lookup_first_impression=False)

    log.info("Updating product-product strengths affected by user/product pair ({0}, {1})...".format(user, product))
    pt.update_templates(session_context, activity, u_p_activity_summary, first_impression_date,
                        should_lookup_activities_summary=False,
                        should_lookup_first_impression=False)

    log.info("---Done processing strengths [user=%s, product=%s, type=%s] (took %.6f seconds)"
             % (user, product, act_type, time() - start))
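# The nested .get() chain above assumes fetch_impressions_summary (with group_by_product=True)
# returns a map shaped like {product_id: {user_id: (impression_count, first_impression_date)}},
# so the (0, None) default stands for "no impressions recorded". The lookup in isolation:
def _first_impression_date_sketch(impressions_summary, product, user):
    """ Returns the first impression date for (user, product), or None when absent. """
    return impressions_summary.get(product, {}).get(user, (0, None))[1]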
def update_templates(session_context, new_activity, u_p_activities_summary=None, first_impression_date=None,
                     should_lookup_activities_summary=True, should_lookup_first_impression=True):
    """ Updates product x product strengths based on a single new activity.

        :param session_context: The session context.
        :param new_activity: a dict {"external_user_id": user_id,
                                     "external_product_id": product_id,
                                     "activity": activity_type,
                                     "created_at": datetime}.
        :param u_p_activities_summary: The summary of activities for that (user, product) pair, if any,
            in the form of a dict {"external_user_id": the user id,
                                   "external_product_id": the product id,
                                   "activity": the latest activity type,
                                   "created_at": the datetime of the latest activity,
                                   "pp_latest_type": the type of the latest activity to be processed
                                       for that pair during p-p strengths calculation,
                                   "pp_latest_date": the date of the latest activity to be processed
                                       for that pair during p-p strengths calculation}.
        :param first_impression_date: The date of the first impression, if any, the activity's user
            has received on the activity's product.
        :param should_lookup_activities_summary: If True and u_p_activities_summary is None,
            it queries the database for the previous activities summary.
        :param should_lookup_first_impression: If True and first_impression_date is None,
            it queries the database for the first impression.
    """
    log.info("Computing product-product strengths...")

    user = new_activity["external_user_id"]
    if config.is_anonymous(user):
        log.info("Anonymous users should not affect product-product strengths! Exiting now.")
        return

    product = new_activity["external_product_id"]
    activity_date = new_activity["created_at"]
    activity_type = new_activity["activity"]
    rating = session_context.rating_by_activity.get(activity_type)
    if rating is None:
        log.error("Unsupported activity type: %s" % activity_type)
        return

    suggested_cutoff_date = session_context.get_present_date() - dt.timedelta(
        session_context.product_product_strengths_window)
    latest_batch_info = session_context.data_proxy.fetch_latest_batch_info_product_product_strengths()
    if latest_batch_info is not None:
        latest_batch_timestamp = latest_batch_info["timestamp"]
        persisted_cutoff_date = latest_batch_info.get("cutoff_date")
        if persisted_cutoff_date is None:
            cutoff_date = suggested_cutoff_date
        else:
            cutoff_date = max(persisted_cutoff_date, suggested_cutoff_date)
    else:
        latest_batch_timestamp = None
        cutoff_date = suggested_cutoff_date

    if session_context.impressions_enabled and first_impression_date is None and should_lookup_first_impression:
        product_user_impressions_summary = session_context.data_proxy.fetch_impressions_summary(
            product_ids=[product],
            user_ids=[user],
            group_by_product=True,
            anonymous=False).get(product, {}).get(user, (0, None))
        first_impression_date = product_user_impressions_summary[1]

    if u_p_activities_summary is None and should_lookup_activities_summary:
        u_p_activities_summary_as_singleton_list = session_context.data_proxy.fetch_activity_summaries_by_user(
            user_ids=[user],
            product_ids=[product],
            indexed_fields_only=False,
            anonymous=False).get(user, [])
        if len(u_p_activities_summary_as_singleton_list) > 0:
            u_p_activities_summary = u_p_activities_summary_as_singleton_list[0]

    previous_activity_rating = 0
    previous_activity_date = None
    if u_p_activities_summary is not None:
        previous_activity_type = u_p_activities_summary.get("pp_latest_type")
        if previous_activity_type is not None:
            previous_activity_rating = session_context.rating_by_activity[previous_activity_type]
            previous_activity_date = u_p_activities_summary["pp_latest_date"]
    if previous_activity_rating == rating and not session_context.impressions_enabled:
        # repeating the latest activity --- there is nothing to do here
        # (if using impressions, must recalculate anyway to account for latest impressions)
        return

    numerator_diff = [0, 0]
    denominator_diff = 0

    remove_previous_activity_contribution = previous_activity_rating >= min(
        session_context.min_rating_conservative, session_context.min_rating_recommendable_from_product)
    if remove_previous_activity_contribution:
        if session_context.impressions_enabled:
            if first_impression_date is not None:
                # must remove former contribution if impression was already processed incrementally
                remove_previous_activity_contribution = previous_activity_date >= first_impression_date
                # must remove also if generation from scratch happened after the first impression
                if not remove_previous_activity_contribution and latest_batch_timestamp is not None:
                    remove_previous_activity_contribution = latest_batch_timestamp >= first_impression_date

    # Removes the former contribution of the previous commanding activity for that (user, product) pair.
    if remove_previous_activity_contribution:
        if previous_activity_rating >= session_context.min_rating_conservative:
            numerator_diff[CONSERVATIVE] -= 1
        if previous_activity_rating >= session_context.min_rating_aggressive:
            numerator_diff[AGGRESSIVE] -= 1
        if previous_activity_rating >= session_context.min_rating_recommendable_from_product:
            denominator_diff -= 1

    # Adds the contribution of this activity.
    if rating >= session_context.min_rating_conservative:
        numerator_diff[CONSERVATIVE] += 1
    if rating >= session_context.min_rating_aggressive:
        numerator_diff[AGGRESSIVE] += 1
    if rating >= session_context.min_rating_recommendable_from_product:
        denominator_diff += 1

    # Fetches all the products consumed by this user.
    products_by_rating = session_context.data_proxy.fetch_products_by_rating_by_user(
        user_ids=[user],
        min_date=cutoff_date,
        max_date=session_context.get_present_date())[0].get(user, {})

    # Includes the product of the current activity (remember: this activity might not have been saved yet)
    products_set = products_by_rating.get(rating, set())
    products_set.add(product)
    products_by_rating[rating] = products_set
    if u_p_activities_summary is not None:
        products_set = products_by_rating.get(previous_activity_rating, set())
        if product in products_set:
            products_set.remove(product)
        products_by_rating[previous_activity_rating] = products_set

    products_rated_conservatively_high = set()
    for r in range(session_context.min_rating_conservative, 6):
        products_rated_conservatively_high |= products_by_rating.get(r, set())
    products_rated_aggressively_high = set()
    for r in range(session_context.min_rating_aggressive, 6):
        products_rated_aggressively_high |= products_by_rating.get(r, set())
    products_rated_sufficiently_for_recommendation = set()
    for r in range(session_context.min_rating_recommendable_from_product, 6):
        products_rated_sufficiently_for_recommendation |= products_by_rating.get(r, set())

    numerators_with_product_as_template = None
    denominators_with_product_as_template = None
    numerators_with_product_as_base = None
    denominators_with_product_as_base = None

    strengths_map_for_insert = {}
    strengths_map_for_update = {}

    # This product as TEMPLATE

    # If this product has been consumed by this user without previous impressions, then it shall not contribute
    # for product-product strengths with this product as template.
    update_product_as_template = True
    if session_context.impressions_enabled:
        update_product_as_template = first_impression_date is not None

    # Existing pairs with product as template.
    if update_product_as_template and numerator_diff != [0, 0]:
        strength_operands_with_product_as_template = \
            session_context.data_proxy.fetch_product_product_strength_operands(templates=[product])
        numerators_with_product_as_template = strength_operands_with_product_as_template[0]
        denominators_with_product_as_template = strength_operands_with_product_as_template[1]

        for product_and_template, numerator_tuple in numerators_with_product_as_template.items():
            base_product = product_and_template[0]
            if base_product in products_rated_sufficiently_for_recommendation:
                new_numerator_tuple = [numerator_tuple[0] + numerator_diff[0],
                                       numerator_tuple[1] + numerator_diff[1]]
                numerators_with_product_as_template[product_and_template] = new_numerator_tuple
                update_doc = strengths_map_for_update.get(product_and_template, {})
                update_doc["nc"] = new_numerator_tuple[CONSERVATIVE]
                update_doc["na"] = new_numerator_tuple[AGGRESSIVE]
                strengths_map_for_update[product_and_template] = update_doc

    # New pairs with product as template.
    if update_product_as_template and numerator_diff[0] == 1:  # if this user has *just* rated this product high...
        new_base_products = []
        for base_product in products_rated_sufficiently_for_recommendation:
            if base_product != product and (base_product, product) not in numerators_with_product_as_template:
                new_base_products += [base_product]
                new_numerator_tuple = [1 if rating >= session_context.min_rating_conservative else 0,
                                       1 if rating >= session_context.min_rating_aggressive else 0]
                numerators_with_product_as_template[(base_product, product)] = new_numerator_tuple
                update_doc = strengths_map_for_insert.get((base_product, product), {})
                update_doc["nc"] = new_numerator_tuple[CONSERVATIVE]
                update_doc["na"] = new_numerator_tuple[AGGRESSIVE]
                strengths_map_for_insert[(base_product, product)] = update_doc

        users_by_rating_by_new_base_product = session_context.data_proxy.fetch_users_by_rating_by_product(
            product_ids=new_base_products,
            min_date=cutoff_date,
            max_date=session_context.get_present_date())[0]

        for new_base_product in new_base_products:
            source_users = set()
            for r in range(session_context.min_rating_recommendable_from_product, 6):
                source_users |= users_by_rating_by_new_base_product[new_base_product][r]
            if session_context.impressions_enabled:
                # Retrieves the intersection of the top-rated users of the base product
                # with the users with impressions for the template product
                source_users_with_impressions = \
                    session_context.data_proxy.fetch_users_with_impressions_by_product(
                        product_ids=[product],
                        user_ids=list(source_users),
                        anonymous=False).get(product, set())
                new_denominator = len(source_users_with_impressions)
            else:
                new_denominator = len(source_users)
            denominators_with_product_as_template[(new_base_product, product)] = new_denominator
            insert_doc = strengths_map_for_insert.get((new_base_product, product), {})
            insert_doc["denominator"] = new_denominator
            strengths_map_for_insert[(new_base_product, product)] = insert_doc

    # This product as BASE PRODUCT

    # Existing pairs with product as base product.
    if session_context.bidirectional_pp_strength_updates and denominator_diff != 0:
        product_product_strength_operands = \
            session_context.data_proxy.fetch_product_product_strength_operands(products=[product])
        numerators_with_product_as_base = product_product_strength_operands[0]
        denominators_with_product_as_base = product_product_strength_operands[1]

        for product_and_template in denominators_with_product_as_base:
            # updates the denominator...
            denominator = denominators_with_product_as_base[product_and_template]
            new_denominator = denominator + denominator_diff
            denominators_with_product_as_base[product_and_template] = new_denominator
            update_doc = strengths_map_for_update.get(product_and_template, {})
            update_doc["denominator"] = new_denominator
            strengths_map_for_update[product_and_template] = update_doc

            # ...and the numerator, in case the template product has been consumed by this user
            if product_and_template[1] in products_rated_conservatively_high \
                    and product_and_template in numerators_with_product_as_base:
                numerator_tuple = numerators_with_product_as_base[product_and_template]
                numerator_tuple[CONSERVATIVE] += denominator_diff
                if product_and_template[1] in products_rated_aggressively_high:
                    numerator_tuple[AGGRESSIVE] += denominator_diff
                numerators_with_product_as_base[product_and_template] = numerator_tuple
                update_doc = strengths_map_for_update.get(product_and_template, {})
                update_doc["nc"] = numerator_tuple[CONSERVATIVE]
                update_doc["na"] = numerator_tuple[AGGRESSIVE]
                strengths_map_for_update[product_and_template] = update_doc

    # New pairs with product as base product.
    if session_context.bidirectional_pp_strength_updates and denominator_diff == 1:
        # if this product has *just* been rated at least conservatively high...
        new_templates = []
        for template in products_rated_conservatively_high:
            if template != product and (product, template) not in denominators_with_product_as_base:  # new pair
                new_templates += [template]

        if len(new_templates) > 0:
            users_of_product_as_base = session_context.data_proxy.fetch_users_by_rating_by_product(
                product_ids=[product],
                min_date=cutoff_date,
                max_date=session_context.get_present_date())[0].get(product, {})

            # Includes the user of the current activity (remember again: this activity might not have been saved yet)
            users_set = users_of_product_as_base.get(rating, set())
            users_set.add(user)
            users_of_product_as_base[rating] = users_set

            recommending_users_of_product_as_base = set()
            for r in range(session_context.min_rating_recommendable_from_product, 6):
                recommending_users_of_product_as_base |= users_of_product_as_base.get(r, set())

            if session_context.impressions_enabled:
                user_impressions_by_template = session_context.data_proxy.fetch_impressions_summary(
                    product_ids=new_templates,
                    user_ids=list(recommending_users_of_product_as_base),
                    group_by_product=True,
                    anonymous=False)

            for new_template in new_templates:
                if session_context.impressions_enabled:
                    new_denominator = len(user_impressions_by_template.get(new_template, []))
                else:
                    new_denominator = len(recommending_users_of_product_as_base)
                denominators_with_product_as_base[(product, new_template)] = new_denominator
                insert_doc = strengths_map_for_insert.get((product, new_template), {})
                insert_doc["denominator"] = new_denominator
                strengths_map_for_insert[(product, new_template)] = insert_doc

            for new_template in new_templates:
                if new_template in products_rated_conservatively_high:
                    numerator_tuple = numerators_with_product_as_base.get((product, new_template), [0, 0])
                    numerator_tuple[CONSERVATIVE] += 1
                    if new_template in products_rated_aggressively_high:
                        numerator_tuple[AGGRESSIVE] += 1
                    numerators_with_product_as_base[(product, new_template)] = numerator_tuple
                    insert_doc = strengths_map_for_insert.get((product, new_template), {})
                    insert_doc["nc"] = numerator_tuple[CONSERVATIVE]
                    insert_doc["na"] = numerator_tuple[AGGRESSIVE]
                    strengths_map_for_insert[(product, new_template)] = insert_doc

    # Computes all affected strengths for UPDATE
    if len(strengths_map_for_update) > 0:
        _prepare_strengths_map(session_context, product, strengths_map_for_update,
                               numerators_with_product_as_base, denominators_with_product_as_base,
                               numerators_with_product_as_template, denominators_with_product_as_template)
        log.info("Saving product-product strengths (UPDATE)...")
        session_context.data_proxy.save_pp_strengths(strengths_map_for_update, upsert=True)
        log.info("[{0}] product-product strengths updated".format(len(strengths_map_for_update)))
    else:
        log.info("No old strengths to update.")

    # Computes all affected strengths for INSERT
    if len(strengths_map_for_insert) > 0:
        _prepare_strengths_map(session_context, product, strengths_map_for_insert,
                               numerators_with_product_as_base, denominators_with_product_as_base,
                               numerators_with_product_as_template, denominators_with_product_as_template)
        log.info("Saving product-product strengths (INSERT)...")
        session_context.data_proxy.save_pp_strengths(strengths_map_for_insert, upsert=False)
        log.info("[{0}] product-product strengths inserted".format(len(strengths_map_for_insert)))
    else:
        log.info("No new strengths to insert.")

    # Consolidates cached product templates
    log.info("Determining products whose templates must be consolidated...")
    products_to_consolidate = {product_and_template[0] for product_and_template in strengths_map_for_insert}
    updated_products = {product_and_template[0] for product_and_template in strengths_map_for_update}
    old_templates_map = session_context.data_proxy.fetch_product_templates(list(updated_products))

    for product_and_template, strength_doc in strengths_map_for_update.items():
        base_product = product_and_template[0]
        template_product = product_and_template[1]
        cutoff_strength = 0
        old_template_ids = set()
        old_templates = old_templates_map.get(base_product)
        if old_templates:
            old_templates_collaborative = old_templates[0]
            if old_templates_collaborative:
                cutoff_strength = old_templates_collaborative[-1][0]  # the strength of the weakest template
                if isinstance(cutoff_strength, str):
                    cutoff_strength = 0
                old_template_ids = {t[1] for t in old_templates_collaborative}
        if strength_doc["strength"] > cutoff_strength or \
                template_product in old_template_ids or \
                len(old_template_ids) < 3 * session_context.product_templates_count:
            products_to_consolidate.add(base_product)

    if session_context.should_consolidate_product_templates_on_the_fly:
        if len(products_to_consolidate) > 0:
            log.info("Consolidating templates of %d products..." % len(products_to_consolidate))
            consolidate_product_templates(session_context,
                                          products_list=list(products_to_consolidate),
                                          collaborative=True,
                                          tfidf=False)
        else:
            log.info("No products with templates to consolidate.")

    session_context.data_proxy.save_latest_activity_for_product_product_strengths(
        user, product, activity_type, activity_date)

    log.info("PP strengths and templates updated successfully.")
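# Throughout update_templates, each (base_product, template_product) pair carries a
# numerator tuple [nc, na] (counts under the conservative/aggressive rating thresholds)
# and a denominator (users who rated the base product high enough to recommend from it,
# intersected with impressions of the template when impressions are enabled). The final
# strength is computed in _prepare_strengths_map (not shown here); a plausible reading is
# a simple ratio, sketched below as an assumption rather than the actual formula:
def _strength_ratio_sketch(numerator_tuple, denominator, aggressive=False):
    # Index 0/1 mirrors the CONSERVATIVE/AGGRESSIVE positions used in the tuples above.
    if denominator == 0:
        return 0.0
    return numerator_tuple[1 if aggressive else 0] / denominator

# E.g. 3 of 4 eligible users consumed the template conservatively high:
# _strength_ratio_sketch([3, 1], 4) --> 0.75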
def __init__(self, session_context, user_id, context_filter=None, algorithm=None):
    super().__init__()

    if session_context is None:
        raise AttributeError("Session context cannot be None")
    if user_id is None:
        raise AttributeError("User ID cannot be None")
    if user_id.__hash__ is None:
        raise TypeError("User ID must be hashable")

    self.session_context = session_context
    """ The customer context associated with this user. """

    self.user_id = user_id
    """ The id of the target user. """

    self.is_anonymous = config.is_anonymous(self.user_id)
    """ Indicates whether the target user is 'anonymous' (identified only by her cookies or something). """

    self.filter = context_filter
    """ The user context filter, to be applied when filtering the possible recommendations. """

    self.algorithm = algorithm
    """ The algorithm that will be used for recommending products throughout this session. """

    self.specialist_recommenders = set()
    """ A set with the suffixes of all specialist recommenders employed by this session's
        recommendation algorithm.
    """

    self.user_impressions_summary = None
    """ A map {product: (count, first_impression_date)} summarizing the impressions of the target user. """

    self.user_templates = None
    """ The user templates of the target user. """

    self.recent_activities = []
    """ A list of {"external_product_id": product_id, "activity": activity_type, "created_at": datetime}
        dicts with all recent activities of the target user, in descending order of date.
    """

    self.recent_activities_by_product = None
    """ A dict {product: list of (date, activity_type) tuples in descending order of date}
        corresponding to the latest activities of the target user.
    """

    self.recent_activities_by_template_user = None
    """ A map {user_id: list of {"external_product_id": product_id, "activity": activity_type,
        "created_at": datetime} dicts with all recent activities of each user, in descending
        order of date}. The users whose recent activities are pre-fetched here are those in
        self.user_templates.
    """

    self.recent_activities_by_product_by_template_user = None
    """ A map {user_id: {product: list of (date, activity_type) tuples in descending order of date}}.
        The users whose recent activities are pre-fetched here are those in self.user_templates.
    """

    self.blocked_products = None
    """ A set of products which shall not be recommended (any further) owing to previous
        consumption activities of the target user.
    """

    self.filtered_products = None
    """ A set of product ids corresponding to products which pass the session filter.
        When the filtering strategy is BEFORE_SCORING, these products are determined during
        the session initialization. When the strategy is AFTER_SCORING, these products are
        determined a posteriori, when the recommender calls self.apply_pos_filter_to_products()
        passing the intended set of products to be filtered.
    """

    self.product_models = {}
    """ A map {product_id: ProductModel instance} with the product models for:
        - pre-filtered products, when the filtering strategy is BEFORE_SCORING;
        - products recently consumed by the target user;
        - products recently consumed by template users of the target user.
    """

    self.most_recently_consumed_products = None
    """ A list with the target user's recently consumed product ids, in descending order of consumption. """

    self._determine_specialist_recommenders()
    self.refresh()
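# For reference, the test suite obtains fully initialized instances of this session class via
# tests.init_session(user_id=..., custom_settings=..., algorithm=...), which wires up the
# session context first. Direct construction would look roughly like the sketch below (the
# class name is a stand-in, since the class header is not shown in this excerpt):
#
#     session = RecommendationSession(session_context, user_id="u_tec_1",
#                                     context_filter={"category": "Nonsense"},
#                                     algorithm=algorithm)
#     recommender = session.get_recommender()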
def populate_impressions(context):
    """ Creates dummy impressions for all user-product pairs. """
    date = context.get_present_date() - dt.timedelta(days=5)
    all_users = context.data_proxy.fetch_all_user_ids()
    for user in all_users:
        all_products = context.data_proxy.fetch_all_product_ids()  # It must be here, since generators can't be rewound.
        for product in all_products:
            context.data_proxy.increment_impression_summary(user, product, date,
                                                            anonymous=config.is_anonymous(user))
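# The inner fetch_all_product_ids() call is re-issued per user because the proxy may return
# a one-shot generator, as the comment above notes. A minimal illustration of the trap this avoids:
def _generator_rewind_trap():
    products = (p for p in ["p1", "p2"])
    first_pass = list(products)   # ['p1', 'p2']
    second_pass = list(products)  # [] -- the generator is exhausted and cannot be rewound
    return first_pass, second_pass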
def test_near_identical(self):
    """ Tests that two products considered 'near-identical' are not recommended at the same time
        (within the same page) when the filtering strategy is AFTER_SCORING.
    """
    target = "u_tec_1"
    id_twin_product_1 = "p_tec_TWIN_1"
    id_twin_product_2 = "p_tec_TWIN_2"

    date = self.session_context.get_present_date() - dt.timedelta(days=1)

    twin_product_1 = {"external_id": id_twin_product_1,
                      "language": "english",
                      "date": date,
                      "expiration_date": date + dt.timedelta(days=30),
                      "resources": {"title": "Whatever Gets You Through The Night"},
                      "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                      "category": "Nonsense"}
    twin_product_2 = {"external_id": id_twin_product_2,
                      "language": "english",
                      "date": date,
                      "expiration_date": date + dt.timedelta(days=30),
                      "resources": {"title": "Whatever Gets You Through This Night is Alright"},
                      "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                      "category": "Nonsense"}

    self.db_proxy.insert_product(twin_product_1)
    tasks.process_product(self.session_context, id_twin_product_1)
    self.db_proxy.insert_product(twin_product_2)
    tasks.process_product(self.session_context, id_twin_product_2)

    # makes it so that all users consume (and have impressions on) the twins, except for the target user
    users = self.db_proxy.fetch_all_user_ids()
    for user in users:
        if user != target:
            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_1,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)

            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_2,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)

            if self.session_context.impressions_enabled:
                is_anonymous = config.is_anonymous(user)
                self.db_proxy.increment_impression_summary(user, id_twin_product_1,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)
                self.db_proxy.increment_impression_summary(user, id_twin_product_2,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)

    ut.generate_templates(self.session_context)
    pt.generate_templates(self.session_context)
    pttfidf.generate_templates(self.session_context)  # Unfortunately we need to regenerate from scratch,
                                                      # otherwise the df's of the twins will be different.

    # First, we recommend WITHOUT near-identical filtering, to check that the twins really appear consecutively.
    custom_settings = {'near_identical_filter_field': None,
                       'near_identical_filter_threshold': None}
    session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
    session.refresh()
    recommender = session.get_recommender()

    if not recommender.is_hybrid():  # For hybrid recommenders, this check is meaningless.
        recommendations = recommender.recommend(100)
        twin_index = -1
        for idx, recommendation in enumerate(recommendations):
            if recommendation[1].startswith("p_tec_TWIN_"):
                if twin_index >= 0:
                    nose.tools.eq_(idx - twin_index, 1,
                                   "The two near-identical products should appear consecutively without filtering")
                    break
                twin_index = idx

    # Now we recommend WITH near-identical filtering
    recommendation_page_size = 5
    custom_settings = {'near_identical_filter_field': 'resources.title',
                       'near_identical_filter_threshold': 2,
                       'recommendations_page_size': recommendation_page_size}
    session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
    session.refresh()
    recommender = session.get_recommender()
    recommendations = recommender.recommend(100)

    # Sanity check
    recommended_products = {r[1] for r in recommendations}
    count_recommended_twins = len({id_twin_product_1, id_twin_product_2} & recommended_products)
    nose.tools.ok_(count_recommended_twins > 0,
                   "At least one of the twins should have been recommended, otherwise the test is meaningless")

    # Actual tests
    twin_index = -1 * recommendation_page_size - 1  # initial value, so the first twin passes the test
    for idx, recommendation in enumerate(recommendations):
        if recommendation[1].startswith("p_tec_TWIN_"):
            nose.tools.ok_(idx - twin_index > 1,  # it suffices to show that the twins have been separated
                           "Two near-identical products should not appear within the same recommendations page")
            twin_index = idx
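# The page-level property this test pins down can be stated directly: within every window of
# recommendation_page_size consecutive recommendations, at most one member of a near-identical
# group may appear. A hypothetical checker (not part of the recommender; the test above uses
# index distance as a sufficient proxy for the same property):
def _at_most_one_twin_per_page(recommended_ids, is_twin, page_size):
    for page_start in range(0, len(recommended_ids), page_size):
        page = recommended_ids[page_start:page_start + page_size]
        if sum(1 for product_id in page if is_twin(product_id)) > 1:
            return False
    return True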
def test_product_age_decay_exponential(self):
    """ Tests the effect of applying a product age decay factor based on an exponential function
        on recommendations. It applies to all recommendation heuristics.
    """
    target = "u_tec_1"
    id_twin_product_old = "p_tec_TWIN_OLD"
    id_twin_product_new = "p_tec_TWIN_NEW"

    # makes it so that the old twin is 2 days (the configured half-life) older
    old_date = self.session_context.get_present_date() - dt.timedelta(days=2)
    new_date = self.session_context.get_present_date()

    twin_product_old = {"external_id": id_twin_product_old,
                        "language": "english",
                        "date": old_date,
                        "expiration_date": old_date + dt.timedelta(days=30),
                        "resources": {"title": "Whatever Gets You Through The Night"},
                        "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                        "category": "Nonsense"}
    twin_product_new = {"external_id": id_twin_product_new,
                        "language": "english",
                        "date": new_date,
                        "expiration_date": new_date + dt.timedelta(days=30),
                        "resources": {"title": "Whatever Gets You Through The Night"},
                        "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                        "category": "Nonsense"}

    self.db_proxy.insert_product(twin_product_old)
    tasks.process_product(self.session_context, id_twin_product_old)
    self.db_proxy.insert_product(twin_product_new)
    tasks.process_product(self.session_context, id_twin_product_new)

    # makes it so that all users consume (and have impressions on) the twins, except for the target user
    users = self.db_proxy.fetch_all_user_ids()
    for user in users:
        if user != target:
            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_old,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)

            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_new,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)

            if self.session_context.impressions_enabled:
                is_anonymous = config.is_anonymous(user)
                self.db_proxy.increment_impression_summary(user, id_twin_product_old,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)
                self.db_proxy.increment_impression_summary(user, id_twin_product_new,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)

    ut.generate_templates(self.session_context)
    pt.generate_templates(self.session_context)
    pttfidf.generate_templates(self.session_context)  # Unfortunately we need to regenerate from scratch,
                                                      # otherwise the df's of the twins will be different.
    custom_settings = {'product_age_decay_function_name': 'exponential',
                       'product_age_decay_exponential_function_halflife': 2,
                       'near_identical_filter_field': None,
                       'near_identical_filter_threshold': None}  # Disables near-identical filtering

    session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
    session.refresh()
    recommender = session.get_recommender()

    # Determines the index of the first actual value in the score tuples produced by the recommender
    # (note that hybrid recommenders use the first position to indicate the algorithm number)
    if recommender.is_hybrid():
        start_index = 1
    else:
        start_index = 0

    recommendations = recommender.recommend(100)
    nose.tools.ok_(len(recommendations) > 0, "No recommendations were returned!")

    strength_old_twin = None
    strength_new_twin = None
    for rec in recommendations:
        if rec[1] == id_twin_product_old:
            strength_old_twin = rec[0]
        if rec[1] == id_twin_product_new:
            strength_new_twin = rec[0]

    for i in range(start_index, len(strength_old_twin)):
        old_strength_value = strength_old_twin[i]
        new_strength_value = strength_new_twin[i]
        nose.tools.ok_(abs(old_strength_value / new_strength_value - 0.5) < tests.FLOAT_DELTA,
                       "Incorrect application of the product age decay")
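# The final assertion checks the exponential age decay directly: with a half-life of 2 days,
# the twin that is exactly 2 days older must end up with half the strength. The decay factor
# presumably follows 2 ** (-age / half_life) -- an assumption consistent with this test, not
# a quote of the implementation:
def _exponential_age_decay_sketch(age_in_days, half_life_in_days):
    return 2 ** (-age_in_days / half_life_in_days)

# _exponential_age_decay_sketch(2, 2) --> 0.5, matching the 0.5 ratio asserted above.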