Пример #1
0
 def gather_candidate_products(self, n_recommendations):
     """ Collects, as recommendation candidates, every product found in the recent activities
         of the target user's templates. Anonymous target users get an empty candidate set.

         :param n_recommendations: Not used by this implementation.
         :returns: A dict {self.get_suffix(): set of candidate product ids}.
     """
     context = self.session_context
     candidates = set()
     if not config.is_anonymous(context.user_id):
         for _strength, template_user in context.user_templates:
             activities = context.recent_activities_by_template_user.get(template_user, {})
             candidates.update(act["external_product_id"] for act in activities)
     return {self.get_suffix(): candidates}
Пример #2
0
def update_summaries(session_context, activity):
    """ Performs the following updates:

        1) If the informed activity is the first popularity-defining activity for that (user, product) pair,
           increments the popularity of the given product, and sets the popularity flag of that
           (user, product) pair to True in the activities_summary.

        2) Resets the impressions count for that (user, product) pair.

        3) Updates the activities summary for that (user, product) pair.

        Activities whose type has no entry in session_context.rating_by_activity are logged as errors
        and ignored (no update is performed).

        :param session_context: The session context.
        :param activity: The activity being processed, i.e., the trigger to the updates. A dict with keys
            "external_user_id", "external_product_id", "activity" and "created_at".
    """
    user = activity["external_user_id"]
    is_anonymous = config.is_anonymous(user)
    product = activity["external_product_id"]
    act_type = activity["activity"]
    date = activity["created_at"]

    act_rating = session_context.rating_by_activity.get(act_type)
    if act_rating is None:
        # Without a rating, the threshold comparison below would raise TypeError (None is not orderable).
        log.error("Unsupported activity type: %s" % act_type)
        return

    log.info("Processing summaries [user=%s, product=%s, type=%s]..." % (user, product, act_type))
    start = time()

    u_p_activity_summary = _get_current_user_product_summary(session_context, user, product, anonymous=is_anonymous)

    # Product popularity (if need be)

    # A (user, product) pair contributes to popularity at most once: only when the rating is high enough
    # and the existing summary (if any) has not been flagged as having contributed already.
    should_increment_popularity = act_rating >= session_context.min_rating_recommendable_from_user
    if u_p_activity_summary is not None and u_p_activity_summary["contributed_for_popularity"]:
        should_increment_popularity = False
    log.info("Updating product {0} popularity...".format(product))
    session_context.data_proxy.update_product_popularity(
        product, date, should_increment_popularity)

    # Impressions summary

    log.info("Resetting impressions for user/product pair ({0}, {1})...".format(user, product))
    session_context.data_proxy.reset_impression_summary(
        user, product, anonymous=is_anonymous)

    # Activities summary

    log.info("Updating activities summary for user/product pair ({0}, {1})...".format(user, product))
    session_context.data_proxy.save_activity_summary(
        activity, set_popularity_flag=should_increment_popularity, anonymous=is_anonymous)

    log.info("---Done processing summaries [user=%s, product=%s, type=%s] (took %.6f seconds)"
             % (user, product, act_type, time() - start))
Пример #3
0
    def test_user_user_strengths_incremental_with_new_impressions_random(self):
        """ Checks that user x user strengths maintained incrementally match exactly
            those generated from scratch while new impressions keep arriving.

            Runs 100 rounds. Each round saves two impressions and one activity for a randomly
            chosen user and then compares the incremental result against a from-scratch generation.
            Does nothing unless tests.INCLUDE_RANDOM_TESTS is truthy.
        """
        if not tests.INCLUDE_RANDOM_TESTS:
            return

        all_users = list(self.db_proxy.fetch_all_user_ids())
        all_products = list(self.db_proxy.fetch_all_product_ids())

        for round_index in range(100):
            user = random.choice(all_users)
            is_anonymous = config.is_anonymous(user)

            print("user: %s" % user)

            # Two impressions for the chosen user, dated at an even offset (in seconds) from the base instant
            base_instant = pytz.utc.localize(dateutil.parser.parse("1988-11-06 6:00:00"))
            impression_date = base_instant + dt.timedelta(seconds=2 * round_index)
            product1 = random.choice(all_products)
            product2 = random.choice(all_products)
            for shown_product in (product1, product2):
                self.db_proxy.increment_impression_summary(user_id=user, product_id=shown_product,
                                                           date=impression_date, anonymous=is_anonymous)

            print("impressions --> %s, %s" % (product1, product2))

            # it is important to regenerate from scratch (with all new impressions)
            ut.generate_templates(self.session_context)

            # One activity for that same user, dated one second after the impressions
            product3 = random.choice(all_products)
            activity_type = random.choice(self.session_context.supported_activities)
            activity_date = base_instant + dt.timedelta(seconds=2 * round_index + 1)

            activity = {
                "external_user_id": user,
                "external_product_id": product3,
                "activity": activity_type,
                "created_at": activity_date,
            }

            print("activity --> " + str(activity))

            ut.update_templates(self.session_context, activity)
            tasks.update_summaries(self.session_context, activity)

            # With impressions enabled the comparison is restricted to the affected user
            target_users = None
            if self.session_context.impressions_enabled:
                target_users = [user]
            self.compare_incremental_vs_from_scratch(target_users=target_users)
Пример #4
0
def process_impression(session_context, impression):
    """ Registers one impression (a product shown to a user) in the impressions summary.

        Errors raised while saving are logged, together with the stack trace, and are not propagated.

        :param session_context: The session context.
        :param impression: A dict with keys "external_user_id", "external_product_id" and "created_at".
    """
    user = impression["external_user_id"]
    is_anonymous = config.is_anonymous(user)
    product = impression["external_product_id"]
    date = impression["created_at"]

    log.info("Processing impression [user=%s, product=%s]..." % (user, product))
    start = time()
    try:
        session_context.data_proxy.increment_impression_summary(user, product, date, anonymous=is_anonymous)
        log.info("---Done processing impression [user=%s, product=%s] (took %.6f seconds)"
                 % (user, product, time() - start))
    except Exception as ex:
        # Exceptions raised without arguments have an empty `args` tuple; indexing it would raise
        # IndexError from inside this handler and hide the original failure.
        message = ex.args[0] if ex.args else ex
        log.error("Error while processing impression [user=%s, product=%s], message=%s, stack_trace=%s"
                  % (user, product, message, traceback.format_exc()))
Пример #5
0
    def gather_recommendation_scores(self, candidate_product_ids_by_algorithm, n_recommendations):
        """ Scores candidate products according to the recent activities of the target user's templates.

            Only activities rated at least min_rating_recommendable_from_user are considered, and only
            products accepted by self.pick_candidate_products() are scored. The score of a product is the
            sum of the non-zero values returned by self.calculate_score() over all templates.
            Anonymous target users get no scores.

            :param candidate_product_ids_by_algorithm: Forwarded to self.pick_candidate_products().
            :param n_recommendations: Not used by this implementation.
            :returns: The values view of a dict {product_id: [[score], product_id]}.
        """
        context = self.session_context
        scores = {}

        if not config.is_anonymous(context.user_id):
            candidates = self.pick_candidate_products(candidate_product_ids_by_algorithm)

            for strength, template_id in context.user_templates:
                activities = context.recent_activities_by_template_user.get(template_id, {})

                # Products the template user rated high enough to be recommended from
                recommendable = set()
                for act in activities:
                    if context.rating_by_activity[act["activity"]] >= context.min_rating_recommendable_from_user:
                        recommendable.add(act["external_product_id"])

                for product_id in recommendable:
                    if product_id not in candidates:
                        continue
                    increment = self.calculate_score(strength, product_id, template_id)
                    if increment != 0:
                        # An entry is only created/stored once there is an actual contribution
                        entry = scores[product_id] if product_id in scores else [[0.0], product_id]
                        entry[0][0] += increment
                        scores[product_id] = entry

        return scores.values()
Пример #6
0
def update_collaborative_filtering_strengths(session_context, activity):
    """ Brings user-user and product-product strengths up to date with the informed activity.

        The current (user, product) activity summary and, when impressions are enabled, the date of the
        first impression are fetched once here and handed over to both template updaters, which are
        told not to look them up again. Activities of anonymous users are ignored.

        :param session_context: The session context.
        :param activity: The activity which triggered the updates.
    """
    user = activity["external_user_id"]
    if config.is_anonymous(user):
        # we do NOT want anonymous users to influence collaborative filtering strengths!
        return

    product = activity["external_product_id"]
    act_type = activity["activity"]

    log.info("Processing strengths [user=%s, product=%s, type=%s]..." % (user, product, act_type))
    started_at = time()

    summary = _get_current_user_product_summary(session_context, user, product, anonymous=False)

    first_impression_date = None
    if session_context.impressions_enabled:
        impressions_by_product = session_context.data_proxy.fetch_impressions_summary(
            product_ids=[product],
            user_ids=[user],
            group_by_product=True,
            anonymous=False)
        impressions_by_user = impressions_by_product.get(product, {})
        # second element of the per-user summary is the first impression date (None if no impressions)
        first_impression_date = impressions_by_user.get(user, (0, None))[1]

    # Everything the updaters could look up has already been fetched above
    no_lookups = {"should_lookup_activities_summary": False, "should_lookup_first_impression": False}

    log.info("Updating user-user strengths affected by user/product pair ({0}, {1})...".format(user, product))
    ut.update_templates(session_context, activity, summary, first_impression_date, **no_lookups)
    log.info("Updating product-product strengths affected by user/product pair ({0}, {1})...".format(user, product))
    pt.update_templates(session_context, activity, summary, first_impression_date, **no_lookups)

    log.info("---Done processing strengths [user=%s, product=%s, type=%s] (took %.6f seconds)"
             % (user, product, act_type, time() - started_at))
Пример #7
0
def update_templates(
    session_context,
    new_activity,
    u_p_activities_summary=None,
    first_impression_date=None,
    should_lookup_activities_summary=True,
    should_lookup_first_impression=True,
):
    """ Incrementally updates product x product strengths based on a single new activity.

        Outline:
          1) Exits early for anonymous users, for unsupported activity types and, when impressions are
             disabled, for an activity whose rating equals that of the previously processed one.
          2) Computes the numerator/denominator differences caused by replacing the previously processed
             activity of the (user, product) pair with the new one.
          3) Applies those differences to existing pairs and creates new pairs, first with the activity's
             product as template and then (only if session_context.bidirectional_pp_strength_updates)
             with the activity's product as base product.
          4) Saves the updated and the inserted strengths, determines which products need their cached
             templates consolidated (consolidating them if
             session_context.should_consolidate_product_templates_on_the_fly), and records this activity
             as the latest one processed for the pair.

        :param session_context: The session context.
        :param new_activity: a dict {"external_user_id": user_id,
                                     "external_product_id": product_id,
                                     "activity": activity_type,
                                     "created_at": datetime}.
        :param u_p_activities_summary: The summary of activities for that (user, product) pair, if any,
            in the form of a dict {"external_user_id": the user id,
                                   "external_product_id": the product id,
                                   "activity": the latest activity type,
                                   "created_at": the datetime of the latest activity,
                                   "pp_latest_type": the type of the latest activity to be processed for
                                                     that pair during p-p strengths calculation,
                                   "pp_latest_date": the date of the latest activity to be processed for
                                                     that pair during p-p strengths calculation}.
        :param first_impression_date: The date of the first impression, if any, the activity user has received on
            the activity product.
        :param should_lookup_activities_summary: If True and u_p_activities_summary is None, it queries the
            database for the activities summary of the (user, product) pair.
        :param should_lookup_first_impression: If True and first_impression_date is None (and impressions are
            enabled), it queries the database for the first impression.
        :returns: None.
    """
    log.info("Computing product-product strengths...")

    user = new_activity["external_user_id"]
    if config.is_anonymous(user):
        log.info("Anonymous users should not affect product-product strengths! Exiting now.")
        return

    product = new_activity["external_product_id"]
    activity_date = new_activity["created_at"]
    activity_type = new_activity["activity"]
    rating = session_context.rating_by_activity.get(activity_type)
    if rating is None:
        log.error("Unsupported activity type: %s" % activity_type)
        return

    # The window is passed as timedelta's first positional argument, i.e. it is interpreted as days.
    suggested_cutoff_date = session_context.get_present_date() - dt.timedelta(
        session_context.product_product_strengths_window
    )
    # The effective cutoff is the later of the suggested one and the one persisted with the latest batch
    # (when there is such a batch and it carries a cutoff date).
    latest_batch_info = session_context.data_proxy.fetch_latest_batch_info_product_product_strengths()
    if latest_batch_info is not None:
        latest_batch_timestamp = latest_batch_info["timestamp"]
        persisted_cutoff_date = latest_batch_info.get("cutoff_date")
        if persisted_cutoff_date is None:
            cutoff_date = suggested_cutoff_date
        else:
            cutoff_date = max(persisted_cutoff_date, suggested_cutoff_date)
    else:
        latest_batch_timestamp = None
        cutoff_date = suggested_cutoff_date

    if session_context.impressions_enabled and first_impression_date is None and should_lookup_first_impression:
        product_user_impressions_summary = (
            session_context.data_proxy.fetch_impressions_summary(
                product_ids=[product], user_ids=[user], group_by_product=True, anonymous=False
            )
            .get(product, {})
            .get(user, (0, None))
        )
        first_impression_date = product_user_impressions_summary[1]

    if u_p_activities_summary is None and should_lookup_activities_summary:
        u_p_activities_summary_as_singleton_list = session_context.data_proxy.fetch_activity_summaries_by_user(
            user_ids=[user], product_ids=[product], indexed_fields_only=False, anonymous=False
        ).get(user, [])
        if len(u_p_activities_summary_as_singleton_list) > 0:
            u_p_activities_summary = u_p_activities_summary_as_singleton_list[0]

    # A rating of 0 stands for "no previously processed activity for this pair".
    # NOTE: previous_activity_date is only bound when a previous activity type exists.
    previous_activity_rating = 0
    if u_p_activities_summary is not None:
        previous_activity_type = u_p_activities_summary.get("pp_latest_type")
        if previous_activity_type is not None:
            previous_activity_rating = session_context.rating_by_activity[previous_activity_type]
            previous_activity_date = u_p_activities_summary["pp_latest_date"]

    if previous_activity_rating == rating and not session_context.impressions_enabled:
        return  # repeating the latest activity --- there is nothing to do here
        # (if using impressions, must recalculate anyway to account for latest impressions)

    # Differences to be applied to the stored operands; numerator_diff is indexed by the
    # CONSERVATIVE / AGGRESSIVE module constants.
    numerator_diff = [0, 0]
    denominator_diff = 0

    remove_previous_activity_contribution = previous_activity_rating >= min(
        session_context.min_rating_conservative, session_context.min_rating_recommendable_from_product
    )
    if remove_previous_activity_contribution:
        if session_context.impressions_enabled:
            if first_impression_date is not None:
                # must remove former contribution if impression was already processed incrementally
                remove_previous_activity_contribution = previous_activity_date >= first_impression_date
                # must remove also if generation from scratch happened after the first impression
                if not remove_previous_activity_contribution and latest_batch_timestamp is not None:
                    remove_previous_activity_contribution = latest_batch_timestamp >= first_impression_date

    # Removes the former contribution of the previous commanding activity for that (user, product) pair.
    if remove_previous_activity_contribution:
        if previous_activity_rating >= session_context.min_rating_conservative:
            numerator_diff[CONSERVATIVE] -= 1
        if previous_activity_rating >= session_context.min_rating_aggressive:
            numerator_diff[AGGRESSIVE] -= 1
        if previous_activity_rating >= session_context.min_rating_recommendable_from_product:
            denominator_diff -= 1

    # Adds the contribution of this activity.
    if rating >= session_context.min_rating_conservative:
        numerator_diff[CONSERVATIVE] += 1
    if rating >= session_context.min_rating_aggressive:
        numerator_diff[AGGRESSIVE] += 1
    if rating >= session_context.min_rating_recommendable_from_product:
        denominator_diff += 1

    # Fetches all the products consumed by this user.
    products_by_rating = session_context.data_proxy.fetch_products_by_rating_by_user(
        user_ids=[user], min_date=cutoff_date, max_date=session_context.get_present_date()
    )[0].get(user, {})

    # Includes the product of the current activity (remember: this activity might not have been saved yet)
    products_set = products_by_rating.get(rating, set())
    products_set.add(product)
    products_by_rating[rating] = products_set
    # NOTE(review): when the previous rating equals the new one, the product added just above is removed
    # again by the block below --- confirm this is intended.
    if u_p_activities_summary is not None:
        products_set = products_by_rating.get(previous_activity_rating, set())
        if product in products_set:
            products_set.remove(product)
            products_by_rating[previous_activity_rating] = products_set

    # The upper bound 6 is hard-coded in the three loops below
    # (presumably ratings are integers no greater than 5 --- TODO confirm).
    products_rated_conservatively_high = set()
    for r in range(session_context.min_rating_conservative, 6):
        products_rated_conservatively_high |= products_by_rating.get(r, set())
    products_rated_aggressively_high = set()
    for r in range(session_context.min_rating_aggressive, 6):
        products_rated_aggressively_high |= products_by_rating.get(r, set())
    products_rated_sufficiently_for_recommendation = set()
    for r in range(session_context.min_rating_recommendable_from_product, 6):
        products_rated_sufficiently_for_recommendation |= products_by_rating.get(r, set())

    # Operand maps stay None unless the corresponding section below fetches them.
    numerators_with_product_as_template = None
    denominators_with_product_as_template = None
    numerators_with_product_as_base = None
    denominators_with_product_as_base = None
    # Both maps are keyed by (base_product, template_product) tuples.
    strengths_map_for_insert = {}
    strengths_map_for_update = {}

    # This product as TEMPLATE

    # If this product has been consumed by this user without previous impressions, then it shall not contribute
    # for product-product strengths with this product as template.
    update_product_as_template = True
    if session_context.impressions_enabled:
        update_product_as_template = first_impression_date is not None

    # Existing pairs with product as template.

    if update_product_as_template and numerator_diff != [0, 0]:

        strength_operands_with_product_as_template = session_context.data_proxy.fetch_product_product_strength_operands(
            templates=[product]
        )
        numerators_with_product_as_template = strength_operands_with_product_as_template[0]
        denominators_with_product_as_template = strength_operands_with_product_as_template[1]

        for product_and_template, numerator_tuple in numerators_with_product_as_template.items():
            base_product = product_and_template[0]
            if base_product in products_rated_sufficiently_for_recommendation:
                new_numerator_tuple = [numerator_tuple[0] + numerator_diff[0], numerator_tuple[1] + numerator_diff[1]]
                numerators_with_product_as_template[product_and_template] = new_numerator_tuple
                update_doc = strengths_map_for_update.get(product_and_template, {})
                update_doc["nc"] = new_numerator_tuple[CONSERVATIVE]
                update_doc["na"] = new_numerator_tuple[AGGRESSIVE]
                strengths_map_for_update[product_and_template] = update_doc

    # New pairs with product as template.

    # numerator_diff[0] == 1 implies numerator_diff != [0, 0], so the operand maps were fetched just above.
    if update_product_as_template and numerator_diff[0] == 1:  # if this user has *just* rated this product high...
        new_base_products = []
        for base_product in products_rated_sufficiently_for_recommendation:
            if base_product != product and (base_product, product) not in numerators_with_product_as_template:
                new_base_products += [base_product]
                new_numerator_tuple = [
                    1 if rating >= session_context.min_rating_conservative else 0,
                    1 if rating >= session_context.min_rating_aggressive else 0,
                ]
                numerators_with_product_as_template[(base_product, product)] = new_numerator_tuple
                update_doc = strengths_map_for_insert.get((base_product, product), {})
                update_doc["nc"] = new_numerator_tuple[CONSERVATIVE]
                update_doc["na"] = new_numerator_tuple[AGGRESSIVE]
                strengths_map_for_insert[(base_product, product)] = update_doc

        users_by_rating_by_new_base_product = session_context.data_proxy.fetch_users_by_rating_by_product(
            product_ids=new_base_products, min_date=cutoff_date, max_date=session_context.get_present_date()
        )[0]

        for new_base_product in new_base_products:
            source_users = set()
            for r in range(session_context.min_rating_recommendable_from_product, 6):
                # NOTE(review): direct indexing raises KeyError unless the proxy returns mappings that
                # supply defaults for missing products/ratings --- confirm.
                source_users |= users_by_rating_by_new_base_product[new_base_product][r]
            if session_context.impressions_enabled:
                # Retrieves the intersection of the top-rated users of the base product
                # with the users with impressions for the template product
                source_users_with_impressions = session_context.data_proxy.fetch_users_with_impressions_by_product(
                    product_ids=[product], user_ids=list(source_users), anonymous=False
                ).get(product, set())
                new_denominator = len(source_users_with_impressions)
            else:
                new_denominator = len(source_users)
            denominators_with_product_as_template[(new_base_product, product)] = new_denominator
            insert_doc = strengths_map_for_insert.get((new_base_product, product), {})
            insert_doc["denominator"] = new_denominator
            strengths_map_for_insert[(new_base_product, product)] = insert_doc

    # This product as BASE PRODUCT

    # Existing pairs with product as base product.

    if session_context.bidirectional_pp_strength_updates and denominator_diff != 0:
        product_product_strength_operands = session_context.data_proxy.fetch_product_product_strength_operands(
            products=[product]
        )
        numerators_with_product_as_base = product_product_strength_operands[0]
        denominators_with_product_as_base = product_product_strength_operands[1]

        for product_and_template in denominators_with_product_as_base:
            # updates the denominator...
            denominator = denominators_with_product_as_base[product_and_template]
            new_denominator = denominator + denominator_diff
            denominators_with_product_as_base[product_and_template] = new_denominator
            update_doc = strengths_map_for_update.get(product_and_template, {})
            update_doc["denominator"] = new_denominator
            strengths_map_for_update[product_and_template] = update_doc

            # ...and the numerator, in case the template product has been consumed by this user
            if (
                product_and_template[1] in products_rated_conservatively_high
                and product_and_template in numerators_with_product_as_base
            ):
                numerator_tuple = numerators_with_product_as_base[product_and_template]
                numerator_tuple[CONSERVATIVE] += denominator_diff
                if product_and_template[1] in products_rated_aggressively_high:
                    numerator_tuple[AGGRESSIVE] += denominator_diff
                numerators_with_product_as_base[product_and_template] = numerator_tuple
                update_doc = strengths_map_for_update.get(product_and_template, {})
                update_doc["nc"] = numerator_tuple[CONSERVATIVE]
                update_doc["na"] = numerator_tuple[AGGRESSIVE]
                strengths_map_for_update[product_and_template] = update_doc

    # New pairs with product as base product.

    # denominator_diff == 1 implies denominator_diff != 0, so the operand maps were fetched just above.
    if session_context.bidirectional_pp_strength_updates and denominator_diff == 1:
        # if this product has *just* been rated at least conservatively high...
        new_templates = []
        for template in products_rated_conservatively_high:
            if template != product and (product, template) not in denominators_with_product_as_base:  # new pair
                new_templates += [template]

        if len(new_templates) > 0:
            users_of_product_as_base = session_context.data_proxy.fetch_users_by_rating_by_product(
                product_ids=[product], min_date=cutoff_date, max_date=session_context.get_present_date()
            )[0].get(product, {})
            # Includes the user of the current activity (remember again: this activity might not have been saved yet)
            users_set = users_of_product_as_base.get(rating, set())
            users_set.add(user)
            users_of_product_as_base[rating] = users_set

            recommending_users_of_product_as_base = set()
            for r in range(session_context.min_rating_recommendable_from_product, 6):
                recommending_users_of_product_as_base |= users_of_product_as_base.get(r, set())

            if session_context.impressions_enabled:
                user_impressions_by_template = session_context.data_proxy.fetch_impressions_summary(
                    product_ids=new_templates,
                    user_ids=list(recommending_users_of_product_as_base),
                    group_by_product=True,
                    anonymous=False,
                )

            for new_template in new_templates:
                if session_context.impressions_enabled:
                    # the denominator is the number of entries the impressions summary holds for the template
                    new_denominator = len(user_impressions_by_template.get(new_template, []))
                else:
                    new_denominator = len(recommending_users_of_product_as_base)
                denominators_with_product_as_base[(product, new_template)] = new_denominator
                insert_doc = strengths_map_for_insert.get((product, new_template), {})
                insert_doc["denominator"] = new_denominator
                strengths_map_for_insert[(product, new_template)] = insert_doc

            for new_template in new_templates:
                if new_template in products_rated_conservatively_high:
                    numerator_tuple = numerators_with_product_as_base.get((product, new_template), [0, 0])
                    numerator_tuple[CONSERVATIVE] += 1
                    if new_template in products_rated_aggressively_high:
                        numerator_tuple[AGGRESSIVE] += 1
                    numerators_with_product_as_base[(product, new_template)] = numerator_tuple
                    insert_doc = strengths_map_for_insert.get((product, new_template), {})
                    insert_doc["nc"] = numerator_tuple[CONSERVATIVE]
                    insert_doc["na"] = numerator_tuple[AGGRESSIVE]
                    strengths_map_for_insert[(product, new_template)] = insert_doc

    # Computes all affected strengths for UPDATE

    if len(strengths_map_for_update) > 0:
        _prepare_strengths_map(
            session_context,
            product,
            strengths_map_for_update,
            numerators_with_product_as_base,
            denominators_with_product_as_base,
            numerators_with_product_as_template,
            denominators_with_product_as_template,
        )

        log.info("Saving product-product strengths (UPDATE)...")
        session_context.data_proxy.save_pp_strengths(strengths_map_for_update, upsert=True)
        log.info("[{0}] product-product strengths updated".format(len(strengths_map_for_update)))
    else:
        log.info("No old strengths to update.")

    # Computes all affected strengths for INSERT

    if len(strengths_map_for_insert) > 0:
        _prepare_strengths_map(
            session_context,
            product,
            strengths_map_for_insert,
            numerators_with_product_as_base,
            denominators_with_product_as_base,
            numerators_with_product_as_template,
            denominators_with_product_as_template,
        )

        log.info("Saving product-product strengths (INSERT)...")
        session_context.data_proxy.save_pp_strengths(strengths_map_for_insert, upsert=False)
        log.info("[{0}] product-product strengths inserted".format(len(strengths_map_for_insert)))
    else:
        log.info("No new strengths to insert.")

    # Consolidates cached product templates

    log.info("Determining products whose templates must be consolidated...")
    # Every base product that gained a new pair is consolidated unconditionally.
    products_to_consolidate = {product_and_template[0] for product_and_template in strengths_map_for_insert}

    # A base product with updated pairs is consolidated only if an updated strength beats the weakest cached
    # template, or the updated template is already cached, or the cache holds fewer than
    # 3 * product_templates_count templates.
    # (the "strength" key read below is presumably filled in by _prepare_strengths_map --- TODO confirm)
    updated_products = {product_and_template[0] for product_and_template in strengths_map_for_update}
    old_templates_map = session_context.data_proxy.fetch_product_templates(list(updated_products))
    for product_and_template, strength_doc in strengths_map_for_update.items():
        base_product = product_and_template[0]
        template_product = product_and_template[1]
        cutoff_strength = 0
        old_template_ids = set()
        old_templates = old_templates_map.get(base_product)
        if old_templates:
            old_templates_collaborative = old_templates[0]
            if old_templates_collaborative:
                cutoff_strength = old_templates_collaborative[-1][0]  # the strength of the weakest template
                if isinstance(cutoff_strength, str):
                    cutoff_strength = 0
                old_template_ids = {t[1] for t in old_templates_collaborative}
        if (
            strength_doc["strength"] > cutoff_strength
            or template_product in old_template_ids
            or len(old_template_ids) < 3 * session_context.product_templates_count
        ):
            products_to_consolidate.add(base_product)

    if session_context.should_consolidate_product_templates_on_the_fly:
        if len(products_to_consolidate) > 0:
            log.info("Consolidating templates of %d products..." % len(products_to_consolidate))
            consolidate_product_templates(
                session_context, products_list=list(products_to_consolidate), collaborative=True, tfidf=False
            )
        else:
            log.info("No products with templates to consolidate.")

    # Records this activity as the latest one processed for the (user, product) pair.
    session_context.data_proxy.save_latest_activity_for_product_product_strengths(
        user, product, activity_type, activity_date
    )

    log.info("PP strengths and templates updated successfully.")
Пример #8
0
    def __init__(self, session_context, user_id, context_filter=None, algorithm=None):
        """ Validates the arguments, initializes all session attributes, determines the specialist
            recommenders and finally refreshes the session.

            :param session_context: The customer context associated to this user. Must not be None.
            :param user_id: The id of the target user. Must be hashable and not None.
            :param context_filter: The user context filter, applied when filtering possible recommendations.
            :param algorithm: The algorithm used for recommending products throughout this session.
            :raises AttributeError: If session_context or user_id is None.
            :raises TypeError: If user_id is not hashable.
        """
        super().__init__()

        # --- Argument validation ---
        if session_context is None:
            raise AttributeError("Session context cannot be None")
        if user_id is None:
            raise AttributeError("User ID cannot be None")
        if user_id.__hash__ is None:
            raise TypeError("User ID must be hashable")

        # The customer context associated to this user.
        self.session_context = session_context

        # The id of the target user.
        self.user_id = user_id

        # Indicates whether the target user is 'anonymous' (identified only by her cookies or something).
        self.is_anonymous = config.is_anonymous(self.user_id)

        # The user context filter, to be applied when filtering the possible recommendations.
        self.filter = context_filter

        # The algorithm that will be used for recommending products throughout this session.
        self.algorithm = algorithm

        # A set with the suffixes of all specialist recommenders employed by this session's
        # recommendation algorithm.
        self.specialist_recommenders = set()

        # A map {product: (count, first_impression_date)} summarizing the impressions of the target user.
        self.user_impressions_summary = None

        # The user templates of the target user.
        self.user_templates = None

        # A list of {"external_product_id": product_id, "activity": activity_type, "created_at": datetime}
        # dicts with all recent activities of the target user in descending order of dates.
        self.recent_activities = []

        # A dict {product: list of (date, activity_type) tuples in descending order of dates}
        # corresponding to the latest activities of the target user.
        self.recent_activities_by_product = None

        # A map {user_id: list of {"external_product_id": product_id, "activity": activity_type,
        # "created_at": datetime} dicts with all recent activities of each user in descending order of dates}.
        # The users whose recent activities are pre-fetched here are those in self.user_templates.
        self.recent_activities_by_template_user = None

        # A map {user_id: {product: list of (date, activity_type) tuples in descending order of dates}}.
        # The users whose recent activities are pre-fetched here are those in self.user_templates.
        self.recent_activities_by_product_by_template_user = None

        # A set of products which shall not be recommended (any further)
        # owing to previous consumption activities of the target.
        self.blocked_products = None

        # A set of product ids corresponding to products which pass the session filter.
        # When the filtering strategy is BEFORE_SCORING, these products are determined during the session
        # initialization. When the strategy is AFTER_SCORING, these products are determined a posteriori,
        # when the recommender calls self.apply_pos_filter_to_products() passing the intended set of
        # products to be filtered.
        self.filtered_products = None

        # A map {product_id: ProductModel instance} with the product models for:
        # - pre-filtered products, when the filtering strategy is BEFORE_SCORING;
        # - products recently consumed by the target user;
        # - products recently consumed by template users of the target user.
        self.product_models = {}

        # A list with the target user's recently consumed product ids, in descending order of consumption.
        self.most_recently_consumed_products = None

        # --- Derived state (must run after all attributes above exist) ---
        self._determine_specialist_recommenders()
        self.refresh()
Пример #9
0
def populate_impressions(context):
    """ Registers one dummy impression for every (user, product) combination.

        All impressions are dated five days before the present date of the given context.

        :param context: The session context, providing the present date and the data proxy.
    """
    impression_date = context.get_present_date() - dt.timedelta(days=5)
    proxy = context.data_proxy

    for user_id in proxy.fetch_all_user_ids():
        # The product ids are fetched anew for each user on purpose: the proxy may hand back
        # a generator, which cannot be rewound once it has been consumed.
        for product_id in proxy.fetch_all_product_ids():
            proxy.increment_impression_summary(user_id, product_id, impression_date,
                                               anonymous=config.is_anonymous(user_id))
Пример #10
0
    def test_near_identical(self):
        """ Checks that products deemed 'near-identical' are kept apart in the recommendations
            (i.e., not shown within the same page) when the filtering strategy is AFTER_SCORING.
        """
        target = "u_tec_1"
        twin_prefix = "p_tec_TWIN_"

        # The twins differ only in their ids and (slightly) in their titles.
        twins = (("p_tec_TWIN_1", "Whatever Gets You Through The Night"),
                 ("p_tec_TWIN_2", "Whatever Gets You Through This Night is Alright"))
        twin_ids = [twin_id for twin_id, _ in twins]

        creation_date = self.session_context.get_present_date() - dt.timedelta(days=1)

        for twin_id, twin_title in twins:
            twin_product = {"external_id": twin_id,
                            "language": "english",
                            "date": creation_date,
                            "expiration_date": creation_date + dt.timedelta(days=30),
                            "resources": {"title": twin_title},
                            "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                            "category": "Nonsense"}
            self.db_proxy.insert_product(twin_product)
            tasks.process_product(self.session_context, twin_id)

        # Every user but the target consumes (and gets impressions on) both twins.
        for user in self.db_proxy.fetch_all_user_ids():
            if user == target:
                continue

            for twin_id in twin_ids:
                tasks.update_summaries(self.session_context,
                                       {"external_user_id": user,
                                        "external_product_id": twin_id,
                                        "activity": "buy",
                                        "created_at": self.session_context.get_present_date()})

            if not self.session_context.impressions_enabled:
                continue

            is_anonymous = config.is_anonymous(user)
            for twin_id in twin_ids:
                self.db_proxy.increment_impression_summary(user,
                                                           twin_id,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)

        ut.generate_templates(self.session_context)
        pt.generate_templates(self.session_context)
        # The tf-idf templates have to be rebuilt from scratch,
        # otherwise the df's of the twins will be different.
        pttfidf.generate_templates(self.session_context)

        # Stage 1: recommendations WITHOUT near-identical filtering, where the twins
        # are expected to show up side by side.

        unfiltered_settings = {'near_identical_filter_field': None,
                               'near_identical_filter_threshold': None}
        session = tests.init_session(user_id=target, custom_settings=unfiltered_settings,
                                     algorithm=self.algorithm)
        session.refresh()
        recommender = session.get_recommender()

        # The adjacency check is skipped for hybrid recommenders, for which it is meaningless.
        if not recommender.is_hybrid():
            first_twin_position = None
            for position, recommendation in enumerate(recommender.recommend(100)):
                if not recommendation[1].startswith(twin_prefix):
                    continue
                if first_twin_position is None:
                    first_twin_position = position
                    continue
                nose.tools.eq_(position - first_twin_position, 1,
                               "The two near-identical products should appear consecutively without filtering")
                break

        # Stage 2: recommendations WITH near-identical filtering.

        page_size = 5
        filtered_settings = {'near_identical_filter_field': 'resources.title',
                             'near_identical_filter_threshold': 2,
                             'recommendations_page_size': page_size}
        session = tests.init_session(user_id=target, custom_settings=filtered_settings,
                                     algorithm=self.algorithm)
        session.refresh()
        recommendations = session.get_recommender().recommend(100)

        # Sanity check: the test proves nothing if no twin has been recommended at all.
        recommended_ids = {recommendation[1] for recommendation in recommendations}
        recommended_twins = recommended_ids.intersection(twin_ids)
        nose.tools.ok_(len(recommended_twins) > 0,
                       "At least one of the twins should have been recommended, otherwise the test is meaningless")

        # Actual test. The starting position lies far enough behind the list for the first twin to pass.
        last_twin_position = -page_size - 1
        for position, recommendation in enumerate(recommendations):
            if recommendation[1].startswith(twin_prefix):
                # It suffices to show that the twins have been separated.
                nose.tools.ok_(position - last_twin_position > 1,
                               "Two near-identical products should not appear within the same recommendations page")
                last_twin_position = position
Пример #11
0
    def test_product_age_decay_exponential(self):
        """ Tests the effect of applying a product age decay factor based on an exponential
            function on recommendations. It applies to all recommendation heuristics.

            Two products which differ only in their ids and dates are consumed by every user other
            than the target. The old twin is one configured half-life (2 days) older than the new one,
            hence each of its score components is expected to be half of the new twin's.
        """
        target = "u_tec_1"

        id_twin_product_old = "p_tec_TWIN_OLD"
        id_twin_product_new = "p_tec_TWIN_NEW"

        # makes it so that the oldest twin is 2 days (the configured half life) older
        old_date = self.session_context.get_present_date() - dt.timedelta(days=2)
        new_date = self.session_context.get_present_date()

        twin_product_old = {"external_id": id_twin_product_old,
                            "language": "english",
                            "date": old_date,
                            "expiration_date": old_date + dt.timedelta(days=30),
                            "resources": {"title": "Whatever Gets You Through The Night"},
                            "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                            "category": "Nonsense"}

        twin_product_new = {"external_id": id_twin_product_new,
                            "language": "english",
                            "date": new_date,
                            "expiration_date": new_date + dt.timedelta(days=30),
                            "resources": {"title": "Whatever Gets You Through The Night"},
                            "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                            "category": "Nonsense"}

        self.db_proxy.insert_product(twin_product_old)
        tasks.process_product(self.session_context, id_twin_product_old)
        self.db_proxy.insert_product(twin_product_new)
        tasks.process_product(self.session_context, id_twin_product_new)

        twin_ids = (id_twin_product_old, id_twin_product_new)

        # makes it so that all users consume (and have impressions on) the twins, except for the target user
        users = self.db_proxy.fetch_all_user_ids()
        for user in users:
            if user == target:
                continue

            for twin_id in twin_ids:
                activity = {"external_user_id": user,
                            "external_product_id": twin_id,
                            "activity": "buy",
                            "created_at": self.session_context.get_present_date()}
                tasks.update_summaries(self.session_context, activity)

            if self.session_context.impressions_enabled:
                is_anonymous = config.is_anonymous(user)
                for twin_id in twin_ids:
                    self.db_proxy.increment_impression_summary(user,
                                                               twin_id,
                                                               date=self.session_context.get_present_date(),
                                                               anonymous=is_anonymous)

        ut.generate_templates(self.session_context)
        pt.generate_templates(self.session_context)
        pttfidf.generate_templates(self.session_context)  # Unfortunately we need to regenerate from scratch,
                                                          # otherwise the df's of the twins will be different.

        # Applies an exponential age decay whose half life matches the age gap between the twins,
        # and disables near-identical filtering (both filter settings are None), so that
        # both twins may be recommended.
        custom_settings = {'product_age_decay_function_name': 'exponential',
                           'product_age_decay_exponential_function_halflife': 2,
                           'near_identical_filter_field': None, 'near_identical_filter_threshold': None}

        session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
        session.refresh()

        recommender = session.get_recommender()

        # Determines the index of the first actual value in the score tuples
        # produced by the recommender (note that hybrid recommenders use the first
        # position to indicate the algorithm number)
        if recommender.is_hybrid():
            start_index = 1
        else:
            start_index = 0

        recommendations = recommender.recommend(100)
        nose.tools.ok_(len(recommendations) > 0, "No recommendations were returned!")

        strength_old_twin = None
        strength_new_twin = None

        for rec in recommendations:
            if rec[1] == id_twin_product_old:
                strength_old_twin = rec[0]
            if rec[1] == id_twin_product_new:
                strength_new_twin = rec[0]

        # Without both twins among the recommendations there is nothing to compare; fail with a
        # clear message rather than with a TypeError raised when handling a None score below.
        nose.tools.ok_(strength_old_twin is not None,
                       "The old twin should have been recommended, otherwise the test is meaningless")
        nose.tools.ok_(strength_new_twin is not None,
                       "The new twin should have been recommended, otherwise the test is meaningless")

        # One half life of age difference should halve every score component of the old twin.
        for i in range(start_index, len(strength_old_twin)):
            old_strength_value = strength_old_twin[i]
            new_strength_value = strength_new_twin[i]
            nose.tools.ok_(abs(old_strength_value / new_strength_value - 0.5) < tests.FLOAT_DELTA,
                           "Incorrect application of the product age decay")