def main(argv):
    """Run one product-template consolidation batch.

    :param argv: Command-line arguments. ``argv[0]`` is the environment handed to
        ``init_session``. The optional ``argv[1]`` is either the literal ``"--all"``
        (passed through unchanged) or a comma-separated list of product ids, which is
        split into a list before being passed to ``consolidate_product_templates``.
    :returns: ``{"success": True}`` when the batch completes. Otherwise
        ``{"success": False, "message": ...}`` where the message is the reason the batch
        was not started, or the formatted traceback of the failure.
    """
    if len(argv) < 1:
        msg = "You must specify the environment"
        log.error(msg)
        return {"success": False, "message": msg}

    # Bound up-front so the except handler can tell how far the run got: a failure inside
    # init_session() or get_present_date() must not turn into an UnboundLocalError there.
    session = None
    timestamp = None
    try:
        # command-line arguments
        env = argv[0]
        session = init_session(env)

        product_ids = None
        if len(argv) >= 2:
            product_ids = argv[1]
            if product_ids != "--all":
                product_ids = argv[1].split(",")

        timestamp = session.get_present_date()
        start = time()

        latest_run = session.data_proxy.fetch_latest_batch_info_product_template_consolidation()
        if latest_run and latest_run.get("status") == "running":
            msg = "An old consolidation batch is still running. Won't start another one."
            log.info(msg)
            return {"success": False, "message": msg}

        session.data_proxy.save_timestamp_product_template_consolidation(status="running", timestamp=timestamp)

        consolidate_product_templates(session, product_ids)
        session.data_proxy.ensure_indexes_cache()

        elapsed_time = time() - start

        session.data_proxy.save_timestamp_product_template_consolidation(
            status="success", timestamp=timestamp, elapsed_time=elapsed_time
        )

        return {"success": True}

    except Exception:
        log.exception("Exception on {0}:".format(__name__))
        # Capture the traceback of the original failure before touching the database again.
        failure_message = traceback.format_exc()

        # The "failed" status can only be recorded once a session and a batch timestamp exist.
        if session is not None and timestamp is not None:
            session.data_proxy.save_timestamp_product_template_consolidation(status="failed", timestamp=timestamp)

        return {"success": False, "message": failure_message}
Пример #2
0
def generate_templates(session_context):
    """ Generates the strengths and then consolidates the collaborative product templates.

        Calls ``generate_strengths`` followed by ``consolidate_product_templates`` with
        ``collaborative=True`` and ``tfidf=False``, both on the same session context.

        :param session_context: The session context, passed through to both calls.
    """
    generate_strengths(session_context)
    consolidate_product_templates(session_context, collaborative=True, tfidf=False)
Пример #3
0
def update_templates(
    session_context,
    new_activity,
    u_p_activities_summary=None,
    first_impression_date=None,
    should_lookup_activities_summary=True,
    should_lookup_first_impression=True,
):
    """ Updates product x product strengths based on a single new activity.

        The affected strength operands (numerators and denominators) are adjusted incrementally,
        saved through ``data_proxy.save_pp_strengths`` (once for updated pairs, once for new pairs)
        and, when ``session_context.should_consolidate_product_templates_on_the_fly`` is set, the
        templates of the affected base products are consolidated. Finally the activity is recorded via
        ``data_proxy.save_latest_activity_for_product_product_strengths``.

        The function returns early, without saving anything, when the user is anonymous, when the
        activity type has no rating in ``session_context.rating_by_activity``, or when the rating equals
        that of the previously processed activity while impressions are disabled.

        :param session_context: The session context.
        :param new_activity: a dict {"external_user_id": user_id,
                                     "external_product_id": product_id,
                                     "activity": activity_type,
                                     "created_at": datetime}.
        :param u_p_activities_summary: The summary of activities for that (user, product) pair, if any,
            in the form of a dict {"external_user_id": the user id,
                                   "external_product_id": the product id,
                                   "activity": the latest activity type,
                                   "created_at": the datetime of the latest activity,
                                   "pp_latest_type": the type of the latest activity to be processed for
                                                     that pair during p-p strengths calculation,
                                   "pp_latest_date": the date of the latest activity to be processed for
                                                     that pair during p-p strengths calculation}.
        :param first_impression_date: The date of the first impression, if any, the activity user has received on
            the activity product.
        :param should_lookup_activities_summary: If True and u_p_activities_summary is None, it queries the
            database for the activities summary of the (user, product) pair.
        :param should_lookup_first_impression: If True and first_impression_date is None (and impressions are
            enabled in the session context), it queries the database for the first impression.
        :returns: None.
    """
    log.info("Computing product-product strengths...")

    user = new_activity["external_user_id"]
    if config.is_anonymous(user):
        log.info("Anonymous users should not affect product-product strengths! Exiting now.")
        return

    product = new_activity["external_product_id"]
    activity_date = new_activity["created_at"]
    activity_type = new_activity["activity"]
    rating = session_context.rating_by_activity.get(activity_type)
    if rating is None:
        log.error("Unsupported activity type: %s" % activity_type)
        return

    # The cutoff date is used as min_date in the fetches below; it is never earlier than the
    # cutoff persisted by the latest batch, when there is one.
    suggested_cutoff_date = session_context.get_present_date() - dt.timedelta(
        session_context.product_product_strengths_window
    )
    latest_batch_info = session_context.data_proxy.fetch_latest_batch_info_product_product_strengths()
    if latest_batch_info is not None:
        latest_batch_timestamp = latest_batch_info["timestamp"]
        persisted_cutoff_date = latest_batch_info.get("cutoff_date")
        if persisted_cutoff_date is None:
            cutoff_date = suggested_cutoff_date
        else:
            cutoff_date = max(persisted_cutoff_date, suggested_cutoff_date)
    else:
        latest_batch_timestamp = None
        cutoff_date = suggested_cutoff_date

    if session_context.impressions_enabled and first_impression_date is None and should_lookup_first_impression:
        # The summary is indexed as [product][user]; element [1] is taken as the first impression date.
        # With the (0, None) default, first_impression_date stays None when nothing is found.
        product_user_impressions_summary = (
            session_context.data_proxy.fetch_impressions_summary(
                product_ids=[product], user_ids=[user], group_by_product=True, anonymous=False
            )
            .get(product, {})
            .get(user, (0, None))
        )
        first_impression_date = product_user_impressions_summary[1]

    if u_p_activities_summary is None and should_lookup_activities_summary:
        u_p_activities_summary_as_singleton_list = session_context.data_proxy.fetch_activity_summaries_by_user(
            user_ids=[user], product_ids=[product], indexed_fields_only=False, anonymous=False
        ).get(user, [])
        if len(u_p_activities_summary_as_singleton_list) > 0:
            u_p_activities_summary = u_p_activities_summary_as_singleton_list[0]

    # A rating of 0 stands for "no previously processed activity for this (user, product) pair".
    previous_activity_rating = 0
    if u_p_activities_summary is not None:
        previous_activity_type = u_p_activities_summary.get("pp_latest_type")
        if previous_activity_type is not None:
            previous_activity_rating = session_context.rating_by_activity[previous_activity_type]
            # NOTE(review): previous_activity_date is only bound in this branch; its later use relies on
            # the minimum rating thresholds being greater than 0 --- confirm.
            previous_activity_date = u_p_activities_summary["pp_latest_date"]

    if previous_activity_rating == rating and not session_context.impressions_enabled:
        return  # repeating the latest activity --- there is nothing to do here
        # (if using impressions, must recalculate anyway to account for latest impressions)

    # numerator_diff is indexed by CONSERVATIVE / AGGRESSIVE; both diffs accumulate the net change
    # caused by replacing the previous activity's contribution with the new one.
    numerator_diff = [0, 0]
    denominator_diff = 0

    remove_previous_activity_contribution = previous_activity_rating >= min(
        session_context.min_rating_conservative, session_context.min_rating_recommendable_from_product
    )
    if remove_previous_activity_contribution:
        if session_context.impressions_enabled:
            if first_impression_date is not None:
                # must remove former contribution if impression was already processed incrementally
                remove_previous_activity_contribution = previous_activity_date >= first_impression_date
                # must remove also if generation from scratch happened after the first impression
                if not remove_previous_activity_contribution and latest_batch_timestamp is not None:
                    remove_previous_activity_contribution = latest_batch_timestamp >= first_impression_date

    # Removes the former contribution of the previous commanding activity for that (user, product) pair.
    if remove_previous_activity_contribution:
        if previous_activity_rating >= session_context.min_rating_conservative:
            numerator_diff[CONSERVATIVE] -= 1
        if previous_activity_rating >= session_context.min_rating_aggressive:
            numerator_diff[AGGRESSIVE] -= 1
        if previous_activity_rating >= session_context.min_rating_recommendable_from_product:
            denominator_diff -= 1

    # Adds the contribution of this activity.
    if rating >= session_context.min_rating_conservative:
        numerator_diff[CONSERVATIVE] += 1
    if rating >= session_context.min_rating_aggressive:
        numerator_diff[AGGRESSIVE] += 1
    if rating >= session_context.min_rating_recommendable_from_product:
        denominator_diff += 1

    # Fetches all the products consumed by this user.
    products_by_rating = session_context.data_proxy.fetch_products_by_rating_by_user(
        user_ids=[user], min_date=cutoff_date, max_date=session_context.get_present_date()
    )[0].get(user, {})

    # Includes the product of the current activity (remember: this activity might not have been saved yet)
    products_set = products_by_rating.get(rating, set())
    products_set.add(product)
    products_by_rating[rating] = products_set
    if u_p_activities_summary is not None:
        # Drops the product from the set of its previous rating.
        # NOTE(review): when previous_activity_rating == rating this removes the product that was
        # just added above --- confirm this is intended.
        products_set = products_by_rating.get(previous_activity_rating, set())
        if product in products_set:
            products_set.remove(product)
            products_by_rating[previous_activity_rating] = products_set

    # Unions of the per-rating product sets at or above each threshold.
    # NOTE(review): the exclusive upper bound 6 presumably reflects a maximum rating of 5 --- confirm.
    products_rated_conservatively_high = set()
    for r in range(session_context.min_rating_conservative, 6):
        products_rated_conservatively_high |= products_by_rating.get(r, set())
    products_rated_aggressively_high = set()
    for r in range(session_context.min_rating_aggressive, 6):
        products_rated_aggressively_high |= products_by_rating.get(r, set())
    products_rated_sufficiently_for_recommendation = set()
    for r in range(session_context.min_rating_recommendable_from_product, 6):
        products_rated_sufficiently_for_recommendation |= products_by_rating.get(r, set())

    # The operand maps stay None unless the corresponding branch below fetches them.
    # The strengths maps are keyed by (base_product, template_product) pairs.
    numerators_with_product_as_template = None
    denominators_with_product_as_template = None
    numerators_with_product_as_base = None
    denominators_with_product_as_base = None
    strengths_map_for_insert = {}
    strengths_map_for_update = {}

    # This product as TEMPLATE

    # If this product has been consumed by this user without previous impressions, then it shall not contribute
    # for product-product strengths with this product as template.
    update_product_as_template = True
    if session_context.impressions_enabled:
        update_product_as_template = first_impression_date is not None

    # Existing pairs with product as template.

    if update_product_as_template and numerator_diff != [0, 0]:

        strength_operands_with_product_as_template = session_context.data_proxy.fetch_product_product_strength_operands(
            templates=[product]
        )
        numerators_with_product_as_template = strength_operands_with_product_as_template[0]
        denominators_with_product_as_template = strength_operands_with_product_as_template[1]

        for product_and_template, numerator_tuple in numerators_with_product_as_template.items():
            base_product = product_and_template[0]
            if base_product in products_rated_sufficiently_for_recommendation:
                new_numerator_tuple = [numerator_tuple[0] + numerator_diff[0], numerator_tuple[1] + numerator_diff[1]]
                numerators_with_product_as_template[product_and_template] = new_numerator_tuple
                update_doc = strengths_map_for_update.get(product_and_template, {})
                update_doc["nc"] = new_numerator_tuple[CONSERVATIVE]
                update_doc["na"] = new_numerator_tuple[AGGRESSIVE]
                strengths_map_for_update[product_and_template] = update_doc

    # New pairs with product as template.

    # numerator_diff[0] == 1 implies numerator_diff != [0, 0], so the operand maps were fetched
    # by the branch above whenever this branch runs.
    if update_product_as_template and numerator_diff[0] == 1:  # if this user has *just* rated this product high...
        new_base_products = []
        for base_product in products_rated_sufficiently_for_recommendation:
            if base_product != product and (base_product, product) not in numerators_with_product_as_template:
                new_base_products += [base_product]
                new_numerator_tuple = [
                    1 if rating >= session_context.min_rating_conservative else 0,
                    1 if rating >= session_context.min_rating_aggressive else 0,
                ]
                numerators_with_product_as_template[(base_product, product)] = new_numerator_tuple
                update_doc = strengths_map_for_insert.get((base_product, product), {})
                update_doc["nc"] = new_numerator_tuple[CONSERVATIVE]
                update_doc["na"] = new_numerator_tuple[AGGRESSIVE]
                strengths_map_for_insert[(base_product, product)] = update_doc

        users_by_rating_by_new_base_product = session_context.data_proxy.fetch_users_by_rating_by_product(
            product_ids=new_base_products, min_date=cutoff_date, max_date=session_context.get_present_date()
        )[0]

        for new_base_product in new_base_products:
            source_users = set()
            for r in range(session_context.min_rating_recommendable_from_product, 6):
                # NOTE(review): direct indexing raises KeyError if the fetched map lacks the product or the
                # rating, unless the data proxy returns defaultdict-like maps --- confirm.
                source_users |= users_by_rating_by_new_base_product[new_base_product][r]
            if session_context.impressions_enabled:
                # Retrieves the intersection of the top-rated users of the base product
                # with the users with impressions for the template product
                source_users_with_impressions = session_context.data_proxy.fetch_users_with_impressions_by_product(
                    product_ids=[product], user_ids=list(source_users), anonymous=False
                ).get(product, set())
                new_denominator = len(source_users_with_impressions)
            else:
                new_denominator = len(source_users)
            denominators_with_product_as_template[(new_base_product, product)] = new_denominator
            insert_doc = strengths_map_for_insert.get((new_base_product, product), {})
            insert_doc["denominator"] = new_denominator
            strengths_map_for_insert[(new_base_product, product)] = insert_doc

    # This product as BASE PRODUCT

    # Existing pairs with product as base product.

    if session_context.bidirectional_pp_strength_updates and denominator_diff != 0:
        product_product_strength_operands = session_context.data_proxy.fetch_product_product_strength_operands(
            products=[product]
        )
        numerators_with_product_as_base = product_product_strength_operands[0]
        denominators_with_product_as_base = product_product_strength_operands[1]

        for product_and_template in denominators_with_product_as_base:
            # updates the denominator...
            denominator = denominators_with_product_as_base[product_and_template]
            new_denominator = denominator + denominator_diff
            denominators_with_product_as_base[product_and_template] = new_denominator
            update_doc = strengths_map_for_update.get(product_and_template, {})
            update_doc["denominator"] = new_denominator
            strengths_map_for_update[product_and_template] = update_doc

            # ...and the numerator, in case the template product has been consumed by this user
            if (
                product_and_template[1] in products_rated_conservatively_high
                and product_and_template in numerators_with_product_as_base
            ):
                numerator_tuple = numerators_with_product_as_base[product_and_template]
                numerator_tuple[CONSERVATIVE] += denominator_diff
                if product_and_template[1] in products_rated_aggressively_high:
                    numerator_tuple[AGGRESSIVE] += denominator_diff
                numerators_with_product_as_base[product_and_template] = numerator_tuple
                update_doc = strengths_map_for_update.get(product_and_template, {})
                update_doc["nc"] = numerator_tuple[CONSERVATIVE]
                update_doc["na"] = numerator_tuple[AGGRESSIVE]
                strengths_map_for_update[product_and_template] = update_doc

    # New pairs with product as base product.

    # denominator_diff == 1 implies denominator_diff != 0, so the operand maps were fetched
    # by the branch above whenever this branch runs.
    if session_context.bidirectional_pp_strength_updates and denominator_diff == 1:
        # if this product has *just* been rated at least conservatively high...
        new_templates = []
        for template in products_rated_conservatively_high:
            if template != product and (product, template) not in denominators_with_product_as_base:  # new pair
                new_templates += [template]

        if len(new_templates) > 0:
            users_of_product_as_base = session_context.data_proxy.fetch_users_by_rating_by_product(
                product_ids=[product], min_date=cutoff_date, max_date=session_context.get_present_date()
            )[0].get(product, {})
            # Includes the user of the current activity (remember again: this activity might not have been saved yet)
            users_set = users_of_product_as_base.get(rating, set())
            users_set.add(user)
            users_of_product_as_base[rating] = users_set

            recommending_users_of_product_as_base = set()
            for r in range(session_context.min_rating_recommendable_from_product, 6):
                recommending_users_of_product_as_base |= users_of_product_as_base.get(r, set())

            if session_context.impressions_enabled:
                user_impressions_by_template = session_context.data_proxy.fetch_impressions_summary(
                    product_ids=new_templates,
                    user_ids=list(recommending_users_of_product_as_base),
                    group_by_product=True,
                    anonymous=False,
                )

            for new_template in new_templates:
                if session_context.impressions_enabled:
                    # The denominator is the number of entries the impressions summary holds for the template.
                    new_denominator = len(user_impressions_by_template.get(new_template, []))
                else:
                    new_denominator = len(recommending_users_of_product_as_base)
                denominators_with_product_as_base[(product, new_template)] = new_denominator
                insert_doc = strengths_map_for_insert.get((product, new_template), {})
                insert_doc["denominator"] = new_denominator
                strengths_map_for_insert[(product, new_template)] = insert_doc

            for new_template in new_templates:
                if new_template in products_rated_conservatively_high:
                    numerator_tuple = numerators_with_product_as_base.get((product, new_template), [0, 0])
                    numerator_tuple[CONSERVATIVE] += 1
                    if new_template in products_rated_aggressively_high:
                        numerator_tuple[AGGRESSIVE] += 1
                    numerators_with_product_as_base[(product, new_template)] = numerator_tuple
                    insert_doc = strengths_map_for_insert.get((product, new_template), {})
                    insert_doc["nc"] = numerator_tuple[CONSERVATIVE]
                    insert_doc["na"] = numerator_tuple[AGGRESSIVE]
                    strengths_map_for_insert[(product, new_template)] = insert_doc

    # Computes all affected strengths for UPDATE

    if len(strengths_map_for_update) > 0:
        _prepare_strengths_map(
            session_context,
            product,
            strengths_map_for_update,
            numerators_with_product_as_base,
            denominators_with_product_as_base,
            numerators_with_product_as_template,
            denominators_with_product_as_template,
        )

        log.info("Saving product-product strengths (UPDATE)...")
        session_context.data_proxy.save_pp_strengths(strengths_map_for_update, upsert=True)
        log.info("[{0}] product-product strengths updated".format(len(strengths_map_for_update)))
    else:
        log.info("No old strengths to update.")

    # Computes all affected strengths for INSERT

    if len(strengths_map_for_insert) > 0:
        _prepare_strengths_map(
            session_context,
            product,
            strengths_map_for_insert,
            numerators_with_product_as_base,
            denominators_with_product_as_base,
            numerators_with_product_as_template,
            denominators_with_product_as_template,
        )

        log.info("Saving product-product strengths (INSERT)...")
        session_context.data_proxy.save_pp_strengths(strengths_map_for_insert, upsert=False)
        log.info("[{0}] product-product strengths inserted".format(len(strengths_map_for_insert)))
    else:
        log.info("No new strengths to insert.")

    # Consolidates cached product templates

    log.info("Determining products whose templates must be consolidated...")
    # Every base product of a newly inserted pair is consolidated unconditionally.
    products_to_consolidate = {product_and_template[0] for product_and_template in strengths_map_for_insert}

    updated_products = {product_and_template[0] for product_and_template in strengths_map_for_update}
    old_templates_map = session_context.data_proxy.fetch_product_templates(list(updated_products))
    # A base product of an updated pair is consolidated only if the update may change its template list:
    # the new strength beats the weakest cached template, the template is already cached,
    # or fewer than 3 * product_templates_count templates are cached.
    for product_and_template, strength_doc in strengths_map_for_update.items():
        base_product = product_and_template[0]
        template_product = product_and_template[1]
        cutoff_strength = 0
        old_template_ids = set()
        old_templates = old_templates_map.get(base_product)
        if old_templates:
            # Index 0 holds the collaborative templates, read here as (strength, template id) entries.
            old_templates_collaborative = old_templates[0]
            if old_templates_collaborative:
                cutoff_strength = old_templates_collaborative[-1][0]  # the strength of the weakest template
                # NOTE(review): a str strength is treated as "no cutoff"; presumably a placeholder
                # value stored in the cache --- confirm.
                if isinstance(cutoff_strength, str):
                    cutoff_strength = 0
                old_template_ids = {t[1] for t in old_templates_collaborative}
        if (
            strength_doc["strength"] > cutoff_strength
            or template_product in old_template_ids
            or len(old_template_ids) < 3 * session_context.product_templates_count
        ):
            products_to_consolidate.add(base_product)

    if session_context.should_consolidate_product_templates_on_the_fly:
        if len(products_to_consolidate) > 0:
            log.info("Consolidating templates of %d products..." % len(products_to_consolidate))
            consolidate_product_templates(
                session_context, products_list=list(products_to_consolidate), collaborative=True, tfidf=False
            )
        else:
            log.info("No products with templates to consolidate.")

    session_context.data_proxy.save_latest_activity_for_product_product_strengths(
        user, product, activity_type, activity_date
    )

    log.info("PP strengths and templates updated successfully.")
Пример #4
0
def update_templates(session_context, product_id, language, tfidf_by_top_term_by_attribute):
    """ Updates product-product strengths based on their content.

        The attributes which are taken into consideration are those defined in the
        customer config file PRODUCT_MODEL entry. Product attributes whose 'similarity_filter'
        is set to true must be equal so that two products must have non-zero mutual similarity.
        Product attributes whose 'similarity_weight' is strictly positive are linearly combined
        according to the assigned weights.

        This function does not recreate all strengths from scratch; rather, it updates
        the strengths of all product-product pairs containing the product whose *product_id* is given.
        After saving the strengths, it consolidates the tfidf templates of every base product whose
        cached template list may have been affected.

        :param session_context: The session context.
        :param product_id: The intended product.
        :param language: The language of the product being processed.
        :param tfidf_by_top_term_by_attribute: A map {attribute: {term: tfidf}}, containing the TFIDF's of
            the top TFIDF terms in each of the TEXT-type attribute of the product being processed.
        :returns: None.
    """
    strengths = {}

    text_fields = session_context.product_text_fields
    cutoff_date = session_context.get_present_date() - dt.timedelta(
        session_context.product_product_strengths_tfidf_window)

    product_models = {}

    # Processes each TEXT attribute.
    for attribute in text_fields:
        weight = session_context.similarity_weights_by_type[pm.TEXT].get(attribute, 0)
        if weight == 0:
            continue  # attributes without weight do not contribute to the strengths

        log.info("Fetching products with common terms in attribute [%s]..." % attribute)
        terms = list(tfidf_by_top_term_by_attribute.get(attribute, []))
        new_product_models = session_context.data_proxy.fetch_product_models_for_top_tfidf_terms(
            attribute, language, terms, min_date=cutoff_date, max_date=session_context.get_present_date())
        product_models.update(new_product_models)

        if len(new_product_models) > 1:  # we require at least one product model other than that of the current product
            product_ids_list = list(new_product_models)

            log.info("Fetching TFIDF maps for attribute [%s] in [%d] products..." % (attribute, len(product_ids_list)))
            tfidf_by_term_by_product = session_context.data_proxy.fetch_tfidf_map(attribute, product_ids_list)

            log.info("Computing strengths...")
            _process_text_attribute_contributions(strengths, tfidf_by_term_by_product, weight, product_id)

    # Processes the non-TEXT attributes.
    _process_non_text_attributes_contributions(session_context, product_models, strengths)

    # Persists the updated strengths (values below the minimum acceptable strength are saved as 0).
    log.info("Saving strengths tfidf...")
    strengths_list = [{"product": product_pair[0],
                       "template_product": product_pair[1],
                       "strength": value if value >= MIN_ACCEPTABLE_PP_STRENGTH_TFIDF else 0}
                      for product_pair, value in strengths.items()]
    session_context.data_proxy.save_product_product_strengths_tfidf(strengths_list)

    # Consolidates cached product templates

    log.info("Determining products whose templates tfidf must be consolidated...")
    products_to_consolidate = set()
    updated_products = {product_and_template[0] for product_and_template in strengths}
    old_templates_map = session_context.data_proxy.fetch_product_templates(list(updated_products))

    for product_and_template, strength in strengths.items():
        base_product = product_and_template[0]
        template_product = product_and_template[1]

        should_consolidate = True
        old_templates = old_templates_map.get(base_product)
        if old_templates is not None:
            old_templates_tfidf = old_templates[1]  # index 1 holds the tfidf templates
            if len(old_templates_tfidf) > 0:
                cutoff_strength = old_templates_tfidf[-1][0]  # the strength of the weakest template tfidf
                old_template_ids = {t[1] for t in old_templates_tfidf}
                # Consolidation can be skipped only when the cached list is already long enough and this
                # pair can neither enter it nor change an entry in it. The length that matters is the number
                # of cached tfidf templates, not the size of the container that holds the template lists.
                if strength <= cutoff_strength and \
                   template_product not in old_template_ids and \
                   len(old_template_ids) >= 3 * session_context.product_templates_count:
                    should_consolidate = False

        if should_consolidate:
            products_to_consolidate.add(base_product)

    if len(products_to_consolidate) > 0:
        log.info("Consolidating templates of %d products..." % len(products_to_consolidate))
        consolidate_product_templates(session_context, products_list=list(products_to_consolidate),
                                      collaborative=False, tfidf=True)