def main(argv):
    if len(argv) < 2:
        msg = "You must specify the environment and the product JSON"
        log.error(msg)
        return {"success": False, "message": msg}
    try:
        # command-line arguments
        env = argv[0]
        product = json.loads(argv[1])
        product_id = product.get("external_id")
        if product_id is None:
            msg = "Product has no external_id"
            log.error(msg)
            return {"success": False, "message": msg}
        session = init_session(env)
        maintenance.process_product(session, product_id, product=product, force_update=True)
    except Exception:
        log.exception('Exception on {0}:'.format(__name__))
        return {"success": False, "message": traceback.format_exc()}
    return {"success": True}
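# Usage sketch (illustrative only -- the environment name and product fields
# below are hypothetical, not taken from the source). The second argument is
# the full product serialized as JSON; its "external_id" identifies the product:
#
#     import json
#     result = main(["staging", json.dumps({"external_id": "p_123",
#                                           "language": "english",
#                                           "full_content": "Some content."})])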
def main(argv):
    if len(argv) < 2:
        msg = "You must specify the environment, " \
              "the product id (or --all, or --resume) and the number of days (or --complete) if using --all."
        log.error(msg)
        return {"success": False, "message": msg}
    try:
        # command-line arguments
        env = argv[0]
        product_id = argv[1]
        session = init_session(env)
        if product_id == "--all":
            if len(argv) < 3:
                msg = "You must specify the number of days (or --complete) when using --all."
                log.error(msg)
                return {"success": False, "message": msg}
            days = None if argv[2] == "--complete" else int(argv[2])
            maintenance.process_products(session, days)
        elif product_id == "--resume":
            maintenance.process_products(session, resume=True)
        else:
            force = len(argv) == 3 and argv[2] == '--force'
            maintenance.process_product(session, product_id, force_update=force)
    except Exception:
        log.exception('Exception on {0}:'.format(__name__))
        return {"success": False, "message": traceback.format_exc()}
    return {"success": True}
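# Usage sketch (hypothetical environment and product id). The argument shapes
# accepted by the entry point above:
#
#     main(["staging", "p_123"])                # process a single product
#     main(["staging", "p_123", "--force"])     # ...forcing an update
#     main(["staging", "--all", "7"])           # process products from the last 7 days
#     main(["staging", "--all", "--complete"])  # process all products, with no day limit
#     main(["staging", "--resume"])             # resume a previous run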
def test_tfidf_repeated_calls(self):
    """ Tests the tfidf of the terms in a document after processing the same document several times.
    """
    maintenance.process_products(self.session_context)
    maintenance.process_product(self.session_context, "p_mus_4")
    maintenance.process_product(self.session_context, "p_mus_4")
    tfidf_by_term = self.db_proxy.fetch_tfidf_map(self.text_field, ["p_mus_4"]).get("p_mus_4", {})
    nose.tools.ok_(abs(tfidf_by_term.get("músic") - 1) < tests.FLOAT_DELTA)
def test_df_repeated_calls(self):
    """ Tests the df of (language, term) pairs after processing the same document several times.
    """
    maintenance.process_product(self.session_context, "p_mus_4")
    maintenance.process_product(self.session_context, "p_mus_4")
    nose.tools.eq_(self.db_proxy.find_df("portuguese", "rock"), 4)
    nose.tools.eq_(self.db_proxy.find_df("english", "rock"), 1)
    nose.tools.eq_(self.db_proxy.find_df("english", "merc"), 0)
def test_tf_repeated_calls(self):
    """ Tests the tf of the terms in a document after processing the document more than once.
    """
    product = "p_aut_1"
    maintenance.process_product(self.session_context, product)
    maintenance.process_product(self.session_context, product)
    tf_map = self.db_proxy.fetch_tf_map(self.text_field, [product]).get(product)
    nose.tools.eq_(tf_map["civic"], 2)
    nose.tools.eq_(tf_map["coroll"], 2)
    nose.tools.eq_(tf_map["merc"], 2)
    nose.tools.eq_(tf_map["consum"], 1)
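# Illustrative sketch of the quantities exercised by the three tests above
# (the weighting is the textbook definition, not the project's code): tf counts
# a term's occurrences within one document, df counts the documents containing
# the term, and tf-idf combines them -- here via the standard tf * log(N / df).
# The project's exact weighting and normalization may differ.
import math
from collections import Counter

def toy_tfidf(corpus):
    """Returns {doc_index: {term: tfidf}} for a list of pre-tokenized documents."""
    n_docs = len(corpus)
    df = Counter(term for doc in corpus for term in set(doc))  # document frequencies
    return {i: {term: tf * math.log(n_docs / df[term])
                for term, tf in Counter(doc).items()}          # tf within each document
            for i, doc in enumerate(corpus)}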
def test_product_product_strengths_tfidf_from_scratch_versus_incremental(self):
    """ Tests whether the product x product strengths (TFIDF) generated on a step-by-step basis
        approximately match those created from scratch.
    """
    # inner method to compare strengths
    def compare_strengths(pair_of_products):
        strength1 = strengths_incremental.get(pair_of_products, 0.0)
        strength2 = strengths_from_scratch[pair_of_products]
        nose.tools.ok_(
            abs(strength1 - strength2) < ACCEPTABLE_ON_THE_FLY_VS_FROM_SCRATCH_DEVIATION,
            "Strengths do not match for product pair (%s, %s): " % (pair_of_products[0], pair_of_products[1]) +
            "[incremental --> %.6f] [from scratch --> %.6f]" % (strength1, strength2))
    # ---

    # inner method to compare templates tfidf
    def compare_templates(product):
        templates1 = templates_incremental.get(product, (None, []))
        templates2 = templates_from_scratch.get(product, (None, []))
        nose.tools.eq_(len(templates1[1]), len(templates2[1]),
                       "Numbers of incremental and from-scratch templates do not match")
        for idx in range(len(templates1[1])):
            strength_incremental = templates1[1][idx][0]
            strength_from_scratch = templates2[1][idx][0]
            nose.tools.ok_(
                abs(strength_incremental - strength_from_scratch) < ACCEPTABLE_ON_THE_FLY_VS_FROM_SCRATCH_DEVIATION,
                "Templates do not approximately match for product %s: " % product +
                "[incremental --> %s] [from scratch --> %s]" % (str(templates1), str(templates2)))
    # ---

    all_products = list(self.db_proxy.fetch_all_product_ids())

    sentence = " produto para teste de atualização de similaridade via tfidf"
    products = [{"external_id": product[0],
                 "resources": {"title": product[0]},
                 "date": self.session_context.get_present_date(),
                 "expiration_date": self.session_context.get_present_date() + dt.timedelta(days=30),
                 "full_content": product[1],
                 "language": "portuguese"}
                for product in [("p_new_1", "Primeiro" + sentence),
                                ("p_new_2", "Segundo" + sentence),
                                ("p_new_3", "Terceiro" + sentence),
                                ("p_new_4", "Quarto" + sentence)]]

    # updates strengths after each new product
    for product in products:
        self.db_proxy.insert_product(product)
        maintenance.process_product(self.session_context, product["external_id"])

    # saves locally the strengths and the templates that were obtained incrementally
    strengths_incremental = self.db_proxy.fetch_product_product_strengths_tfidf()
    templates_incremental = self.db_proxy.fetch_product_templates(all_products)

    # regenerates all strengths from scratch
    pttfidf.generate_templates(self.session_context)

    # saves locally the strengths and the templates that were obtained from scratch
    strengths_from_scratch = self.db_proxy.fetch_product_product_strengths_tfidf()
    templates_from_scratch = self.db_proxy.fetch_product_templates(all_products)

    nose.tools.eq_(len(strengths_incremental), len(strengths_from_scratch),
                   "Numbers of non-zero tfidf strengths do not match")

    for product_pair in strengths_from_scratch:
        compare_strengths(product_pair)
    for product_pair in strengths_incremental:
        compare_strengths(product_pair)

    for product in all_products:
        compare_templates(product)
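# Illustrative sketch only: one plausible reading of the "product x product
# strengths (TFIDF)" compared above is the cosine similarity between two
# products' tf-idf term vectors. The helper below is an assumption for
# illustration, not the project's actual strength computation.
import math

def toy_cosine_strength(tfidf_a, tfidf_b):
    """Cosine similarity between two {term: tfidf} maps."""
    dot = sum(weight * tfidf_b.get(term, 0.0) for term, weight in tfidf_a.items())
    norm_a = math.sqrt(sum(w * w for w in tfidf_a.values()))
    norm_b = math.sqrt(sum(w * w for w in tfidf_b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0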
def test_near_identical(self):
    """ Tests that two products considered 'near-identical' are not recommended at the same time
        (within the same page) when the filtering strategy is AFTER_SCORING.
    """
    target = "u_tec_1"
    id_twin_product_1 = "p_tec_TWIN_1"
    id_twin_product_2 = "p_tec_TWIN_2"

    date = self.session_context.get_present_date() - dt.timedelta(days=1)

    twin_product_1 = {"external_id": id_twin_product_1,
                      "language": "english",
                      "date": date,
                      "expiration_date": date + dt.timedelta(days=30),
                      "resources": {"title": "Whatever Gets You Through The Night"},
                      "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                      "category": "Nonsense"}

    twin_product_2 = {"external_id": id_twin_product_2,
                      "language": "english",
                      "date": date,
                      "expiration_date": date + dt.timedelta(days=30),
                      "resources": {"title": "Whatever Gets You Through This Night is Alright"},
                      "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                      "category": "Nonsense"}

    self.db_proxy.insert_product(twin_product_1)
    tasks.process_product(self.session_context, id_twin_product_1)
    self.db_proxy.insert_product(twin_product_2)
    tasks.process_product(self.session_context, id_twin_product_2)

    # makes it so that all users consume (and have impressions on) the twins, except for the target user
    users = self.db_proxy.fetch_all_user_ids()
    for user in users:
        if user != target:
            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_1,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)

            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_2,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)

            if self.session_context.impressions_enabled:
                is_anonymous = config.is_anonymous(user)
                self.db_proxy.increment_impression_summary(user, id_twin_product_1,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)
                self.db_proxy.increment_impression_summary(user, id_twin_product_2,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)

    ut.generate_templates(self.session_context)
    pt.generate_templates(self.session_context)
    pttfidf.generate_templates(self.session_context)  # Unfortunately we need to regenerate from scratch,
                                                      # otherwise the df's of the twins will be different.

    # First, we recommend WITHOUT near-identical filtering, to check that the twins really appear consecutively.
    custom_settings = {'near_identical_filter_field': None,
                       'near_identical_filter_threshold': None}
    session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
    session.refresh()
    recommender = session.get_recommender()

    if not recommender.is_hybrid():  # For hybrid recommenders, this check is meaningless.
        recommendations = recommender.recommend(100)
        twin_index = -1
        for idx, recommendation in enumerate(recommendations):
            if recommendation[1].startswith("p_tec_TWIN_"):
                if twin_index >= 0:
                    nose.tools.eq_(idx - twin_index, 1,
                                   "The two near-identical products should appear consecutively without filtering")
                    break
                twin_index = idx

    # Now we recommend WITH near-identical filtering
    recommendation_page_size = 5
    custom_settings = {'near_identical_filter_field': 'resources.title',
                       'near_identical_filter_threshold': 2,
                       'recommendations_page_size': recommendation_page_size}
    session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
    session.refresh()
    recommender = session.get_recommender()
    recommendations = recommender.recommend(100)

    # Sanity check
    recommended_products = {r[1] for r in recommendations}
    count_recommended_twins = len({id_twin_product_1, id_twin_product_2} & recommended_products)
    nose.tools.ok_(count_recommended_twins > 0,
                   "At least one of the twins should have been recommended, otherwise the test is meaningless")

    # Actual tests
    twin_index = -1 * recommendation_page_size - 1  # initial value, so the first twin passes the test
    for idx, recommendation in enumerate(recommendations):
        if recommendation[1].startswith("p_tec_TWIN_"):
            nose.tools.ok_(idx - twin_index > 1,  # it suffices to show that the twins have been separated
                           "Two near-identical products should not appear within the same recommendations page")
            twin_index = idx
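# Illustrative sketch only: the test above drives the filter with
# 'near_identical_filter_field' = 'resources.title' and a threshold of 2, which
# suggests a bounded-difference comparison of the title terms. The helpers below
# are one plausible reading -- treat two products as near-identical when their
# title term sets (lower-cased, stopwords dropped) differ by at most `threshold`
# terms. This is an assumption for illustration, not the project's actual filter.

TOY_STOPWORDS = {"the", "this", "is", "a", "an", "you"}

def toy_title_terms(title):
    """Lower-cases and tokenizes a title, dropping stopwords."""
    return {term for term in title.lower().split() if term not in TOY_STOPWORDS}

def toy_near_identical(title_1, title_2, threshold=2):
    """True when the two titles differ by at most `threshold` content terms."""
    return len(toy_title_terms(title_1) ^ toy_title_terms(title_2)) <= threshold

# With the twin titles above:
#   toy_near_identical("Whatever Gets You Through The Night",
#                      "Whatever Gets You Through This Night is Alright")  # --> True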
def test_product_age_decay_exponential(self):
    """ Tests the effect of applying an exponential product age decay factor to recommendations.
        It applies to all recommendation heuristics.
    """
    target = "u_tec_1"
    id_twin_product_old = "p_tec_TWIN_OLD"
    id_twin_product_new = "p_tec_TWIN_NEW"

    # makes it so that the oldest twin is 2 days (the configured half-life) older
    old_date = self.session_context.get_present_date() - dt.timedelta(days=2)
    new_date = self.session_context.get_present_date()

    twin_product_old = {"external_id": id_twin_product_old,
                        "language": "english",
                        "date": old_date,
                        "expiration_date": old_date + dt.timedelta(days=30),
                        "resources": {"title": "Whatever Gets You Through The Night"},
                        "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                        "category": "Nonsense"}

    twin_product_new = {"external_id": id_twin_product_new,
                        "language": "english",
                        "date": new_date,
                        "expiration_date": new_date + dt.timedelta(days=30),
                        "resources": {"title": "Whatever Gets You Through The Night"},
                        "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                        "category": "Nonsense"}

    self.db_proxy.insert_product(twin_product_old)
    tasks.process_product(self.session_context, id_twin_product_old)
    self.db_proxy.insert_product(twin_product_new)
    tasks.process_product(self.session_context, id_twin_product_new)

    # makes it so that all users consume (and have impressions on) the twins, except for the target user
    users = self.db_proxy.fetch_all_user_ids()
    for user in users:
        if user != target:
            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_old,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)

            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_new,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)

            if self.session_context.impressions_enabled:
                is_anonymous = config.is_anonymous(user)
                self.db_proxy.increment_impression_summary(user, id_twin_product_old,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)
                self.db_proxy.increment_impression_summary(user, id_twin_product_new,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)

    ut.generate_templates(self.session_context)
    pt.generate_templates(self.session_context)
    pttfidf.generate_templates(self.session_context)  # Unfortunately we need to regenerate from scratch,
                                                      # otherwise the df's of the twins will be different.

    custom_settings = {'product_age_decay_function_name': 'exponential',
                       'product_age_decay_exponential_function_halflife': 2,
                       'near_identical_filter_field': None,
                       'near_identical_filter_threshold': None}  # Disables near-identical filtering

    session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
    session.refresh()
    recommender = session.get_recommender()

    # Determines the index of the first actual strength value in the score tuples
    # produced by the recommender (note that hybrid recommenders use the first
    # position to indicate the algorithm number)
    start_index = 1 if recommender.is_hybrid() else 0

    recommendations = recommender.recommend(100)
    nose.tools.ok_(len(recommendations) > 0, "No recommendations were returned!")

    strength_old_twin = None
    strength_new_twin = None
    for rec in recommendations:
        if rec[1] == id_twin_product_old:
            strength_old_twin = rec[0]
        if rec[1] == id_twin_product_new:
            strength_new_twin = rec[0]

    for i in range(start_index, len(strength_old_twin)):
        old_strength_value = strength_old_twin[i]
        new_strength_value = strength_new_twin[i]
        nose.tools.ok_(abs(old_strength_value / new_strength_value - 0.5) < tests.FLOAT_DELTA,
                       "Incorrect application of the product age decay")
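# Grounding for the 0.5 ratio asserted above, with a small sketch (the function
# and its names are illustrative, not the project's API): an exponential decay
# with half-life h multiplies a product's score by 0.5 ** (age / h), so a
# 2-day-old product under the configured 2-day half-life scores exactly half of
# an otherwise identical fresh one.

def toy_age_decay(age_in_days, half_life_in_days):
    """Exponential product-age decay factor with the given half-life."""
    return 0.5 ** (age_in_days / half_life_in_days)

# toy_age_decay(0, 2) == 1.0   (fresh product: no decay)
# toy_age_decay(2, 2) == 0.5   (one half-life: score halves)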