def __get_rec_item_ids(self, item_id, sim_cat_ids):
    """Collect the recommended item ids for a query item.

    Args:
        item_id: id of the query item; looked up in ``self.dim_items_index``.
        sim_cat_ids: list of ``(cat_id, sim_value)`` pairs for the categories
            similar to the query item. Only the first
            ``self.strategy.get_max_sim_cat_process()`` entries are processed.

    Returns:
        A list of recommended item ids (ids only, without similarity values),
        ordered by descending similarity and truncated to
        ``self.strategy.get_total_rec_num()`` entries.
        ``None`` when ``item_id`` is not present in ``self.dim_items_index``.
    """
    # -1 is the "not found" sentinel used for index lookups.
    target_item = self.dim_items_index.get(item_id, -1)
    if target_item == -1:
        write_log(sys._getframe().f_lineno,
                  "cannot get info of item_id:%d" % (item_id))
        return

    candidate_cats = sim_cat_ids[0: self.strategy.get_max_sim_cat_process()]
    rec_items = []
    total_candidates = 0
    timer_total = Timer()

    for i, (cat_id, _sim_value) in enumerate(candidate_cats):
        sim_item_ids = self.cat_to_item_rindex.get(cat_id, [])
        num_candidates = len(sim_item_ids)
        if num_candidates == 0:
            write_log(sys._getframe().f_lineno,
                      "cat_id:%d has no item" % (cat_id))
            continue

        timer = Timer()
        total_candidates += num_candidates
        # The category's rank ``i`` is forwarded so the strategy can decide
        # how many items this category may contribute.
        res_list = self.__find_sim_item_from_a_list(target_item, sim_item_ids, i)
        write_log(msg="__find_sim_item_from_a_list cost time:%f, sim_item_ids size:%d, i:%d"
                  % (timer.get_diff(), num_candidates, i))
        rec_items.extend(res_list)

    write_log(msg="all__find_sim_item_from_a_list cost time:%f, all_sim_item_ids size:%d"
              % (timer_total.get_diff(), total_candidates))
    write_log(msg='process item_id:%d, rec_items size:%d'
              % (item_id, len(rec_items)))

    # Highest similarity first. key/reverse replaces the cmp-style comparator,
    # which Python 3 no longer accepts; the sort is stable, so ties keep the
    # same relative order as before.
    rec_items.sort(key=lambda pair: pair[1], reverse=True)
    top_items = rec_items[0: self.strategy.get_total_rec_num()]
    return [rec_id for (rec_id, _value) in top_items]
def __find_sim_item_from_a_list(self, target_item, sim_item_ids, idx,
                                max_candidates=20000):
    """Score candidate items against the target item by title similarity.

    Args:
        target_item: record of the query item; its title is read from index 1.
        sim_item_ids: iterable of candidate item ids to score.
        idx: position of the candidates' category in the similar-category
            list; passed to ``self.strategy.num_to_return_of_this_sim_cat``
            to obtain how many items to keep.
        max_candidates: upper bound on how many candidates (found in
            ``self.dim_items_index``) are scored before the scan stops.
            Defaults to 20000, the previously hard-coded limit.

    Returns:
        A list of ``(item_id, similarity)`` tuples. When more items were
        scored than the strategy allows, the list is sorted by descending
        similarity and truncated to that limit; otherwise it is returned
        unsorted, in scan order.
    """
    target_title = target_item[1]
    scored = 0
    rec_items = []

    for sim_id in sim_item_ids:
        # -1 is the "not found" sentinel; ids missing from the index are
        # skipped and do not count toward the cap.
        sim_item = self.dim_items_index.get(sim_id, -1)
        if sim_item == -1:
            continue

        scored += 1
        if scored > max_candidates:
            break

        sim_title = sim_item[1]
        timer = Timer()
        val = self.__cal_title_sim(target_title, sim_title)
        write_log(msg="__cal_title_sim cost time:%f, target_title size:%d, sim_title size:%d"
                  % (timer.get_diff(), len(target_title), len(sim_title)))
        rec_items.append((sim_id, val))

    num_to_get = self.strategy.num_to_return_of_this_sim_cat(idx)
    if len(rec_items) > num_to_get:
        # key/reverse replaces the cmp-style comparator, which Python 3 no
        # longer accepts; the sort is stable, so ties keep their scan order.
        rec_items.sort(key=lambda pair: pair[1], reverse=True)
        rec_items = rec_items[0: num_to_get]
    return rec_items