def __init__(self, cat1_en_name, is_cat2_brand_reg=True, is_cat1_brand_reg=True):
    """Load the brand recall tables for one first-level category.

    :param cat1_en_name: category name handed to ``PddCat3BrandRegFileTool``
        to resolve the recall and rule file paths.
    :param is_cat2_brand_reg: when true, the level-2 recall file is loaded;
        otherwise the three level-2 tables are left as empty dicts.
    :param is_cat1_brand_reg: when true, the level-1 recall file is loaded;
        otherwise the three level-1 tables are left as empty dicts.
    :raises Exception: if the level-3 or level-2 recall file does not exist.
        Errors raised while loading propagate unchanged.
    """
    paths = PddCat3BrandRegFileTool(cat1_en_name)
    cat3_recall_path = paths.BRAND_CAT3_RECALL_FILE
    cat2_recall_path = paths.BRAND_CAT2_RECALL_FILE
    cat1_recall_path = paths.BRAND_CAT1_RECALL_FILE

    # Only the level-3 and level-2 files are checked up front; the level-1
    # file is not checked here.
    for required_path in (cat3_recall_path, cat2_recall_path):
        if not os.path.exists(required_path):
            raise Exception("%s does not exists!" % required_path)

    rule_path = paths.RULE_BRAND

    self.cat1_en_name = cat1_en_name
    self.is_cat2_brand_reg = is_cat2_brand_reg
    self.is_cat1_brand_reg = is_cat1_brand_reg

    (self.cat3_ori_brandId_name_dict,
     self.cat3_ext_brandId_name_dict,
     self.cat3_to_brandId_dict) = self._brand_recall_info_loading(
        cat3_recall_path, cat_level=3)

    if self.is_cat2_brand_reg:
        (self.cat2_ori_brandId_name_dict,
         self.cat2_ext_brandId_name_dict,
         self.cat2_to_brandId_dict) = self._brand_recall_info_loading(
            cat2_recall_path, cat_level=2)
    else:
        self.cat2_ori_brandId_name_dict = {}
        self.cat2_ext_brandId_name_dict = {}
        self.cat2_to_brandId_dict = {}

    if self.is_cat1_brand_reg:
        (self.cat1_ori_brandId_name_dict,
         self.cat1_ext_brandId_name_dict,
         self.cat1_to_brandId_dict) = self._brand_recall_info_loading(
            cat1_recall_path, cat_level=1)
    else:
        self.cat1_ori_brandId_name_dict = {}
        self.cat1_ext_brandId_name_dict = {}
        self.cat1_to_brandId_dict = {}

    self.brand_rule_obj = BrandRefRuleOpt(rule_path)
def __init__(self, standard_brand_file, del_brand_file=None,
             exchange_brand_file=None, rule_brand_file=None):
    """Load the standard brand tables and the manual rule set.

    :param standard_brand_file: path of the standard brand file; must exist.
    :param del_brand_file: passed through to ``BrandInfoLoading``.
    :param exchange_brand_file: passed through to ``BrandInfoLoading``.
    :param rule_brand_file: passed to ``BrandRefRuleOpt`` together with the
        loaded ``idx_ori_brand_dict``.
    :raises Exception: if ``standard_brand_file`` does not exist. Errors
        raised while loading propagate unchanged.
    """
    if not os.path.exists(standard_brand_file):
        raise Exception("%s does not exists!" % standard_brand_file)

    loader = BrandInfoLoading(standard_brand_file, del_brand_file,
                              exchange_brand_file)
    self.brand_loading_obj = loader

    # Read the file that already has brand expansion applied.
    loaded_tables = loader.brand_info_loading()
    (self.brand_idx_dict,
     self.idx_ori_brand_dict,
     self.name_ori_brand_dict,
     self.brand_cat1_dict,
     self.cat1_brand_dict,
     self.cat1_clean_brand_dict,
     self.brand_gmv_dict,
     self.cat1_dict) = loaded_tables

    self.brand_rule_obj = BrandRefRuleOpt(rule_brand_file,
                                          self.idx_ori_brand_dict)
class BrandRegTool(object):
    """Recognise the standard brand of a product record by dictionary lookup.

    The tables loaded in ``__init__`` are used as follows in this class:

    * ``brand_idx_dict``: ``"<brand>|<flag>"`` key -> list of brand ids
      (flag ``"0"`` takes the English word-boundary path).
    * ``name_ori_brand_dict``: brand id -> original brand name.
    * ``brand_cat1_dict``: brand id -> list of first-level category ids.
    * ``brand_gmv_dict``: brand id -> GMV value used for tie-breaking.
    * ``cat1_dict``: first-level category id -> category name.
    """

    # Only lower-case ASCII letters count as "word" characters when checking
    # whether an English brand sits inside a longer word.
    _ASCII_LOWER = frozenset("abcdefghijklmnopqrstuvwxyz")
    # Result used when a record cannot be processed at all.
    _EMPTY_RESULT = (None, None, None, None, None, None, None)
    # Runtime message: "no standard brand matched".
    _NO_MATCH_RESULT = (None, None, "没有匹配到标准品牌", None, None, None, None)

    def __init__(self, standard_brand_file, del_brand_file=None,
                 exchange_brand_file=None, rule_brand_file=None):
        """Load the brand tables and, optionally, the manual rule set.

        :param standard_brand_file: path of the standard brand file; must exist.
        :param del_brand_file: passed through to ``BrandInfoLoading``.
        :param exchange_brand_file: passed through to ``BrandInfoLoading``.
        :param rule_brand_file: rule file for ``BrandRefRuleOpt``. When it is
            ``None`` no rule object is built and ``brand_rule_obj`` is ``None``;
            ``rule_opt`` and ``brand_recognition`` call methods on that object
            unconditionally, so they need a rule file.
        :raises Exception: if ``standard_brand_file`` does not exist.
        """
        if not os.path.exists(standard_brand_file):
            raise Exception("%s does not exists!" % standard_brand_file)

        self.brand_loading_obj = BrandInfoLoading(
            standard_brand_file, del_brand_file, exchange_brand_file)
        # Read the file that already has brand expansion applied.
        (self.brand_idx_dict,
         self.idx_ori_brand_dict,
         self.name_ori_brand_dict,
         self.brand_cat1_dict,
         self.cat1_brand_dict,
         self.cat1_clean_brand_dict,
         self.brand_gmv_dict,
         self.cat1_dict) = self.brand_loading_obj.brand_info_loading()

        if rule_brand_file is not None:
            self.brand_rule_obj = BrandRefRuleOpt(rule_brand_file,
                                                  self.idx_ori_brand_dict)
        else:
            self.brand_rule_obj = None

    def english_brand_recognition(self, standard_brand_name, s_name):
        """Return the brand when it occurs in ``s_name`` outside a longer word.

        ``s_name`` is split on the brand and every occurrence is inspected.
        An occurrence is accepted when the text directly before or after it
        is empty, or when neither adjacent character is a lower-case ASCII
        letter.

        :param standard_brand_name: brand text to look for.
        :param s_name: product name to search in.
        :return: ``standard_brand_name`` for the first accepted occurrence,
            otherwise ``None``.
        """
        if standard_brand_name not in s_name:
            return None
        pieces = s_name.split(standard_brand_name)
        for before, after in zip(pieces, pieces[1:]):
            if before == "" or after == "":
                return standard_brand_name
            if (before[-1] not in self._ASCII_LOWER
                    and after[0] not in self._ASCII_LOWER):
                return standard_brand_name
        return None

    def getting_high_gmv_brand(self, same_cat1_bid_lst):
        """Return ``(brand_name, brand_id)`` of the id with the highest GMV.

        Ties keep the id that appears first in ``same_cat1_bid_lst`` because
        the sort is stable.

        :raises IndexError: if ``same_cat1_bid_lst`` is empty.
        :raises KeyError: if an id is missing from the GMV or name table.
        """
        ranked = sorted(
            ((self.brand_gmv_dict[bid], self.name_ori_brand_dict[bid], bid)
             for bid in same_cat1_bid_lst),
            key=lambda item: item[0],
            reverse=True,
        )
        return ranked[0][1], ranked[0][2]

    def _ids_and_names(self, clean_brand_lst, clean_brand_id_lst):
        """Expand brand keys to ids and map candidate ids to original names."""
        candidate_ids = []
        for brand_key in clean_brand_lst:
            # One brand key may map to several brand ids.
            candidate_ids += self.brand_idx_dict[brand_key]
        # Original brand names, used only inside the match_type text.
        names = [self.name_ori_brand_dict[bid] for bid in clean_brand_id_lst]
        return candidate_ids, names

    def same_cat1_strategy(self, cat1_id, clean_brand_lst, clean_brand_id_lst):
        """Pick a brand whose categories contain ``cat1_id``.

        :param cat1_id: category id of the product.
        :param clean_brand_lst: brand keys to choose among.
        :param clean_brand_id_lst: all candidate ids (for the message only).
        :return: ``(brand_id, brand_name, match_type)``. All three are ``None``
            when no id matches the category. With one matching id that id is
            chosen; with several, the one with the highest GMV.
        """
        candidate_ids, names = self._ids_and_names(clean_brand_lst,
                                                   clean_brand_id_lst)
        in_cat1 = [bid for bid in candidate_ids
                   if cat1_id in self.brand_cat1_dict[bid]]
        if not in_cat1:
            return None, None, None

        if len(in_cat1) == 1:
            pre_brand_id = in_cat1[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            if pre_brand is None:
                # A None name leaves the match undecided, as before.
                return pre_brand_id, pre_brand, None
            # Message: same brand name maps to several brands; the one in the
            # same first-level category was chosen.
            match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的品牌" % \
                         (clean_brand_lst[0], "|".join(names))
            return pre_brand_id, pre_brand, match_type

        # Several ids share the category: take the highest-GMV one.
        pre_brand, pre_brand_id = self.getting_high_gmv_brand(in_cat1)
        match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的高GMV品牌" % \
                     (clean_brand_lst[0], "|".join(names))
        return pre_brand_id, pre_brand, match_type

    def same_length_strategy(self, clean_brand_lst, clean_brand_id_lst):
        """Pick a brand among keys of equal length, ignoring the category.

        :return: ``(brand_id, brand_name, match_type)``; the single id when
            the keys expand to exactly one id, otherwise the highest-GMV id.
        :raises IndexError: if the keys expand to no id at all.
        """
        candidate_ids, names = self._ids_and_names(clean_brand_lst,
                                                   clean_brand_id_lst)
        if len(candidate_ids) == 1:
            pre_brand_id = candidate_ids[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            # Message: several different brands matched; the longest was chosen.
            match_type = "1:匹配到多个不同品牌(%s),2:选择最长品牌: %s" % (
                "|".join(names), pre_brand)
        else:
            pre_brand, pre_brand_id = self.getting_high_gmv_brand(candidate_ids)
            match_type = "1:匹配到多个相同最长度品牌名(%s),2:选择相同一级类目的高GMV品牌: %s" % \
                         ("|".join(names), pre_brand)
        return pre_brand_id, pre_brand, match_type

    def same_cat1_gmv_strategy(self, cat1_id, clean_brand_lst,
                               clean_brand_id_lst):
        """Pick the only same-category brand, else the highest-GMV brand.

        :return: ``(brand_id, brand_name, match_type)``. When exactly one id
            has ``cat1_id`` among its categories that id is chosen; in every
            other case the highest-GMV id among all expanded ids is chosen.
        """
        candidate_ids, names = self._ids_and_names(clean_brand_lst,
                                                   clean_brand_id_lst)
        in_cat1 = [bid for bid in candidate_ids
                   if cat1_id in self.brand_cat1_dict[bid]]
        if len(in_cat1) == 1:
            pre_brand_id = in_cat1[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            if pre_brand is not None:
                match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的品牌" % \
                             (clean_brand_lst[0], "|".join(names))
                return pre_brand_id, pre_brand, match_type

        pre_brand, pre_brand_id = self.getting_high_gmv_brand(candidate_ids)
        # Message: same brand name maps to several brands; highest GMV chosen.
        match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择高GMV品牌" % \
                     (clean_brand_lst[0], "|".join(names))
        return pre_brand_id, pre_brand, match_type

    def brand_inclusion_relation_dealing(self, b_name_lst):
        """Drop brand keys whose brand text is contained in another key's text.

        Keys have the form ``"<brand>|<flag>"``; only the stripped, lower-cased
        brand part is compared. Keys with identical brand text never remove
        each other.

        :raises ValueError: if a key does not split into exactly two parts.
        """
        stems = []
        for key in b_name_lst:
            stem, _ = key.strip().lower().split("|")
            stems.append(stem)
        return [
            key for key, stem in zip(b_name_lst, stems)
            if not any(stem != other and stem in other for other in stems)
        ]

    def same_cat1_rule_func(self, b_id_lst, cur_cat1_id):
        """Keep only the brand ids that belong to the given category.

        :param b_id_lst: candidate brand ids.
        :param cur_cat1_id: category id of the product.
        :return: ids present in ``brand_cat1_dict`` whose category list
            contains ``cur_cat1_id``, in input order.
        """
        return [
            bid for bid in b_id_lst
            if bid in self.brand_cat1_dict
            and cur_cat1_id in self.brand_cat1_dict[bid]
        ]

    def error_word(self, s_name):
        """Report whether ``s_name`` contains a registered misleading word.

        The word table is currently empty, so this always returns ``False``.
        """
        # The original author left a folding-fan product title here as an
        # example; presumably a case the table was meant to cover (TODO confirm).
        error_word_dict = {}
        return any(len(s_name.split(word)) >= 2 for word in error_word_dict)

    def rule_opt(self, pre_brand_name_lst, pre_idx_2_brand_dict,
                 cur_p_name, cur_cat1_id, cur_cat1_name):
        """Apply the manual rules and the category filter to the candidates.

        :param pre_brand_name_lst: matched brand keys (``"<brand>|<flag>"``).
        :param pre_idx_2_brand_dict: brand id -> list of matched brand keys.
        :param cur_p_name: original product name.
        :param cur_cat1_id: category id of the product.
        :param cur_cat1_name: category name (unused here).
        :return: ``(brand_keys, brand_ids)``; both empty when no id survives.
        """
        rules = self.brand_rule_obj
        step1 = rules.co_appear_del_brand_func(pre_brand_name_lst)
        step2 = rules.phone_brand_not_appear_same_fun(step1)
        step3 = rules.brand_not_appear_same_fun(step2)

        surviving_ids = []
        for brand_key in step3:
            brand_text, _ = brand_key.split('|')
            verdict = rules.brand_word_rule_func(brand_text, cur_p_name)
            # Equality on purpose: only an explicit False rejects the brand.
            if verdict == False:  # noqa: E712
                continue
            surviving_ids += self.brand_idx_dict[brand_key]

        same_cat_ids = self.same_cat1_rule_func(list(set(surviving_ids)),
                                                cur_cat1_id)
        if len(same_cat_ids) == 0:
            return [], []

        brand_keys = []
        for bid in same_cat_ids:
            if bid in pre_idx_2_brand_dict:
                brand_keys += pre_idx_2_brand_dict[bid]
        return brand_keys, same_cat_ids

    def getting_cat1_info(self, brand_id):
        """Return the brand's category ids and names as comma-joined strings.

        :return: ``(category_ids, category_names)``.
        :raises KeyError: if the brand id or one of its categories is unknown.
        """
        cat1_ids = self.brand_cat1_dict[brand_id]
        cat1_names = [self.cat1_dict[cat1_id] for cat1_id in cat1_ids]
        return ",".join(cat1_ids), ",".join(cat1_names)

    @staticmethod
    def _group_brands_by_id(clean_brand_lst, clean_brand_2_idx):
        """Group matched brand keys under the brand id recorded for each key.

        The previous inline code tested membership in the key->id map instead
        of the id->keys map being built, so an id kept only its last key.
        Keys repeated in ``clean_brand_lst`` are listed once per id.
        """
        grouped = {}
        for brand_key in clean_brand_lst:
            keys_for_id = grouped.setdefault(clean_brand_2_idx[brand_key], [])
            if brand_key not in keys_for_id:
                keys_for_id.append(brand_key)
        return grouped

    @staticmethod
    def _group_by_length(brand_keys):
        """Return the keys grouped by string length, longest group first."""
        by_length = {}
        for brand_key in brand_keys:
            by_length.setdefault(len(brand_key), []).append(brand_key)
        return [by_length[size] for size in sorted(by_length, reverse=True)]

    def brand_recognition(self, line_str):
        """Recognise the brand of one ``\\001``-separated product record.

        The record must have four fields: product id, product name, category
        id and category name.

        brand_recognition is updated by gcw in 2020.09.12.

        :param line_str: raw record line.
        :return: ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``. All seven are ``None`` for
            an empty or malformed line, a name rejected by the no-brand rule,
            or a fixed product with no brand.
        """
        line = line_str.strip()
        if line == "":
            return self._EMPTY_RESULT
        fields = [part.strip() for part in line.split("\001")]
        if len(fields) != 4:
            return self._EMPTY_RESULT
        product_id, ori_product_name, cat2_id, cat2_name = fields
        # The record's category fields (named cat2 by the original author, who
        # marked this line "**import") are used as the category from here on.
        cat1_id, cat1_name = cat2_id, cat2_name

        rules = self.brand_rule_obj
        # The original comment states s_name is lower-cased.
        s_name = tool.s_name_dealing(ori_product_name)
        if rules.no_brand_word_func(s_name):
            return self._EMPTY_RESULT
        # Remove words configured for deletion.
        s_name = rules.product_name_del_word_func(s_name)

        fixed_flag, r_bid = rules.fixed_point_func(product_id)
        if fixed_flag:
            if r_bid is None:
                return self._EMPTY_RESULT
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(r_bid)
            return (r_bid, self.name_ori_brand_dict[r_bid], "",
                    brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)

        # Collect lower-cased brand keys found in the name, and the brand id
        # recorded for each key.
        clean_brand_2_idx = {}
        clean_brand_lst = []
        for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
            b_str = ori_b_str.lower()
            parts = b_str.split('|')
            if len(parts) != 2:
                continue
            b, is_eng = parts
            # NOTE(review): tool.is_own_eng(s_name) does not depend on the
            # loop variable; hoist it if the helper is side-effect free.
            if is_eng == "0" and tool.is_own_eng(s_name) and b in s_name:
                if self.english_brand_recognition(b, s_name) is None:
                    continue
            elif b not in s_name:
                continue
            for tmp_bid in same_brand_id_lst:
                clean_brand_lst.append(b_str)
                # NOTE(review): only the last id of a multi-id brand is kept
                # here; confirm whether every id should be recorded.
                clean_brand_2_idx[b_str] = tmp_bid

        # Remove brands contained in another recognised brand.
        clean_brand_lst = self.brand_inclusion_relation_dealing(clean_brand_lst)
        clean_idx_2_brand = self._group_brands_by_id(clean_brand_lst,
                                                     clean_brand_2_idx)
        if len(clean_brand_lst) == 0:
            return self._NO_MATCH_RESULT

        # Filtering by the hand-written rules.
        clean_brand_lst, clean_brand_id_lst = self.rule_opt(
            clean_brand_lst, clean_idx_2_brand,
            ori_product_name, cat1_id, cat1_name)
        if len(clean_brand_lst) == 0:
            return self._NO_MATCH_RESULT

        if len(clean_brand_id_lst) == 1:
            pre_brand_id = clean_brand_id_lst[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            # Runtime message: "matched a unique standard brand".
            match_type = "匹配到唯一标准品牌"
        else:
            # Try the longest brand keys first.
            for same_len_keys in self._group_by_length(clean_brand_lst):
                pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                    cat1_id, same_len_keys, clean_brand_id_lst)
                if pre_brand is None or pre_brand_id is None:
                    # No brand in the same category: fall back to choosing
                    # among the keys of this length.
                    pre_brand_id, pre_brand, match_type = \
                        self.same_length_strategy(same_len_keys,
                                                  clean_brand_id_lst)
                if pre_brand is not None and match_type is not None:
                    break

        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None
        return (pre_brand_id, pre_brand, match_type,
                brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)
class BrandRegTool(object):
    """Recognise the standard brand of a product record by dictionary lookup.

    The tables loaded in ``__init__`` are used as follows in this class:

    * ``brand_idx_dict``: ``"<brand>|<flag>"`` key -> list of brand ids
      (flag ``"0"`` takes the English word-boundary path).
    * ``name_ori_brand_dict``: brand id -> original brand name.
    * ``brand_cat1_dict``: brand id -> list of first-level category ids.
    * ``brand_gmv_dict``: brand id -> GMV value used for tie-breaking.
    * ``cat1_dict``: first-level category id -> category name.
    """

    # Only lower-case ASCII letters count as "word" characters when checking
    # whether an English brand sits inside a longer word.
    _ASCII_LOWER = frozenset("abcdefghijklmnopqrstuvwxyz")
    # Result used when a record cannot be processed at all.
    _EMPTY_RESULT = (None, None, None, None, None, None, None)
    # Runtime message: "no standard brand matched".
    _NO_MATCH_RESULT = (None, None, "没有匹配到标准品牌", None, None, None, None)
    # Runtime message: "matched a unique standard brand".
    _UNIQUE_MATCH = "匹配到唯一标准品牌"
    # Substrings removed from the product name before matching. Order
    # matters: longer phrases come before the shorter ones they contain.
    _DEL_WORDS = (
        "网红", '美的布谷', '珂丝兰珂润万物丝兰完美', '淘米家用',
        '家用', 'OLOMLB', 'TESIRIS', '文和友演出门票', '淘米',
        '小米推荐', '小米众筹', '小米家人', '内蒙', '牛肉干',
        '小米诺', '小米琦', '小米粒', '小米奇', '小米生态链',
        '小米有品同款', '小米同款',
    )

    def __init__(self, standard_brand_file, del_brand_file=None,
                 exchange_brand_file=None, rule_brand_file=None):
        """Load the brand tables and the manual rule set.

        :param standard_brand_file: path of the standard brand file; must exist.
        :param del_brand_file: passed through to ``BrandInfoLoading``.
        :param exchange_brand_file: passed through to ``BrandInfoLoading``.
        :param rule_brand_file: passed to ``BrandRefRuleOpt``.
        :raises Exception: if ``standard_brand_file`` does not exist.
        """
        if not os.path.exists(standard_brand_file):
            raise Exception("%s does not exists!" % standard_brand_file)

        self.brand_loading_obj = BrandInfoLoading(
            standard_brand_file, del_brand_file, exchange_brand_file)
        # Read the file that already has brand expansion applied.
        (self.brand_idx_dict,
         self.idx_ori_brand_dict,
         self.name_ori_brand_dict,
         self.brand_cat1_dict,
         self.cat1_brand_dict,
         self.cat1_clean_brand_dict,
         self.brand_gmv_dict,
         self.cat1_dict) = self.brand_loading_obj.brand_info_loading()
        self.brand_rule_obj = BrandRefRuleOpt(rule_brand_file,
                                              self.idx_ori_brand_dict)

    def product_name_del_word(self, ori_s_name):
        """Return ``ori_s_name`` with every configured noise phrase removed.

        Phrases are removed one after another in the order of ``_DEL_WORDS``.
        """
        s_name = ori_s_name
        for word in self._DEL_WORDS:
            s_name = s_name.replace(word, "")
        return s_name

    def english_brand_recognition(self, standard_brand_name, s_name):
        """Return the brand when it occurs in ``s_name`` outside a longer word.

        ``s_name`` is split on the brand and every occurrence is inspected.
        An occurrence is accepted when the text directly before or after it
        is empty, or when neither adjacent character is a lower-case ASCII
        letter.

        :param standard_brand_name: brand text to look for.
        :param s_name: product name to search in.
        :return: ``standard_brand_name`` for the first accepted occurrence,
            otherwise ``None``.
        """
        if standard_brand_name not in s_name:
            return None
        pieces = s_name.split(standard_brand_name)
        for before, after in zip(pieces, pieces[1:]):
            if before == "" or after == "":
                return standard_brand_name
            if (before[-1] not in self._ASCII_LOWER
                    and after[0] not in self._ASCII_LOWER):
                return standard_brand_name
        return None

    def brand_inclusion_relation_dealing(self, b_name_lst):
        """Drop brand keys whose brand text is contained in another key's text.

        Keys have the form ``"<brand>|<flag>"``; only the stripped, lower-cased
        brand part is compared. Keys with identical brand text never remove
        each other.

        :raises ValueError: if a key does not split into exactly two parts.
        """
        stems = []
        for key in b_name_lst:
            stem, _ = key.strip().lower().split("|")
            stems.append(stem)
        return [
            key for key, stem in zip(b_name_lst, stems)
            if not any(stem != other and stem in other for other in stems)
        ]

    def getting_high_gmv_brand(self, same_cat1_bid_lst):
        """Return ``(brand_name, brand_id)`` of the id with the highest GMV.

        Ties keep the id that appears first in ``same_cat1_bid_lst`` because
        the sort is stable.

        :raises IndexError: if ``same_cat1_bid_lst`` is empty.
        :raises KeyError: if an id is missing from the GMV or name table.
        """
        ranked = sorted(
            ((self.brand_gmv_dict[bid], self.name_ori_brand_dict[bid], bid)
             for bid in same_cat1_bid_lst),
            key=lambda item: item[0],
            reverse=True,
        )
        return ranked[0][1], ranked[0][2]

    def _ids_and_names(self, clean_brand_lst, clean_brand_id_lst):
        """Expand brand keys to ids and map candidate ids to original names."""
        candidate_ids = []
        for brand_key in clean_brand_lst:
            # One brand key may map to several brand ids.
            candidate_ids += self.brand_idx_dict[brand_key]
        # Original brand names, used only inside the match_type text.
        names = [self.name_ori_brand_dict[bid] for bid in clean_brand_id_lst]
        return candidate_ids, names

    def same_cat1_strategy(self, cat1_id, clean_brand_lst, clean_brand_id_lst):
        """Pick a brand whose categories contain ``cat1_id``.

        :param cat1_id: first-level category id of the product.
        :param clean_brand_lst: brand keys to choose among.
        :param clean_brand_id_lst: all candidate ids (for the message only).
        :return: ``(brand_id, brand_name, match_type)``. All three are ``None``
            when no id matches the category. With one matching id that id is
            chosen; with several, the one with the highest GMV.
        """
        candidate_ids, names = self._ids_and_names(clean_brand_lst,
                                                   clean_brand_id_lst)
        in_cat1 = [bid for bid in candidate_ids
                   if cat1_id in self.brand_cat1_dict[bid]]
        if not in_cat1:
            return None, None, None

        if len(in_cat1) == 1:
            pre_brand_id = in_cat1[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            if pre_brand is None:
                # A None name leaves the match undecided, as before.
                return pre_brand_id, pre_brand, None
            # Message: same brand name maps to several brands; the one in the
            # same first-level category was chosen.
            match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的品牌" % \
                         (clean_brand_lst[0], "|".join(names))
            return pre_brand_id, pre_brand, match_type

        # Same category and same brand name: take the highest-GMV brand.
        pre_brand, pre_brand_id = self.getting_high_gmv_brand(in_cat1)
        match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的高GMV品牌" % \
                     (clean_brand_lst[0], "|".join(names))
        return pre_brand_id, pre_brand, match_type

    def same_length_strategy(self, clean_brand_lst, clean_brand_id_lst):
        """Pick a brand among keys of equal length, ignoring the category.

        :return: ``(brand_id, brand_name, match_type)``; the single id when
            the keys expand to exactly one id, otherwise the highest-GMV id.
        :raises IndexError: if the keys expand to no id at all.
        """
        candidate_ids, names = self._ids_and_names(clean_brand_lst,
                                                   clean_brand_id_lst)
        if len(candidate_ids) == 1:
            pre_brand_id = candidate_ids[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            # Message: several different brands matched; the longest was chosen.
            match_type = "1:匹配到多个不同品牌(%s),2:选择最长品牌: %s" % (
                "|".join(names), pre_brand)
        else:
            pre_brand, pre_brand_id = self.getting_high_gmv_brand(candidate_ids)
            match_type = "1:匹配到多个相同最长度品牌名(%s),2:选择相同一级类目的高GMV品牌: %s" % \
                         ("|".join(names), pre_brand)
        return pre_brand_id, pre_brand, match_type

    def same_cat1_gmv_strategy(self, cat1_id, clean_brand_lst,
                               clean_brand_id_lst):
        """Pick the only same-category brand, else the highest-GMV brand.

        :return: ``(brand_id, brand_name, match_type)``. When exactly one id
            has ``cat1_id`` among its categories that id is chosen; in every
            other case the highest-GMV id among all expanded ids is chosen.
        """
        candidate_ids, names = self._ids_and_names(clean_brand_lst,
                                                   clean_brand_id_lst)
        in_cat1 = [bid for bid in candidate_ids
                   if cat1_id in self.brand_cat1_dict[bid]]
        if len(in_cat1) == 1:
            pre_brand_id = in_cat1[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            if pre_brand is not None:
                match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的品牌" % \
                             (clean_brand_lst[0], "|".join(names))
                return pre_brand_id, pre_brand, match_type

        pre_brand, pre_brand_id = self.getting_high_gmv_brand(candidate_ids)
        # Message: same brand name maps to several brands; highest GMV chosen.
        match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择高GMV品牌" % \
                     (clean_brand_lst[0], "|".join(names))
        return pre_brand_id, pre_brand, match_type

    def numerous_cat1_rule_func(self, b_id_lst):
        """Reject candidate sets that pile up in one first-level category.

        :param b_id_lst: candidate brand ids; ids missing from
            ``brand_cat1_dict`` are ignored.
        :return: ``False`` when some category is shared by three or more of
            the ids, otherwise ``True``. ``True`` is also returned when none
            of the ids has category data (this used to raise ``IndexError``).
        """
        counts = {}
        for bid in b_id_lst:
            if bid not in self.brand_cat1_dict:
                continue
            for cat1_id in self.brand_cat1_dict[bid]:
                counts[cat1_id] = counts.get(cat1_id, 0) + 1
        if not counts:
            return True
        return max(counts.values()) < 3

    def rule_opt(self, pre_brand_name_lst, cur_p_name, cur_cat1_id,
                 cur_cat1_name):
        """Apply the inclusion filter and the manual rules to the candidates.

        :param pre_brand_name_lst: matched brand keys (``"<brand>|<flag>"``).
        :param cur_p_name: original product name.
        :param cur_cat1_id: first-level category id of the product.
        :param cur_cat1_name: first-level category name of the product.
        :return: ``(brand_keys, brand_ids)``; both empty when nothing survives
            or when ``numerous_cat1_rule_func`` rejects the surviving ids.
        """
        rules = self.brand_rule_obj
        step0 = self.brand_inclusion_relation_dealing(pre_brand_name_lst)
        step1 = rules.co_appear_del_brand_func(step0)
        step2 = rules.apppint_co_appear_del_brand_func(step1)
        step3 = rules.phone_brand_not_appear_same_fun(step2, cur_cat1_id)
        step4 = rules.brand_not_appear_same_fun(step3)

        kept_keys = set()
        kept_ids = []
        for brand_key in step4:
            brand_text, _ = brand_key.split('|')
            # Correction step from the original comment: e.g. an "Apple phone
            # holder" is not an Apple phone, so "Apple" leaves the candidates.
            # Equality on purpose: only an explicit False rejects the brand.
            if rules.brand_word_rule_func(brand_text, cur_p_name) == False:  # noqa: E712
                continue
            if rules.brand_cat1_rule_func(brand_text, cur_cat1_name) == False:  # noqa: E712
                continue
            if rules.brand_cat1_fixed_rule_func(brand_text, cur_cat1_id) == False:  # noqa: E712
                continue
            kept_keys.add(brand_key)
            kept_ids += self.brand_idx_dict[brand_key]

        r_b_name_lst, r_b_id_lst = list(kept_keys), list(set(kept_ids))
        if len(r_b_id_lst) == 0:
            return [], []
        if not self.numerous_cat1_rule_func(r_b_id_lst):
            return [], []
        return r_b_name_lst, r_b_id_lst

    def getting_cat1_info(self, brand_id):
        """Return the brand's category ids and names as comma-joined strings.

        :return: ``(category_ids, category_names)``.
        :raises KeyError: if the brand id or one of its categories is unknown.
        """
        cat1_ids = self.brand_cat1_dict[brand_id]
        cat1_names = [self.cat1_dict[cat1_id] for cat1_id in cat1_ids]
        return ",".join(cat1_ids), ",".join(cat1_names)

    @staticmethod
    def _parse_line(line_str):
        """Split a ``\\001``-separated record into five stripped fields.

        :return: the field list, or ``None`` for an empty line or a line that
            does not have exactly five fields.
        """
        line = line_str.strip()
        if line == "":
            return None
        fields = [part.strip() for part in line.split("\001")]
        if len(fields) != 5:
            return None
        return fields

    def _match_candidates(self, s_name):
        """Return ``(brand_keys, brand_ids)`` of dictionary brands in ``s_name``.

        Both lists are de-duplicated through ``set``, so their order is not
        defined.
        """
        brand_keys = []
        brand_ids = []
        for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
            # Brands are compared in lower case.
            parts = ori_b_str.lower().split('|')
            if len(parts) != 2:
                continue
            b, is_eng = parts
            # NOTE(review): tool.is_own_eng(s_name) does not depend on the
            # loop variable; hoist it if the helper is side-effect free.
            if is_eng == "0" and tool.is_own_eng(s_name) and b in s_name:
                if self.english_brand_recognition(b, s_name) is None:
                    continue
            elif b not in s_name:
                continue
            brand_keys.append(ori_b_str)
            brand_ids += same_brand_id_lst
        return list(set(brand_keys)), list(set(brand_ids))

    def _filtered_candidates(self, ori_product_name, ori_brand_word,
                             cat1_id, cat1_name):
        """Match brands in the record and run them through ``rule_opt``.

        :return: ``(early_result, brand_keys, brand_ids)``. ``early_result``
            is a finished 7-tuple when the outcome is already decided (no
            match, or exactly one surviving brand id), otherwise ``None``.
        """
        # The original comment states s_name is lower-cased.
        s_name = tool.s_name_dealing(
            "%s %s" % (ori_product_name, ori_brand_word))
        s_name = self.product_name_del_word(s_name)

        brand_keys, brand_ids = self._match_candidates(s_name)
        if len(brand_keys) == 0:
            return self._NO_MATCH_RESULT, [], []

        # Filtering by the hand-written rules.
        brand_keys, brand_ids = self.rule_opt(
            brand_keys, ori_product_name, cat1_id, cat1_name)
        if len(brand_keys) == 0:
            return self._NO_MATCH_RESULT, [], []

        if len(brand_ids) == 1:
            b_id = brand_ids[0]
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
            unique = (b_id, self.name_ori_brand_dict[b_id], self._UNIQUE_MATCH,
                      brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)
            return unique, brand_keys, brand_ids
        return None, brand_keys, brand_ids

    @staticmethod
    def _group_by_length(brand_keys):
        """Return the keys grouped by string length, longest group first."""
        by_length = {}
        for brand_key in brand_keys:
            by_length.setdefault(len(brand_key), []).append(brand_key)
        return [by_length[size] for size in sorted(by_length, reverse=True)]

    def _pick_longest_first(self, cat1_id, brand_keys, brand_ids,
                            use_length_fallback):
        """Try ``same_cat1_strategy`` on each length group, longest first.

        With ``use_length_fallback`` a group that yields no same-category
        brand is retried with ``same_length_strategy``. The loop stops at the
        first group that produces both a brand name and a match type.
        """
        picked = (None, None, None)
        for same_len_keys in self._group_by_length(brand_keys):
            picked = self.same_cat1_strategy(cat1_id, same_len_keys, brand_ids)
            if use_length_fallback and (picked[1] is None or picked[0] is None):
                picked = self.same_length_strategy(same_len_keys, brand_ids)
            if picked[1] is not None and picked[2] is not None:
                break
        return picked

    def _with_cat1_info(self, picked, cat1_id, cat1_name):
        """Extend ``(brand_id, brand_name, match_type)`` to the 7-tuple result."""
        pre_brand_id, pre_brand, match_type = picked
        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None
        return (pre_brand_id, pre_brand, match_type,
                brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)

    def brand_recognition(self, line_str):
        """Recognise the brand of one ``\\001``-separated product record.

        The record must have five fields: product id, product name, brand
        word, first-level category id and first-level category name. A product
        id listed in ``brand_rule_obj.appoint_product_clean_dict`` is resolved
        directly from that table. Otherwise, among several candidates the
        longest brand keys are tried first: same-category brand, then any
        brand of that length by GMV.

        brand_recognition is updated by gcw in 2020.09.12.

        :param line_str: raw record line.
        :return: ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``; all ``None`` for an empty
            or malformed line.
        """
        fields = self._parse_line(line_str)
        if fields is None:
            return self._EMPTY_RESULT
        product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = fields

        appointed = self.brand_rule_obj.appoint_product_clean_dict
        if product_id in appointed:
            brand_id = appointed[product_id]
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(brand_id)
            return (brand_id, self.name_ori_brand_dict[brand_id],
                    self._UNIQUE_MATCH, brand_cat1_id, brand_cat1_name,
                    cat1_id, cat1_name)

        early_result, brand_keys, brand_ids = self._filtered_candidates(
            ori_product_name, ori_brand_word, cat1_id, cat1_name)
        if early_result is not None:
            return early_result

        if len(brand_keys) == 1:
            picked = self.same_cat1_gmv_strategy(cat1_id, brand_keys, brand_ids)
        else:
            picked = self._pick_longest_first(cat1_id, brand_keys, brand_ids,
                                              use_length_fallback=True)
        return self._with_cat1_info(picked, cat1_id, cat1_name)

    def brand_same_cat1_recognition(self, line_str):
        """Recognise the brand, preferring same-category brands of any length.

        Same input and output as ``brand_recognition`` except that the
        appointed-product table is not consulted, and a length group without a
        same-category brand is skipped instead of being resolved by GMV. Only
        when no group yields a brand does ``same_cat1_gmv_strategy`` decide
        over all candidates.

        brand_recognition is updated by gcw in 2020.09.12.
        """
        fields = self._parse_line(line_str)
        if fields is None:
            return self._EMPTY_RESULT
        _product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = fields

        early_result, brand_keys, brand_ids = self._filtered_candidates(
            ori_product_name, ori_brand_word, cat1_id, cat1_name)
        if early_result is not None:
            return early_result

        if len(brand_keys) == 1:
            picked = self.same_cat1_gmv_strategy(cat1_id, brand_keys, brand_ids)
        else:
            picked = self._pick_longest_first(cat1_id, brand_keys, brand_ids,
                                              use_length_fallback=False)
        # Last-resort handling when no length group produced a brand.
        if picked[1] is None and picked[2] is None:
            picked = self.same_cat1_gmv_strategy(cat1_id, brand_keys, brand_ids)
        return self._with_cat1_info(picked, cat1_id, cat1_name)
class BrandRegTool(object):
    """Recognise the standard brand of a product record by dictionary lookup.

    The tables loaded in ``__init__`` are used as follows in this class:

    * ``brand_idx_dict``: ``"<brand>|<flag>"`` key -> list of brand ids
      (flag ``"0"`` takes the English word-boundary path).
    * ``name_ori_brand_dict``: brand id -> original brand name.
    * ``brand_cat1_dict``: brand id -> list of first-level category ids.
    * ``brand_gmv_dict``: brand id -> GMV value used for tie-breaking.
    * ``cat1_dict``: first-level category id -> category name.
    """

    # Only lower-case ASCII letters count as "word" characters when checking
    # whether an English brand sits inside a longer word.
    _ASCII_LOWER = frozenset("abcdefghijklmnopqrstuvwxyz")
    # Result used when a record cannot be processed at all.
    _EMPTY_RESULT = (None, None, None, None, None, None, None)
    # Runtime message: "no standard brand matched".
    _NO_MATCH_RESULT = (None, None, "没有匹配到标准品牌", None, None, None, None)
    # Runtime message: "matched a unique standard brand".
    _UNIQUE_MATCH = "匹配到唯一标准品牌"

    def __init__(self, standard_brand_file, del_brand_file=None,
                 exchange_brand_file=None, rule_brand_file=None):
        """Load the brand tables and the manual rule set.

        :param standard_brand_file: path of the standard brand file; must exist.
        :param del_brand_file: passed through to ``BrandInfoLoading``.
        :param exchange_brand_file: passed through to ``BrandInfoLoading``.
        :param rule_brand_file: passed to ``BrandRefRuleOpt``.
        :raises Exception: if ``standard_brand_file`` does not exist.
        """
        if not os.path.exists(standard_brand_file):
            raise Exception("%s does not exists!" % standard_brand_file)

        self.brand_loading_obj = BrandInfoLoading(
            standard_brand_file, del_brand_file, exchange_brand_file)
        # Read the file that already has brand expansion applied.
        (self.brand_idx_dict,
         self.idx_ori_brand_dict,
         self.name_ori_brand_dict,
         self.brand_cat1_dict,
         self.cat1_brand_dict,
         self.cat1_clean_brand_dict,
         self.brand_gmv_dict,
         self.cat1_dict) = self.brand_loading_obj.brand_info_loading()
        self.brand_rule_obj = BrandRefRuleOpt(rule_brand_file,
                                              self.idx_ori_brand_dict)

    def english_brand_recognition(self, standard_brand_name, s_name):
        """Return the brand when it occurs in ``s_name`` outside a longer word.

        ``s_name`` is split on the brand and every occurrence is inspected.
        An occurrence is accepted when the text directly before or after it
        is empty, or when neither adjacent character is a lower-case ASCII
        letter.

        :param standard_brand_name: brand text to look for.
        :param s_name: product name to search in.
        :return: ``standard_brand_name`` for the first accepted occurrence,
            otherwise ``None``.
        """
        if standard_brand_name not in s_name:
            return None
        pieces = s_name.split(standard_brand_name)
        for before, after in zip(pieces, pieces[1:]):
            if before == "" or after == "":
                return standard_brand_name
            if (before[-1] not in self._ASCII_LOWER
                    and after[0] not in self._ASCII_LOWER):
                return standard_brand_name
        return None

    def getting_high_gmv_brand(self, same_cat1_bid_lst):
        """Return ``(brand_name, brand_id)`` of the id with the highest GMV.

        Ties keep the id that appears first in ``same_cat1_bid_lst`` because
        the sort is stable.

        :raises IndexError: if ``same_cat1_bid_lst`` is empty.
        :raises KeyError: if an id is missing from the GMV or name table.
        """
        ranked = sorted(
            ((self.brand_gmv_dict[bid], self.name_ori_brand_dict[bid], bid)
             for bid in same_cat1_bid_lst),
            key=lambda item: item[0],
            reverse=True,
        )
        return ranked[0][1], ranked[0][2]

    def _ids_and_names(self, clean_brand_lst, clean_brand_id_lst):
        """Expand brand keys to ids and map candidate ids to original names."""
        candidate_ids = []
        for brand_key in clean_brand_lst:
            # One brand key may map to several brand ids.
            candidate_ids += self.brand_idx_dict[brand_key]
        # Original brand names, used only inside the match_type text.
        names = [self.name_ori_brand_dict[bid] for bid in clean_brand_id_lst]
        return candidate_ids, names

    def same_cat1_strategy(self, cat1_id, clean_brand_lst, clean_brand_id_lst):
        """Pick a brand whose categories contain ``cat1_id``.

        :param cat1_id: first-level category id of the product.
        :param clean_brand_lst: brand keys to choose among.
        :param clean_brand_id_lst: all candidate ids (for the message only).
        :return: ``(brand_id, brand_name, match_type)``. All three are ``None``
            when no id matches the category. With one matching id that id is
            chosen; with several, the one with the highest GMV.
        """
        candidate_ids, names = self._ids_and_names(clean_brand_lst,
                                                   clean_brand_id_lst)
        in_cat1 = [bid for bid in candidate_ids
                   if cat1_id in self.brand_cat1_dict[bid]]
        if not in_cat1:
            return None, None, None

        if len(in_cat1) == 1:
            pre_brand_id = in_cat1[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            if pre_brand is None:
                # A None name leaves the match undecided, as before.
                return pre_brand_id, pre_brand, None
            # Message: same brand name maps to several brands; the one in the
            # same first-level category was chosen.
            match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的品牌" % \
                         (clean_brand_lst[0], "|".join(names))
            return pre_brand_id, pre_brand, match_type

        # Same category and same brand name: take the highest-GMV brand.
        pre_brand, pre_brand_id = self.getting_high_gmv_brand(in_cat1)
        match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的高GMV品牌" % \
                     (clean_brand_lst[0], "|".join(names))
        return pre_brand_id, pre_brand, match_type

    def same_length_strategy(self, clean_brand_lst, clean_brand_id_lst):
        """Pick a brand among keys of equal length, ignoring the category.

        :return: ``(brand_id, brand_name, match_type)``; the single id when
            the keys expand to exactly one id, otherwise the highest-GMV id.
        :raises IndexError: if the keys expand to no id at all.
        """
        candidate_ids, names = self._ids_and_names(clean_brand_lst,
                                                   clean_brand_id_lst)
        if len(candidate_ids) == 1:
            pre_brand_id = candidate_ids[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            # Message: several different brands matched; the longest was chosen.
            match_type = "1:匹配到多个不同品牌(%s),2:选择最长品牌: %s" % (
                "|".join(names), pre_brand)
        else:
            pre_brand, pre_brand_id = self.getting_high_gmv_brand(candidate_ids)
            match_type = "1:匹配到多个相同最长度品牌名(%s),2:选择相同一级类目的高GMV品牌: %s" % \
                         ("|".join(names), pre_brand)
        return pre_brand_id, pre_brand, match_type

    def same_cat1_gmv_strategy(self, cat1_id, clean_brand_lst,
                               clean_brand_id_lst):
        """Pick the only same-category brand, else the highest-GMV brand.

        :return: ``(brand_id, brand_name, match_type)``. When exactly one id
            has ``cat1_id`` among its categories that id is chosen; in every
            other case the highest-GMV id among all expanded ids is chosen.
        """
        candidate_ids, names = self._ids_and_names(clean_brand_lst,
                                                   clean_brand_id_lst)
        in_cat1 = [bid for bid in candidate_ids
                   if cat1_id in self.brand_cat1_dict[bid]]
        if len(in_cat1) == 1:
            pre_brand_id = in_cat1[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            if pre_brand is not None:
                match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的品牌" % \
                             (clean_brand_lst[0], "|".join(names))
                return pre_brand_id, pre_brand, match_type

        pre_brand, pre_brand_id = self.getting_high_gmv_brand(candidate_ids)
        # Message: same brand name maps to several brands; highest GMV chosen.
        match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择高GMV品牌" % \
                     (clean_brand_lst[0], "|".join(names))
        return pre_brand_id, pre_brand, match_type

    def rule_opt(self, pre_brand_name_lst, cur_p_name, cur_cat1_id,
                 cur_cat1_name):
        """Apply the manual rules to the candidate brand keys.

        :param pre_brand_name_lst: matched brand keys (``"<brand>|<flag>"``).
        :param cur_p_name: original product name.
        :param cur_cat1_id: first-level category id of the product.
        :param cur_cat1_name: first-level category name of the product.
        :return: ``(brand_keys, brand_ids)`` that passed every rule; both are
            de-duplicated through ``set``, so their order is not defined.
        """
        rules = self.brand_rule_obj
        step1 = rules.phone_brand_not_appear_same_fun(pre_brand_name_lst,
                                                      cur_cat1_id)
        step2 = rules.brand_not_appear_same_fun(step1)

        kept_keys = set()
        kept_ids = []
        for brand_key in step2:
            brand_text, _ = brand_key.split('|')
            # Correction step from the original comment: e.g. an "Apple phone
            # holder" is not an Apple phone, so "Apple" leaves the candidates.
            # Equality on purpose: only an explicit False rejects the brand.
            if rules.brand_word_rule_func(brand_text, cur_p_name) == False:  # noqa: E712
                continue
            if rules.brand_cat1_rule_func(brand_text, cur_cat1_name) == False:  # noqa: E712
                continue
            if rules.brand_cat1_fixed_rule_func(brand_text, cur_cat1_id) == False:  # noqa: E712
                continue
            kept_keys.add(brand_key)
            kept_ids += self.brand_idx_dict[brand_key]
        return list(kept_keys), list(set(kept_ids))

    def getting_cat1_info(self, brand_id):
        """Return the brand's category ids and names as comma-joined strings.

        :return: ``(category_ids, category_names)``.
        :raises KeyError: if the brand id or one of its categories is unknown.
        """
        cat1_ids = self.brand_cat1_dict[brand_id]
        cat1_names = [self.cat1_dict[cat1_id] for cat1_id in cat1_ids]
        return ",".join(cat1_ids), ",".join(cat1_names)

    @staticmethod
    def _parse_line(line_str):
        """Split a ``\\001``-separated record into five stripped fields.

        :return: the field list, or ``None`` for an empty line or a line that
            does not have exactly five fields.
        """
        line = line_str.strip()
        if line == "":
            return None
        fields = [part.strip() for part in line.split("\001")]
        if len(fields) != 5:
            return None
        return fields

    def _match_candidates(self, s_name):
        """Return ``(brand_keys, brand_ids)`` of dictionary brands in ``s_name``.

        Both lists are de-duplicated through ``set``, so their order is not
        defined.
        """
        brand_keys = []
        brand_ids = []
        for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
            # Brands are compared in lower case.
            parts = ori_b_str.lower().split('|')
            if len(parts) != 2:
                continue
            b, is_eng = parts
            # NOTE(review): tool.is_own_eng(s_name) does not depend on the
            # loop variable; hoist it if the helper is side-effect free.
            if is_eng == "0" and tool.is_own_eng(s_name) and b in s_name:
                if self.english_brand_recognition(b, s_name) is None:
                    continue
            elif b not in s_name:
                continue
            brand_keys.append(ori_b_str)
            brand_ids += same_brand_id_lst
        return list(set(brand_keys)), list(set(brand_ids))

    def _filtered_candidates(self, ori_product_name, ori_brand_word,
                             cat1_id, cat1_name):
        """Match brands in the record and run them through ``rule_opt``.

        :return: ``(early_result, brand_keys, brand_ids)``. ``early_result``
            is a finished 7-tuple when the outcome is already decided (no
            match, or exactly one surviving brand id), otherwise ``None``.
        """
        # The original comment states s_name is lower-cased.
        s_name = tool.s_name_dealing(
            "%s %s" % (ori_product_name, ori_brand_word))

        brand_keys, brand_ids = self._match_candidates(s_name)
        if len(brand_keys) == 0:
            return self._NO_MATCH_RESULT, [], []

        # Filtering by the hand-written rules.
        brand_keys, brand_ids = self.rule_opt(
            brand_keys, ori_product_name, cat1_id, cat1_name)
        if len(brand_keys) == 0:
            return self._NO_MATCH_RESULT, [], []

        if len(brand_ids) == 1:
            b_id = brand_ids[0]
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
            unique = (b_id, self.name_ori_brand_dict[b_id], self._UNIQUE_MATCH,
                      brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)
            return unique, brand_keys, brand_ids
        return None, brand_keys, brand_ids

    @staticmethod
    def _group_by_length(brand_keys):
        """Return the keys grouped by string length, longest group first."""
        by_length = {}
        for brand_key in brand_keys:
            by_length.setdefault(len(brand_key), []).append(brand_key)
        return [by_length[size] for size in sorted(by_length, reverse=True)]

    def _pick_longest_first(self, cat1_id, brand_keys, brand_ids,
                            use_length_fallback):
        """Try ``same_cat1_strategy`` on each length group, longest first.

        With ``use_length_fallback`` a group that yields no same-category
        brand is retried with ``same_length_strategy``. The loop stops at the
        first group that produces both a brand name and a match type.
        """
        picked = (None, None, None)
        for same_len_keys in self._group_by_length(brand_keys):
            picked = self.same_cat1_strategy(cat1_id, same_len_keys, brand_ids)
            if use_length_fallback and (picked[1] is None or picked[0] is None):
                picked = self.same_length_strategy(same_len_keys, brand_ids)
            if picked[1] is not None and picked[2] is not None:
                break
        return picked

    def _with_cat1_info(self, picked, cat1_id, cat1_name):
        """Extend ``(brand_id, brand_name, match_type)`` to the 7-tuple result."""
        pre_brand_id, pre_brand, match_type = picked
        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None
        return (pre_brand_id, pre_brand, match_type,
                brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)

    def brand_recognition(self, line_str):
        """Recognise the brand of one ``\\001``-separated product record.

        The record must have five fields: product id, product name, brand
        word, first-level category id and first-level category name. Among
        several candidates the longest brand keys are tried first:
        same-category brand, then any brand of that length by GMV.

        brand_recognition is updated by gcw in 2020.09.12.

        :param line_str: raw record line.
        :return: ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``; all ``None`` for an empty
            or malformed line.
        """
        fields = self._parse_line(line_str)
        if fields is None:
            return self._EMPTY_RESULT
        _product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = fields

        early_result, brand_keys, brand_ids = self._filtered_candidates(
            ori_product_name, ori_brand_word, cat1_id, cat1_name)
        if early_result is not None:
            return early_result

        if len(brand_keys) == 1:
            picked = self.same_cat1_gmv_strategy(cat1_id, brand_keys, brand_ids)
        else:
            picked = self._pick_longest_first(cat1_id, brand_keys, brand_ids,
                                              use_length_fallback=True)
        return self._with_cat1_info(picked, cat1_id, cat1_name)

    def brand_same_cat1_recognition(self, line_str):
        """Recognise the brand, preferring same-category brands of any length.

        Same input and output as ``brand_recognition`` except that a length
        group without a same-category brand is skipped instead of being
        resolved by GMV. Only when no group yields a brand does
        ``same_cat1_gmv_strategy`` decide over all candidates.

        brand_recognition is updated by gcw in 2020.09.12.
        """
        fields = self._parse_line(line_str)
        if fields is None:
            return self._EMPTY_RESULT
        _product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = fields

        early_result, brand_keys, brand_ids = self._filtered_candidates(
            ori_product_name, ori_brand_word, cat1_id, cat1_name)
        if early_result is not None:
            return early_result

        if len(brand_keys) == 1:
            picked = self.same_cat1_gmv_strategy(cat1_id, brand_keys, brand_ids)
        else:
            picked = self._pick_longest_first(cat1_id, brand_keys, brand_ids,
                                              use_length_fallback=False)
        # Last-resort handling when no length group produced a brand.
        if picked[1] is None and picked[2] is None:
            picked = self.same_cat1_gmv_strategy(cat1_id, brand_keys, brand_ids)
        return self._with_cat1_info(picked, cat1_id, cat1_name)
class BrandRegTool(object):
    """Recognise the standard brand mentioned in a product record.

    A record is a ``\\001``-separated line with five fields:
    product id, product name, brand word, level-1 category id and
    level-1 category name.

    The lookup tables are produced by ``BrandInfoLoading.brand_info_loading()``;
    the way they are used in this class shows the following shapes:

    * ``brand_idx_dict``: brand key ``"<name>|<flag>"`` -> list of brand ids.
    * ``name_ori_brand_dict``: brand id -> brand name returned to the caller.
    * ``brand_cat1_dict``: brand id -> sequence of level-1 category ids.
    * ``cat1_dict``: level-1 category id -> category name.
    * ``brand_gmv_dict``: brand id -> GMV value used for ranking.
    * ``cat1_clean_brand_dict``: level-1 category id -> list of brand keys.

    Every public recognition method returns a 7-tuple
    ``(brand_id, brand_name, match_type, brand_cat1_id, brand_cat1_name,
    cat1_id, cat1_name)``.
    """

    # Phrases removed from the product name before matching.  They are
    # applied one after another in this order, so a longer phrase listed
    # before a shorter one it contains is removed first.
    _DEL_WORDS = (
        "网红", '美的布谷', '珂丝兰珂润万物丝兰完美', '淘米家用',
        '家用', 'OLOMLB', 'TESIRIS', '文和友演出门票', '淘米',
        '小米推荐', '小米众筹', '小米家人', '内蒙', '牛肉干',
        '小米诺', '小米琦', '小米粒', '小米奇', '小米生态链',
        '小米有品同款', '小米同款', '正善牛肉哥', '东北大板栗',
        '黄小米', '大红印', '森马电商', '老板说了', '老板精选',
    )

    # Characters that count as "part of an English word" around a match.
    _ASCII_LOWER = frozenset("abcdefghijklmnopqrstuvwxyz")

    # Result for a blank or malformed record.
    _NO_RESULT = (None, None, None, None, None, None, None)

    # Result when no standard brand survives matching / rule filtering.
    _NO_BRAND = (None, None, "没有匹配到标准品牌", None, None, None, None)

    def __init__(self, standard_brand_file, del_brand_file=None,
                 exchange_brand_file=None, rule_brand_file=None):
        """Load the brand tables and the rule object.

        :param standard_brand_file: path of the standard brand file; must exist.
        :param del_brand_file: forwarded unchanged to ``BrandInfoLoading``.
        :param exchange_brand_file: forwarded unchanged to ``BrandInfoLoading``.
        :param rule_brand_file: forwarded unchanged to ``BrandRefRuleOpt``.
        :raises Exception: if ``standard_brand_file`` does not exist.
        """
        if not os.path.exists(standard_brand_file):
            raise Exception("%s does not exists!" % standard_brand_file)

        self.brand_loading_obj = BrandInfoLoading(
            standard_brand_file, del_brand_file, exchange_brand_file)
        # Read the file that already contains the extended brand names.
        (self.brand_idx_dict, self.idx_ori_brand_dict,
         self.name_ori_brand_dict, self.brand_cat1_dict,
         self.cat1_brand_dict, self.cat1_clean_brand_dict,
         self.brand_gmv_dict,
         self.cat1_dict) = self.brand_loading_obj.brand_info_loading()
        self.brand_rule_obj = BrandRefRuleOpt(rule_brand_file,
                                              self.idx_ori_brand_dict)

    # ------------------------------------------------------------------
    # small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_line(line_str):
        """Split a record into its five stripped fields, or return None.

        None is returned for a blank line and for a line that does not
        contain exactly five ``\\001``-separated fields.
        """
        line = line_str.strip()
        if line == "":
            return None
        fields = line.split("\001")
        if len(fields) != 5:
            return None
        return [field.strip() for field in fields]

    def _brand_matches(self, brand_key, s_name, s_name_has_eng):
        """Tell whether the brand behind ``brand_key`` occurs in ``s_name``.

        ``brand_key`` is lower-cased and split on ``'|'``; keys that do not
        yield exactly two parts never match.  When the flag part is ``"0"``
        and ``s_name_has_eng`` is truthy, the occurrence must additionally
        pass :meth:`english_brand_recognition`; otherwise a plain substring
        hit is enough.
        """
        parts = brand_key.lower().split('|')
        if len(parts) != 2:
            return False
        brand, flag = parts
        if brand not in s_name:
            return False
        if flag == "0" and s_name_has_eng:
            return self.english_brand_recognition(brand, s_name) is not None
        return True

    @staticmethod
    def _group_names_by_id(brand_names, name_2_id):
        """Group brand keys by the brand id recorded for them.

        :param brand_names: brand keys (may contain repeats).
        :param name_2_id: brand key -> brand id.
        :return: dict brand id -> list of distinct brand keys, in first-seen
            order.
        """
        grouped = {}
        for name in brand_names:
            names = grouped.setdefault(name_2_id[name], [])
            if name not in names:
                names.append(name)
        return grouped

    def _select_brand(self, cat1_id, cat1_name, clean_brand_lst,
                      clean_brand_id_lst):
        """Pick the final brand from the candidates left after rule filtering.

        * no candidate name -> "no standard brand" result;
        * exactly one candidate id -> that brand;
        * exactly one candidate name (several ids) ->
          :meth:`same_cat1_gmv_strategy`;
        * otherwise candidates are bucketed by key length and the buckets are
          tried longest first with :meth:`same_cat1_strategy`, falling back to
          :meth:`same_length_strategy` for the same bucket.
        """
        if len(clean_brand_lst) == 0:
            return self._NO_BRAND

        if len(clean_brand_id_lst) == 1:
            b_id = clean_brand_id_lst[0]
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
            return (b_id, self.name_ori_brand_dict[b_id], "匹配到唯一标准品牌",
                    brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)

        pre_brand_id, pre_brand, match_type = None, None, None
        if len(clean_brand_lst) == 1:
            pre_brand_id, pre_brand, match_type = self.same_cat1_gmv_strategy(
                cat1_id, clean_brand_lst, clean_brand_id_lst)
        else:
            len_brand_dict = {}
            for name in clean_brand_lst:
                len_brand_dict.setdefault(len(name), []).append(name)
            for length in sorted(len_brand_dict, reverse=True):
                bucket = len_brand_dict[length]
                pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                    cat1_id, bucket, clean_brand_id_lst)
                if pre_brand is None or pre_brand_id is None:
                    pre_brand_id, pre_brand, match_type = \
                        self.same_length_strategy(bucket, clean_brand_id_lst)
                if pre_brand is not None and match_type is not None:
                    break

        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None
        return (pre_brand_id, pre_brand, match_type,
                brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)

    # ------------------------------------------------------------------
    # text helpers
    # ------------------------------------------------------------------
    def product_name_del_word(self, ori_s_name):
        """Return ``ori_s_name`` with every phrase of ``_DEL_WORDS`` removed."""
        s_name = ori_s_name
        for word in self._DEL_WORDS:
            s_name = s_name.replace(word, "")
        return s_name

    def english_brand_recognition(self, standard_brand_name, s_name):
        """Return ``standard_brand_name`` if it occurs in ``s_name`` as its own
        token, else None.

        An occurrence is accepted when the character right before it and the
        character right after it are both outside ``a``-``z``.  An occurrence
        with no text at all on one side (start or end of ``s_name``, or two
        occurrences back to back) is accepted without looking at the other
        side.

        An empty ``standard_brand_name`` yields None; without this guard
        ``str.split`` would raise ``ValueError`` on the empty separator.
        """
        if standard_brand_name == "" or standard_brand_name not in s_name:
            return None

        pieces = s_name.split(standard_brand_name)
        for idx in range(1, len(pieces)):
            before, after = pieces[idx - 1], pieces[idx]
            if before == "" or after == "":
                return standard_brand_name
            if before[-1] not in self._ASCII_LOWER and \
                    after[0] not in self._ASCII_LOWER:
                return standard_brand_name
        return None

    def brand_inclusion_relation_dealing(self, b_name_lst):
        """Drop every brand key whose name is a proper substring of another's.

        Names are compared stripped and lower-cased, without the ``|flag``
        suffix.  Keys with equal names are never dropped because of each
        other.  The relative order (and any repeats) of the kept keys is
        unchanged.

        :raises ValueError: if a key does not split into exactly two parts
            on ``'|'``.
        """
        keyed = []
        for key in b_name_lst:
            name, _ = key.strip().lower().split("|")
            keyed.append((key, name))

        contained = set()
        for key, name in keyed:
            for _, other in keyed:
                if name != other and name in other:
                    contained.add(key)
                    break
        return [key for key in b_name_lst if key not in contained]

    # ------------------------------------------------------------------
    # tie-breaking strategies
    # ------------------------------------------------------------------
    def getting_high_gmv_brand(self, same_cat1_bid_lst):
        """Return ``(brand_name, brand_id)`` of the id with the highest GMV.

        Ties keep the id that comes first in ``same_cat1_bid_lst``.

        :raises IndexError: if ``same_cat1_bid_lst`` is empty.
        """
        ranked = sorted(
            ((self.brand_gmv_dict[bid], self.name_ori_brand_dict[bid], bid)
             for bid in same_cat1_bid_lst),
            key=lambda item: item[0], reverse=True)
        return ranked[0][1], ranked[0][2]

    def same_cat1_strategy(self, cat1_id, clean_brand_lst, clean_brand_id_lst):
        """Choose among the ids of ``clean_brand_lst`` those listed under
        ``cat1_id``.

        One such id -> that brand; several -> the one with the highest GMV;
        none -> ``(None, None, None)``.

        :return: ``(brand_id, brand_name, match_type)``.
        """
        pre_brand_id, pre_brand, match_type = None, None, None

        same_id_lst = []
        for name in clean_brand_lst:
            same_id_lst += self.brand_idx_dict[name]
        tmp_name_lst = [self.name_ori_brand_dict[bid]
                        for bid in clean_brand_id_lst]

        same_cat1_bid_lst = []
        for bid in same_id_lst:
            if cat1_id in self.brand_cat1_dict[bid]:
                pre_brand = self.name_ori_brand_dict[bid]
                pre_brand_id = bid
                same_cat1_bid_lst.append(bid)

        if len(same_cat1_bid_lst) == 1 and pre_brand is not None:
            match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的品牌" % \
                         (clean_brand_lst[0], "|".join(tmp_name_lst))
        elif len(same_cat1_bid_lst) > 1:
            # Same level-1 category and same brand name: take the highest GMV.
            pre_brand, pre_brand_id = self.getting_high_gmv_brand(
                same_cat1_bid_lst)
            match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的高GMV品牌" % \
                         (clean_brand_lst[0], "|".join(tmp_name_lst))
        return pre_brand_id, pre_brand, match_type

    def same_length_strategy(self, clean_brand_lst, clean_brand_id_lst):
        """Choose among all ids of ``clean_brand_lst`` regardless of category.

        A single id is returned directly; otherwise the id with the highest
        GMV wins.

        :return: ``(brand_id, brand_name, match_type)``.
        :raises IndexError: if the keys map to no id at all.
        """
        same_id_lst = []
        for name in clean_brand_lst:
            same_id_lst += self.brand_idx_dict[name]
        tmp_name_lst = [self.name_ori_brand_dict[bid]
                        for bid in clean_brand_id_lst]

        if len(same_id_lst) == 1:
            pre_brand_id = same_id_lst[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            match_type = "1:匹配到多个不同品牌(%s),2:选择最长品牌: %s" % (
                "|".join(tmp_name_lst), pre_brand)
        else:
            pre_brand, pre_brand_id = self.getting_high_gmv_brand(same_id_lst)
            match_type = "1:匹配到多个相同最长度品牌名(%s),2:选择相同一级类目的高GMV品牌: %s" % \
                         ("|".join(tmp_name_lst), pre_brand)
        return pre_brand_id, pre_brand, match_type

    def same_cat1_gmv_strategy(self, cat1_id, clean_brand_lst,
                               clean_brand_id_lst):
        """Prefer the single id listed under ``cat1_id``; otherwise take the
        highest-GMV id among all ids of ``clean_brand_lst``.

        :return: ``(brand_id, brand_name, match_type)``.
        :raises IndexError: if the keys map to no id at all and no id matched
            the category.
        """
        pre_brand_id, pre_brand = None, None

        same_id_lst = []
        for name in clean_brand_lst:
            same_id_lst += self.brand_idx_dict[name]
        tmp_name_lst = [self.name_ori_brand_dict[bid]
                        for bid in clean_brand_id_lst]

        num = 0
        for bid in same_id_lst:
            if cat1_id in self.brand_cat1_dict[bid]:
                pre_brand = self.name_ori_brand_dict[bid]
                pre_brand_id = bid
                num += 1

        if num == 1 and pre_brand is not None:
            match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择相同一级类目的品牌" % \
                         (clean_brand_lst[0], "|".join(tmp_name_lst))
        else:
            pre_brand, pre_brand_id = self.getting_high_gmv_brand(same_id_lst)
            match_type = "1:匹配到相同品牌名(%s)多个不同品牌(%s),2:选择高GMV品牌" % \
                         (clean_brand_lst[0], "|".join(tmp_name_lst))
        return pre_brand_id, pre_brand, match_type

    # ------------------------------------------------------------------
    # rule filtering
    # ------------------------------------------------------------------
    def same_cat1_rule_func(self, b_id_lst, cur_cat1_id):
        """Keep only the brand ids listed under the given level-1 category.

        Ids missing from ``brand_cat1_dict`` are dropped.

        :param b_id_lst: candidate brand ids.
        :param cur_cat1_id: level-1 category id of the product.
        :return: filtered list, original order preserved.
        """
        return [bid for bid in b_id_lst
                if bid in self.brand_cat1_dict
                and cur_cat1_id in self.brand_cat1_dict[bid]]

    def rule_opt(self, pre_brand_name_lst, pre_idx_2_brand_dict,
                 cur_p_name, cur_cat1_id, cur_cat1_name):
        """Filter the matched brands with the hand-written rules.

        :param pre_brand_name_lst: matched brand keys (``"<name>|<flag>"``).
        :param pre_idx_2_brand_dict: brand id -> list of matched brand keys.
        :param cur_p_name: product name as found in the record.
        :param cur_cat1_id: level-1 category id of the product.
        :param cur_cat1_name: level-1 category name of the product.
        :return: ``(brand_keys, brand_ids)``; both empty when no id survives.
            ``brand_keys`` holds the keys that ``pre_idx_2_brand_dict`` lists
            for the surviving ids.
        """
        rule_obj = self.brand_rule_obj
        # NOTE(review): the three calls below are opaque from here; each takes
        # and returns a list of brand keys.
        stp2_clean_lst = rule_obj.co_appear_del_brand_func(pre_brand_name_lst)
        stp3_clean_lst = rule_obj.apppint_co_appear_del_brand_func(
            stp2_clean_lst)
        stp4_clean_lst = rule_obj.brand_not_appear_same_fun(stp3_clean_lst)

        r_id_lst = []
        for b_name in stp4_clean_lst:
            tmp_b_name, _ = b_name.split('|')
            # Correction step, e.g. a phone holder "for Apple" is not an
            # Apple product, so that brand is removed from the candidates.
            # A rule vetoes only by returning False (compared with ==, so a
            # None result does not veto).
            if rule_obj.brand_word_rule_func(
                    tmp_b_name, cur_p_name, cur_cat1_id) == False:
                continue
            if rule_obj.brand_cat1_rule_func(
                    tmp_b_name, cur_cat1_name) == False:
                continue
            if rule_obj.brand_cat1_fixed_rule_func(
                    tmp_b_name, cur_cat1_id) == False:
                continue
            r_id_lst += self.brand_idx_dict[b_name]

        r_b_id_lst = list(set(r_id_lst))
        if len(r_b_id_lst) == 0:
            return [], []

        # Manually forced brand for specific id combinations.  Only the ids
        # of its answer are used: the keys are rebuilt from the ids below.
        _, r_b_id_lst = self.manual_compellent_assign_brand_rule(
            [], r_b_id_lst)
        r_b_id_lst = self.same_cat1_rule_func(r_b_id_lst, cur_cat1_id)
        r_b_id_lst = rule_obj.phone_brand_not_appear_same_fun(
            r_b_id_lst, cur_cat1_id)
        if len(r_b_id_lst) == 0:
            return [], []

        r_b_name_lst = []
        for bid in r_b_id_lst:
            if bid in pre_idx_2_brand_dict:
                r_b_name_lst += pre_idx_2_brand_dict[bid]
        return r_b_name_lst, r_b_id_lst

    def getting_cat1_info(self, brand_id):
        """Return the brand's level-1 category ids and names, each joined
        with ``","``."""
        brand_cat1_lst = self.brand_cat1_dict[brand_id]
        brand_cat1_id = ",".join(brand_cat1_lst)
        brand_cat1_name = ",".join(
            [self.cat1_dict[cat1] for cat1 in brand_cat1_lst])
        return brand_cat1_id, brand_cat1_name

    def manual_compellent_assign_brand_rule(self, r_b_name_lst, r_b_id_lst):
        """Force the brand for a few hard-coded id combinations.

        * ids ``10266841``, ``12661143`` and ``10756319`` together (and
          nothing else) -> ``10266841`` (e.g. product 3428308113219031576);
        * ids ``10266841`` and ``10756319`` together (and nothing else)
          -> ``10266841`` (e.g. product 3429766112856943278);
        * ids ``10698337`` and ``10032446`` together (and nothing else)
          -> ``10032446`` (e.g. products 3429256598957224027,
          3432639012387570550).

        Any other input is returned unchanged.

        :return: ``(brand_keys, brand_ids)``.
        """
        ids = set(r_b_id_lst)
        count = len(r_b_id_lst)
        if count == 3 and {'10266841', '12661143', '10756319'} <= ids:
            return ['安热沙|0'], ['10266841']
        if count == 2 and {'10266841', '10756319'} <= ids:
            return ['安热沙|0'], ['10266841']
        if count == 2 and {'10698337', '10032446'} <= ids:
            return ['麦瑞克|0'], ['10032446']
        return r_b_name_lst, r_b_id_lst

    # ------------------------------------------------------------------
    # public recognition entry points
    # ------------------------------------------------------------------
    def brand_recognition(self, line_str):
        """Recognise the brand of one record against the whole brand table.

        Only the product name is searched; the record's brand word is
        ignored.

        :param line_str: ``\\001``-separated record with five fields.
        :return: 7-tuple described in the class docstring; seven ``None`` for
            a blank or malformed record.
        """
        fields = self._parse_line(line_str)
        if fields is None:
            return self._NO_RESULT
        _, ori_product_name, _, cat1_id, cat1_name = fields

        # s_name is handled in lower case.
        s_name = tool.s_name_dealing(ori_product_name)
        s_name = self.product_name_del_word(s_name)

        clean_brand_2_idx = {}   # brand key -> brand id
        clean_brand_lst = []

        # English phone brands (vivo, oppo, ...) come from the rule object.
        for epb_id, epb_name in self.brand_rule_obj.english_phone_rule_func(
                s_name, cat1_id):
            epb_name = epb_name + '|0'
            if epb_name not in self.brand_idx_dict:
                continue
            clean_brand_2_idx[epb_name] = epb_id
            clean_brand_lst.append(epb_name)

        # Scan every known brand key; a matched key is appended once per id
        # and remembers the last of its ids.
        s_name_has_eng = tool.is_own_eng(s_name)
        for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
            if not self._brand_matches(ori_b_str, s_name, s_name_has_eng):
                continue
            for tmp_bid in same_brand_id_lst:
                clean_brand_2_idx[ori_b_str] = tmp_bid
                clean_brand_lst.append(ori_b_str)

        # Remove brands that are contained in another recognised brand.
        clean_brand_lst = self.brand_inclusion_relation_dealing(
            clean_brand_lst)
        if len(clean_brand_lst) == 0:
            return self._NO_BRAND
        clean_idx_2_brand = self._group_names_by_id(
            clean_brand_lst, clean_brand_2_idx)

        # Hand-written rule filtering.
        clean_brand_lst, clean_brand_id_lst = self.rule_opt(
            clean_brand_lst, clean_idx_2_brand,
            ori_product_name, cat1_id, cat1_name)
        return self._select_brand(cat1_id, cat1_name,
                                  clean_brand_lst, clean_brand_id_lst)

    def brand_reg_opt(self, reg_brand_dict, line_str):
        """Recognise the brand of one record against ``reg_brand_dict``.

        The product name and the record's brand word are searched together.

        :param reg_brand_dict: brand key -> list of brand ids to search.
        :param line_str: ``\\001``-separated record with five fields.
        :return: 7-tuple described in the class docstring; seven ``None`` for
            a blank or malformed record.
        """
        fields = self._parse_line(line_str)
        if fields is None:
            return self._NO_RESULT
        _, ori_product_name, ori_brand_word, cat1_id, cat1_name = fields

        # s_name is handled in lower case.
        s_name = tool.s_name_dealing(
            "%s %s" % (ori_product_name, ori_brand_word))
        s_name = self.product_name_del_word(s_name)

        clean_idx_2_brand = {}   # brand id -> brand key (last match wins)

        # English phone brands (vivo, oppo, ...) come from the rule object.
        for epb_id, epb_name in self.brand_rule_obj.english_phone_rule_func(
                s_name, cat1_id):
            epb_name = epb_name + '|0'
            if epb_name not in self.brand_idx_dict:
                continue
            clean_idx_2_brand[epb_id] = epb_name

        s_name_has_eng = tool.is_own_eng(s_name)
        for ori_b_str, same_brand_id_lst in reg_brand_dict.items():
            if not self._brand_matches(ori_b_str, s_name, s_name_has_eng):
                continue
            for tmp_bid in same_brand_id_lst:
                clean_idx_2_brand[tmp_bid] = ori_b_str

        clean_brand_lst = list(set(clean_idx_2_brand.values()))
        if len(clean_brand_lst) == 0:
            return self._NO_BRAND

        # Hand-written rule filtering.  rule_opt expects the brand keys and
        # an id -> [keys] mapping.
        idx_2_brand_lst = {bid: [name]
                           for bid, name in clean_idx_2_brand.items()}
        clean_brand_lst, clean_brand_id_lst = self.rule_opt(
            clean_brand_lst, idx_2_brand_lst,
            ori_product_name, cat1_id, cat1_name)
        return self._select_brand(cat1_id, cat1_name,
                                  clean_brand_lst, clean_brand_id_lst)

    def brand_same_cat1_recognition(self, line_str):
        """Recognise the brand, trying the record's own category first.

        The brand keys listed for the record's level-1 category in
        ``cat1_clean_brand_dict`` are searched first; if that yields no brand
        the whole ``brand_idx_dict`` is searched.

        :param line_str: ``\\001``-separated record with five fields.
        :return: 7-tuple described in the class docstring; seven ``None`` for
            a blank or malformed record.
        """
        fields = self._parse_line(line_str)
        if fields is None:
            return self._NO_RESULT
        ori_cat1_id = fields[3]

        reg_brand_dict = {}
        for brand_key in self.cat1_clean_brand_dict.get(ori_cat1_id, []):
            if brand_key in self.brand_idx_dict:
                reg_brand_dict[brand_key] = self.brand_idx_dict[brand_key]

        result = self.brand_reg_opt(reg_brand_dict, line_str)
        if result[0] is None or result[1] is None:
            result = self.brand_reg_opt(self.brand_idx_dict, line_str)
        return result
class BrandRegTool(object):
    """Recognise a brand in a product name using per-category recall files.

    Brands are looked up in the level-3 category of the product first and,
    when nothing is found and the corresponding switch is on, in its
    level-2 and then level-1 category.

    For each level ``N`` in 3, 2, 1 the instance holds:

    * ``catN_ori_brandId_name_dict``: brand id -> original brand name;
    * ``catN_ext_brandId_name_dict``: brand id -> extended brand name, a
      ``'/'``-separated list of aliases;
    * ``catN_to_brandId_dict``: category name -> list of brand ids.
    """

    # Characters that count as "part of an English word" around a match.
    _ASCII_LOWER = frozenset("abcdefghijklmnopqrstuvwxyz")

    def __init__(self, cat1_en_name, is_cat2_brand_reg=True,
                 is_cat1_brand_reg=True):
        """Load the recall files of one level-1 category.

        :param cat1_en_name: English name of the level-1 category; selects
            the files through ``PddCat3BrandRegFileTool``.
        :param is_cat2_brand_reg: enable the level-2 fallback (and load its
            recall file).
        :param is_cat1_brand_reg: enable the level-1 fallback (and load its
            recall file).
        :raises Exception: if the level-3 or level-2 recall file is missing.
        """
        file_sys_obj = PddCat3BrandRegFileTool(cat1_en_name)
        brand_cat3_recall_file = file_sys_obj.BRAND_CAT3_RECALL_FILE
        brand_cat2_recall_file = file_sys_obj.BRAND_CAT2_RECALL_FILE
        brand_cat1_recall_file = file_sys_obj.BRAND_CAT1_RECALL_FILE
        if not os.path.exists(brand_cat3_recall_file):
            raise Exception("%s does not exists!" % brand_cat3_recall_file)
        if not os.path.exists(brand_cat2_recall_file):
            raise Exception("%s does not exists!" % brand_cat2_recall_file)
        rule_brand_file = file_sys_obj.RULE_BRAND

        self.cat1_en_name = cat1_en_name
        self.is_cat2_brand_reg = is_cat2_brand_reg
        self.is_cat1_brand_reg = is_cat1_brand_reg

        (self.cat3_ori_brandId_name_dict, self.cat3_ext_brandId_name_dict,
         self.cat3_to_brandId_dict) = self._brand_recall_info_loading(
            brand_cat3_recall_file, cat_level=3)

        if self.is_cat2_brand_reg:
            (self.cat2_ori_brandId_name_dict, self.cat2_ext_brandId_name_dict,
             self.cat2_to_brandId_dict) = self._brand_recall_info_loading(
                brand_cat2_recall_file, cat_level=2)
        else:
            (self.cat2_ori_brandId_name_dict, self.cat2_ext_brandId_name_dict,
             self.cat2_to_brandId_dict) = {}, {}, {}

        if self.is_cat1_brand_reg:
            (self.cat1_ori_brandId_name_dict, self.cat1_ext_brandId_name_dict,
             self.cat1_to_brandId_dict) = self._brand_recall_info_loading(
                brand_cat1_recall_file, cat_level=1)
        else:
            (self.cat1_ori_brandId_name_dict, self.cat1_ext_brandId_name_dict,
             self.cat1_to_brandId_dict) = {}, {}, {}

        self.brand_rule_obj = BrandRefRuleOpt(rule_brand_file)

    def _brand_recall_info_loading(self, brand_recall_file, cat_level=3):
        """Parse one recall file.

        Each useful line has six tab-separated columns: brand id, extended
        brand name, original brand name, level-1, level-2 and level-3
        category name.  Blank lines, lines starting with ``#`` and lines with
        a different column count are skipped.

        :param brand_recall_file: path of the UTF-8 recall file.
        :param cat_level: which category column (1, 2 or 3) keys the third
            result; with any other value that result stays empty.
        :return: ``(id -> original name, id -> extended name,
            category name -> [brand ids])``.  Brand ids are listed once per
            category, in file order.
        """
        ori_brandId_name_dict = {}   # brand_id: brand_name
        ext_brandId_name_dict = {}   # brand_id: ext_brand_name
        cat_2_brandId_dict = {}      # cat_name: [bid1, bid2, ...]
        seen = set()                 # (cat_name, brand_id) already recorded

        with open(brand_recall_file, "r", encoding="utf-8") as f1:
            for line in f1:
                line = line.strip()
                if line == "" or line.startswith("#"):
                    continue
                columns = line.split("\t")
                if len(columns) != 6:
                    continue
                (b_id, b_name_ext, b_name_ori,
                 cat1_name, cat2_name, cat3_name) = \
                    [column.strip() for column in columns]
                ori_brandId_name_dict[b_id] = b_name_ori
                ext_brandId_name_dict[b_id] = b_name_ext

                if cat_level == 3:
                    cat_key = cat3_name
                elif cat_level == 2:
                    cat_key = cat2_name
                elif cat_level == 1:
                    cat_key = cat1_name
                else:
                    continue

                if (cat_key, b_id) in seen:
                    continue
                seen.add((cat_key, b_id))
                cat_2_brandId_dict.setdefault(cat_key, []).append(b_id)

        return ori_brandId_name_dict, ext_brandId_name_dict, cat_2_brandId_dict

    def _ori_brand_name(self, brand_id):
        """Return the original name of ``brand_id``.

        The level-3 table is consulted first, then level-2, then level-1;
        ``''`` is returned when the id is in none of them.
        """
        for table in (self.cat3_ori_brandId_name_dict,
                      self.cat2_ori_brandId_name_dict,
                      self.cat1_ori_brandId_name_dict):
            if brand_id in table:
                return table[brand_id]
        return ''

    def english_brand_recognition(self, standard_brand_name, s_name):
        """Return the stripped ``standard_brand_name`` if it occurs in
        ``s_name`` as its own token, else None.

        An occurrence is accepted when the character right before it and the
        character right after it are both outside ``a``-``z``.  An occurrence
        with no text at all on one side (start or end of ``s_name``, or two
        occurrences back to back) is accepted without looking at the other
        side.  A blank ``standard_brand_name`` yields None.
        """
        standard_brand_name = standard_brand_name.strip()
        if standard_brand_name == "" or standard_brand_name not in s_name:
            return None

        pieces = s_name.split(standard_brand_name)
        for idx in range(1, len(pieces)):
            before, after = pieces[idx - 1], pieces[idx]
            if before == "" or after == "":
                return standard_brand_name
            if before[-1] not in self._ASCII_LOWER and \
                    after[0] not in self._ASCII_LOWER:
                return standard_brand_name
        return None

    def brand_inclusion_relation_dealing(self, b_name_lst):
        """Drop every brand key whose name is a proper substring of another's.

        Keys have the form ``"<name>|<flag>"``; names are compared stripped
        and lower-cased.  Keys with equal names are never dropped because of
        each other.  The relative order of the kept keys is unchanged.

        :raises ValueError: if a key does not split into exactly two parts
            on ``'|'``.
        """
        keyed = []
        for key in b_name_lst:
            name, _ = key.strip().lower().split("|")
            keyed.append((key, name))

        contained = set()
        for key, name in keyed:
            for _, other in keyed:
                if name != other and name in other:
                    contained.add(key)
                    break
        return [key for key in b_name_lst if key not in contained]

    def _ext_name_sorted(self, ext_bname):
        """Split an extended brand name on ``'/'`` and return the aliases,
        longest first (aliases of equal length keep their original order)."""
        return sorted(ext_bname.strip().split('/'), key=len, reverse=True)

    def shoujipeijian_rule_opt(self, p_name, opt_set, cat3_ch_name):
        """Apply the phone-accessory rules to ``(brand_name, brand_id)`` pairs.

        Pairs are kept when their id is returned by
        ``phone_brand_not_appear_same_fun`` and
        ``phone_brand_word_rule_func`` is truthy for them.

        :return: set of surviving pairs.
        """
        bid_lst = list({bid for _, bid in opt_set})
        kept_ids = self.brand_rule_obj.phone_brand_not_appear_same_fun(
            bid_lst, self.cat1_en_name)

        result = set()
        for pair in opt_set:
            _, bid = pair
            if bid not in kept_ids:
                continue
            if self.brand_rule_obj.phone_brand_word_rule_func(
                    bid, cat3_ch_name, p_name):
                result.add(pair)
        return result

    def rule_opt(self, p_name, pair_tuple_lst, cat3_ch_name):
        """Filter recognised ``(brand_name, brand_id)`` pairs with the rules.

        Steps: co-appearance removal, the ``laoban`` brand rule, main-brand /
        sub-brand resolution and, for the ``shoujipeijian`` category, the
        phone-accessory rules.

        :param p_name: product name the pairs were recognised in.
        :param pair_tuple_lst: list of ``(brand_name, brand_id)``.
        :param cat3_ch_name: level-3 category name of the product.
        :return: list of surviving pairs, duplicates removed.
        """
        rule_obj = self.brand_rule_obj
        stp1_pair_tuple_lst = rule_obj.co_appear_del_brand_func(pair_tuple_lst)
        if len(stp1_pair_tuple_lst) == 0:
            return []

        opt1_set = set()
        for itm in stp1_pair_tuple_lst:
            r_bname, _ = itm
            if rule_obj.laoban_brand_rule_func(p_name, r_bname):
                continue
            opt1_set.add(itm)
        if len(opt1_set) == 0:
            return []

        # For every ordered pair of distinct ids the rule object names the id
        # to keep (if any); the other one is dropped.
        dropped_ids = set()
        for _, id1 in opt1_set:
            for _, id2 in opt1_set:
                if id1 == id2:
                    continue
                kept = rule_obj.\
                    mainBrand_appear_simultaneously_with_subBrand_func(
                        id1, id2)
                if kept == id1:
                    dropped_ids.add(id2)
                elif kept == id2:
                    dropped_ids.add(id1)
        opt2_set = {pair for pair in opt1_set if pair[1] not in dropped_ids}

        # Phone-accessory rules.
        if self.cat1_en_name == "shoujipeijian":
            r_set = self.shoujipeijian_rule_opt(p_name, opt2_set, cat3_ch_name)
        else:
            r_set = opt2_set
        return list(r_set)

    def brand_reg(self, s_name, brand_lst, brand_id):
        """Find which aliases of one brand occur in ``s_name``.

        An alias for which ``tool.is_all_eng`` is truthy, searched in a name
        for which ``tool.is_own_eng`` is truthy, must pass
        :meth:`english_brand_recognition`; any other alias only needs to be a
        substring.  Blank aliases are ignored, since an empty string is a
        substring of every name.

        :return: list of ``(matched_alias, brand_id)``.
        """
        r_lst = []
        for ext_bname in brand_lst:
            if ext_bname.strip() == "" or ext_bname not in s_name:
                continue
            if tool.is_all_eng(ext_bname) and tool.is_own_eng(s_name):
                reg_bname = self.english_brand_recognition(ext_bname, s_name)
                if reg_bname is None:
                    continue
            else:
                reg_bname = ext_bname
            r_lst.append((reg_bname, brand_id))
        return r_lst

    def _multi_brand_opt(self, reg_bname_lst):
        """Pick the pair with the longest brand name.

        When several distinct pairs share the longest length, the name is
        ``"name|id"`` of each of them (sorted) joined with ``'#'`` and the id
        is ``""``.

        :param reg_bname_lst: non-empty list of ``(brand_name, brand_id)``.
        :return: ``(brand_name, brand_id)``.
        :raises KeyError: if ``reg_bname_lst`` is empty.
        """
        length_reg_brand_dict = {}
        max_length = -1
        for pair in reg_bname_lst:
            length = len(pair[0])
            max_length = max(max_length, length)
            length_reg_brand_dict.setdefault(length, []).append(pair)

        # sorted() makes the joined name independent of set ordering.
        final_bname_lst = sorted(set(length_reg_brand_dict[max_length]))
        if len(final_bname_lst) == 1:
            reg_bname, reg_bid = final_bname_lst[0]
        else:
            reg_bid = ""
            reg_bname = "#".join(
                "%s|%s" % (name, bid) for name, bid in final_bname_lst)
        return reg_bname, reg_bid

    def _reg_in_category(self, s_name, bid_lst, ori_dict, ext_dict):
        """Run :meth:`brand_reg` for every brand id of one category.

        Ids missing from either name table are skipped.

        :return: list of ``(matched_alias, brand_id)``.
        """
        hits = []
        for bid in bid_lst:
            if bid not in ori_dict or bid not in ext_dict:
                continue
            hits += self.brand_reg(
                s_name, self._ext_name_sorted(ext_dict[bid]), bid)
        return hits

    def brand_reg_main(self, ori_product_name, cat1_name, cat2_name,
                       cat3_name):
        """Recognise the brand of a product.

        :param ori_product_name: product name.
        :param cat1_name: level-1 category name (used by the level-1
            fallback).
        :param cat2_name: level-2 category name (used by the level-2
            fallback).
        :param cat3_name: level-3 category name.
        :return: ``(brand_id, matched_name, original_name)``;
            ``(None, None, None)`` when no brand is recognised.  When several
            brands tie, ``brand_id`` is ``""`` and ``matched_name`` lists
            them (see :meth:`_multi_brand_opt`).
        :raises Exception: if the product name or ``cat3_name`` is blank, or
            ``cat3_name`` is unknown.
        :raises KeyError: if a fallback level is reached and its category
            name is not in the corresponding table.
        """
        ori_product_name = ori_product_name.strip()
        cat3_name = cat3_name.strip()
        if ori_product_name == "":
            raise Exception("ori_product_name is empty!")
        if cat3_name == '':
            raise Exception("cat3_name is empty!")
        if cat3_name not in self.cat3_to_brandId_dict:
            raise Exception(
                "%s is not in self.cat3_to_brandId_dict" % cat3_name)

        s_name = tool.s_name_dealing(ori_product_name)
        # Remove the "dirty words" from the product name.
        s_name = self.brand_rule_obj.product_name_del_word_func(s_name)
        if self.brand_rule_obj.no_brand_word_func(s_name):
            return None, None, None

        # Level-3 category recognition.
        reg_bname_lst = self._reg_in_category(
            s_name, self.cat3_to_brandId_dict[cat3_name],
            self.cat3_ori_brandId_name_dict, self.cat3_ext_brandId_name_dict)

        # Level-2 category recognition.
        if len(reg_bname_lst) == 0 and self.is_cat2_brand_reg:
            reg_bname_lst = self._reg_in_category(
                s_name, self.cat2_to_brandId_dict[cat2_name],
                self.cat2_ori_brandId_name_dict,
                self.cat2_ext_brandId_name_dict)

        # Level-1 category recognition.
        if len(reg_bname_lst) == 0 and self.is_cat1_brand_reg:
            reg_bname_lst = self._reg_in_category(
                s_name, self.cat1_to_brandId_dict[cat1_name],
                self.cat1_ori_brandId_name_dict,
                self.cat1_ext_brandId_name_dict)

        if len(reg_bname_lst) == 0:
            return None, None, None

        rule_opt_lst = self.rule_opt(s_name, reg_bname_lst, cat3_name)
        brand_id_set = {pair[1] for pair in rule_opt_lst}
        if len(brand_id_set) == 0:
            return None, None, None

        if len(brand_id_set) == 1:
            # Several aliases of the same brand may have matched; take the
            # longest (ties broken alphabetically) so the answer does not
            # depend on set ordering.
            reg_bname, reg_bid = max(
                rule_opt_lst, key=lambda pair: (len(pair[0]), pair[0]))
        else:
            reg_bname, reg_bid = self._multi_brand_opt(rule_opt_lst)

        return reg_bid, reg_bname, self._ori_brand_name(reg_bid)