def brand_reg(self, s_name, brand_lst, brand_id):
    '''Collect the brand names from ``brand_lst`` that occur in ``s_name``.

    Args:
        s_name: Product-name string that is searched for brand names.
        brand_lst: Iterable of candidate brand-name strings.
        brand_id: Identifier paired with every recognised name in the result.

    Returns:
        A list of ``(recognised_name, brand_id)`` tuples, in the order the
        candidates appear in ``brand_lst``. Empty when nothing matches.
    '''
    r_lst = []
    for ext_bname in brand_lst:
        # Both matching paths require the candidate to be a substring of the
        # product name, so reject non-substrings before any helper call.
        if ext_bname not in s_name:
            continue

        if tool.is_all_eng(ext_bname) and tool.is_own_eng(s_name):
            # When both tool checks pass, the plain substring hit must be
            # confirmed by english_brand_recognition; None means "rejected".
            reg_bname = self.english_brand_recognition(ext_bname, s_name)
            if reg_bname is None:
                continue
        else:
            reg_bname = ext_bname

        r_lst.append((reg_bname, brand_id))
    return r_lst
def brand_recognition(self, line_str):
    '''Recognise the standard brand for one 4-field product record.

    (Original note: brand_recognition is updated by gcw in 2020.09.12.)

    NOTE(review): a later definition in this file is also named
    ``brand_recognition`` (it expects 5 fields). If both live in the same
    class, that later definition replaces this one - confirm which is meant
    to be used.

    Args:
        line_str: One record of the form
            ``product_id \\001 product_name \\001 cat2_id \\001 cat2_name``.

    Returns:
        A 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
        brand_cat1_name, cat1_id, cat1_name)``. All seven items are ``None``
        when the record is blank, does not have exactly 4 fields, is flagged
        by ``no_brand_word_func``, or is a fixed-point product without a brand
        id. When no brand survives matching/rule filtering, only
        ``match_type`` is set (to the "no standard brand matched" message).

    Raises:
        Any exception raised by the helpers propagates unchanged.
    '''
    no_result = (None, None, None, None, None, None, None)
    no_match = (None, None, "没有匹配到标准品牌", None, None, None, None)

    line = line_str.strip()
    if line == "":
        return no_result
    raw_fields = line.split("\001")
    if len(raw_fields) != 4:
        return no_result
    product_id, ori_product_name, cat2_id, cat2_name = [
        tmp.strip() for tmp in raw_fields]

    # Important (original marker "**import"): the level-2 category fields of
    # the record are used as the level-1 category from here on.
    cat1_id, cat1_name = cat2_id, cat2_name

    # Normalised product name (original comment: lower-case letters).
    s_name = tool.s_name_dealing(ori_product_name)
    if self.brand_rule_obj.no_brand_word_func(s_name):
        return no_result
    # Delete words from the product name (original comment: "delete").
    s_name = self.brand_rule_obj.product_name_del_word_func(s_name)

    # Fixed-point products bypass matching entirely.
    fixed_flag, r_bid = self.brand_rule_obj.fixed_point_func(product_id)
    if fixed_flag:
        if r_bid is None:
            return no_result
        brand_cat1_id, brand_cat1_name = self.getting_cat1_info(r_bid)
        return (r_bid, self.name_ori_brand_dict[r_bid], "",
                brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)

    # 1. cleaned brands found in the product name
    # 2. the brand id each cleaned brand maps to
    clean_brand_2_idx = {}
    clean_brand_lst = []
    for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
        # Lower-case the brand key; keys look like "<brand>|<flag>".
        b_str = ori_b_str.lower()
        parts = b_str.split('|')
        if len(parts) != 2:
            continue
        b, is_eng = parts
        if b not in s_name:
            continue
        if is_eng == "0" and tool.is_own_eng(s_name):
            # Substring hit must be confirmed by english_brand_recognition.
            if self.english_brand_recognition(b, s_name) is None:
                continue
        # NOTE(review): the brand is appended once per id and only the last
        # id is kept in the map - preserved from the original; confirm intent.
        for tmp_bid in same_brand_id_lst:
            clean_brand_lst.append(b_str)
            clean_brand_2_idx[b_str] = tmp_bid

    # Remove containment relations among the recognised brands.
    clean_brand_lst = self.brand_inclusion_relation_dealing(clean_brand_lst)

    # Group the surviving brands by brand id. The original tested membership
    # in clean_brand_2_idx (the brand -> id map), so brands sharing an id
    # overwrote each other; group on clean_idx_2_brand instead.
    clean_idx_2_brand = {}
    for brand in clean_brand_lst:
        same_id_brands = clean_idx_2_brand.setdefault(
            clean_brand_2_idx[brand], [])
        if brand not in same_id_brands:
            same_id_brands.append(brand)

    if len(clean_brand_lst) == 0:
        return no_match

    # Filtering by hand-written rules.
    clean_brand_lst, clean_brand_id_lst = self.rule_opt(
        clean_brand_lst, clean_idx_2_brand,
        ori_product_name, cat1_id, cat1_name)
    if len(clean_brand_lst) == 0:
        return no_match

    if len(clean_brand_id_lst) == 1:
        pre_brand_id = clean_brand_id_lst[0]
        pre_brand = self.name_ori_brand_dict[pre_brand_id]
        match_type = "匹配到唯一标准品牌"
    else:
        pre_brand_id, pre_brand, match_type = None, None, None
        # Group brands by string length and try the longest group first.
        len_brand_dict = {}
        for tmp in clean_brand_lst:
            len_brand_dict.setdefault(len(tmp), []).append(tmp)
        r_lst = sorted(len_brand_dict.items(),
                       key=lambda m: m[0], reverse=True)
        for _, tmp_clean_brand_lst in r_lst:
            pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                cat1_id, tmp_clean_brand_lst, clean_brand_id_lst)
            if pre_brand is None or pre_brand_id is None:
                # The strategy above (pick the high-GMV brand within the
                # level-1 category) failed: no brand of the same level-1
                # category matched, so fall back to the same-length strategy.
                pre_brand_id, pre_brand, match_type = \
                    self.same_length_strategy(
                        tmp_clean_brand_lst, clean_brand_id_lst)
            if pre_brand is not None and match_type is not None:
                break

    if pre_brand_id is not None:
        brand_cat1_id, brand_cat1_name = self.getting_cat1_info(pre_brand_id)
    else:
        brand_cat1_id, brand_cat1_name = None, None
    return (pre_brand_id, pre_brand, match_type,
            brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)
def brand_same_cat1_recognition(self, line_str):
    '''Recognise the standard brand for one 5-field product record.

    The brand word of the record is appended to the product name before
    matching against ``self.brand_idx_dict``.

    Args:
        line_str: One record of the form ``product_id \\001 product_name
            \\001 brand_word \\001 cat1_id \\001 cat1_name``.

    Returns:
        A 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
        brand_cat1_name, cat1_id, cat1_name)``. All seven items are ``None``
        when the record is blank or does not have exactly 5 fields. When no
        brand survives matching/rule filtering, only ``match_type`` is set
        (to the "no standard brand matched" message).

    Raises:
        Any exception raised by the helpers propagates unchanged.
    '''
    no_result = (None, None, None, None, None, None, None)
    no_match = (None, None, "没有匹配到标准品牌", None, None, None, None)

    line = line_str.strip()
    if line == "":
        return no_result
    raw_fields = line.split("\001")
    if len(raw_fields) != 5:
        return no_result
    product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = [
        tmp.strip() for tmp in raw_fields]

    # Normalised "name brand_word" text (original comment: lower-case letters).
    s_name = tool.s_name_dealing(
        "%s %s" % (ori_product_name, ori_brand_word))
    s_name = self.product_name_del_word(s_name)

    # 1. brand keys found in the text
    # 2. every brand id those keys map to
    clean_brand_lst = []
    clean_brand_id_lst = []
    for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
        # Lower-case the brand key; keys look like "<brand>|<flag>".
        parts = ori_b_str.lower().split('|')
        if len(parts) != 2:
            continue
        b, is_eng = parts
        if b not in s_name:
            continue
        if is_eng == "0" and tool.is_own_eng(s_name):
            # Substring hit must be confirmed by english_brand_recognition.
            if self.english_brand_recognition(b, s_name) is None:
                continue
        clean_brand_lst.append(ori_b_str)
        clean_brand_id_lst += same_brand_id_lst

    # De-duplicate both lists.
    clean_brand_lst = list(set(clean_brand_lst))
    clean_brand_id_lst = list(set(clean_brand_id_lst))
    if len(clean_brand_lst) == 0:
        return no_match

    # Filtering by hand-written rules.
    # NOTE(review): other methods in this file call rule_opt with five
    # arguments (including an id -> brand mapping); this call passes four.
    # Confirm against rule_opt's signature.
    clean_brand_lst, clean_brand_id_lst = self.rule_opt(
        clean_brand_lst, ori_product_name, cat1_id, cat1_name)
    if len(clean_brand_lst) == 0:
        return no_match

    if len(clean_brand_id_lst) == 1:
        b_id = clean_brand_id_lst[0]
        brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
        return (b_id, self.name_ori_brand_dict[b_id], "匹配到唯一标准品牌",
                brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)

    pre_brand_id, pre_brand, match_type = None, None, None
    if len(clean_brand_lst) == 1:
        pre_brand_id, pre_brand, match_type = self.same_cat1_gmv_strategy(
            cat1_id, clean_brand_lst, clean_brand_id_lst)
    else:
        # Group brands by string length and try the longest group first.
        len_brand_dict = {}
        for tmp in clean_brand_lst:
            len_brand_dict.setdefault(len(tmp), []).append(tmp)
        r_lst = sorted(len_brand_dict.items(),
                       key=lambda m: m[0], reverse=True)
        for _, tmp_clean_brand_lst in r_lst:
            pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                cat1_id, tmp_clean_brand_lst, clean_brand_id_lst)
            if pre_brand is not None and match_type is not None:
                break
        # Fallback handling: no length group produced a match.
        if pre_brand is None and match_type is None:
            pre_brand_id, pre_brand, match_type = \
                self.same_cat1_gmv_strategy(
                    cat1_id, clean_brand_lst, clean_brand_id_lst)

    if pre_brand_id is not None:
        brand_cat1_id, brand_cat1_name = self.getting_cat1_info(pre_brand_id)
    else:
        brand_cat1_id, brand_cat1_name = None, None
    return (pre_brand_id, pre_brand, match_type,
            brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)
def brand_reg_opt(self, reg_brand_dict, line_str):
    '''Recognise the standard brand for one record using ``reg_brand_dict``.

    Works like the other recognition methods in this file but matches the
    product text against the caller-supplied ``reg_brand_dict`` instead of
    ``self.brand_idx_dict`` (which is only consulted to validate the
    English phone-brand hits).

    Args:
        reg_brand_dict: Mapping of brand key (``"<brand>|<flag>"``) to an
            iterable of brand ids.
        line_str: One record of the form ``product_id \\001 product_name
            \\001 brand_word \\001 cat1_id \\001 cat1_name``.

    Returns:
        A 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
        brand_cat1_name, cat1_id, cat1_name)``. All seven items are ``None``
        when the record is blank or does not have exactly 5 fields. When no
        brand survives matching/rule filtering, only ``match_type`` is set
        (to the "no standard brand matched" message).

    Raises:
        Any exception raised by the helpers propagates unchanged.
    '''
    no_result = (None, None, None, None, None, None, None)
    no_match = (None, None, "没有匹配到标准品牌", None, None, None, None)

    line = line_str.strip()
    if line == "":
        return no_result
    raw_fields = line.split("\001")
    if len(raw_fields) != 5:
        return no_result
    product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = [
        tmp.strip() for tmp in raw_fields]

    # Normalised "name brand_word" text (original comment: lower-case letters).
    s_name = tool.s_name_dealing(
        "%s %s" % (ori_product_name, ori_brand_word))
    s_name = self.product_name_del_word(s_name)

    # brand id -> brand key
    clean_idx_2_brand = {}

    # English phone-brand cleaning, e.g. vivo, oppo.
    englisg_phone_brand_lst = self.brand_rule_obj.english_phone_rule_func(
        s_name, cat1_id)
    for epb_id, epb_name in englisg_phone_brand_lst:
        epb_name = epb_name + '|0'
        if epb_name not in self.brand_idx_dict:
            continue
        clean_idx_2_brand[epb_id] = epb_name

    # 1. brand keys found in the text
    # 2. every brand id those keys map to
    for ori_b_str, same_brand_id_lst in reg_brand_dict.items():
        # Lower-case the brand key; keys look like "<brand>|<flag>".
        parts = ori_b_str.lower().split('|')
        if len(parts) != 2:
            continue
        b, is_eng = parts
        if b not in s_name:
            continue
        if is_eng == "0" and tool.is_own_eng(s_name):
            # Substring hit must be confirmed by english_brand_recognition.
            if self.english_brand_recognition(b, s_name) is None:
                continue
        for tmp_bid in same_brand_id_lst:
            clean_idx_2_brand[tmp_bid] = ori_b_str

    clean_brand_lst = list(set(clean_idx_2_brand.values()))
    clean_brand_id_lst = list(set(clean_idx_2_brand.keys()))
    if len(clean_brand_lst) == 0:
        return no_match

    # Filtering by hand-written rules.
    # NOTE(review): the first argument here is the id list, whereas the
    # brand_recognition methods in this file pass the brand list; and the
    # mapping values here are single strings, not lists. Confirm against
    # rule_opt's expectations.
    clean_brand_lst, clean_brand_id_lst = self.rule_opt(
        clean_brand_id_lst, clean_idx_2_brand,
        ori_product_name, cat1_id, cat1_name)
    if len(clean_brand_lst) == 0:
        return no_match

    if len(clean_brand_id_lst) == 1:
        b_id = clean_brand_id_lst[0]
        brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
        return (b_id, self.name_ori_brand_dict[b_id], "匹配到唯一标准品牌",
                brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)

    pre_brand_id, pre_brand, match_type = None, None, None
    if len(clean_brand_lst) == 1:
        pre_brand_id, pre_brand, match_type = self.same_cat1_gmv_strategy(
            cat1_id, clean_brand_lst, clean_brand_id_lst)
    else:
        # Group brands by string length and try the longest group first.
        len_brand_dict = {}
        for tmp in clean_brand_lst:
            len_brand_dict.setdefault(len(tmp), []).append(tmp)
        r_lst = sorted(len_brand_dict.items(),
                       key=lambda m: m[0], reverse=True)
        for _, tmp_clean_brand_lst in r_lst:
            pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                cat1_id, tmp_clean_brand_lst, clean_brand_id_lst)
            if pre_brand is None or pre_brand_id is None:
                # Same-category strategy found nothing for this group;
                # fall back to the same-length strategy.
                pre_brand_id, pre_brand, match_type = \
                    self.same_length_strategy(
                        tmp_clean_brand_lst, clean_brand_id_lst)
            if pre_brand is not None and match_type is not None:
                break

    if pre_brand_id is not None:
        brand_cat1_id, brand_cat1_name = self.getting_cat1_info(pre_brand_id)
    else:
        brand_cat1_id, brand_cat1_name = None, None
    return (pre_brand_id, pre_brand, match_type,
            brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)
def brand_recognition(self, line_str):
    '''Recognise the standard brand for one 5-field product record.

    (Original note: brand_recognition is updated by gcw in 2020.09.12.)

    NOTE(review): an earlier definition in this file is also named
    ``brand_recognition`` (it expects 4 fields). If both live in the same
    class, this later definition is the one that takes effect - confirm.

    Args:
        line_str: One record of the form ``product_id \\001 product_name
            \\001 <ignored> \\001 cat1_id \\001 cat1_name``. The third field
            is not used.

    Returns:
        A 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
        brand_cat1_name, cat1_id, cat1_name)``. All seven items are ``None``
        when the record is blank or does not have exactly 5 fields. When no
        brand survives matching/rule filtering, only ``match_type`` is set
        (to the "no standard brand matched" message).

    Raises:
        Any exception raised by the helpers propagates unchanged.
    '''
    no_result = (None, None, None, None, None, None, None)
    no_match = (None, None, "没有匹配到标准品牌", None, None, None, None)

    line = line_str.strip()
    if line == "":
        return no_result
    raw_fields = line.split("\001")
    if len(raw_fields) != 5:
        return no_result
    product_id, ori_product_name, _, cat1_id, cat1_name = [
        tmp.strip() for tmp in raw_fields]

    # Normalised product name (original comment: lower-case letters).
    s_name = tool.s_name_dealing(ori_product_name)
    s_name = self.product_name_del_word(s_name)

    clean_brand_2_idx = {}
    clean_brand_lst = []

    # English phone-brand cleaning, e.g. vivo, oppo.
    englisg_phone_brand_lst = self.brand_rule_obj.english_phone_rule_func(
        s_name, cat1_id)
    for epb_id, epb_name in englisg_phone_brand_lst:
        epb_name = epb_name + '|0'
        if epb_name not in self.brand_idx_dict:
            continue
        clean_brand_2_idx[epb_name] = epb_id
        clean_brand_lst.append(epb_name)

    # 1. cleaned brands found in the product name
    # 2. the brand id each cleaned brand maps to
    for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
        # Lower-case the brand key; keys look like "<brand>|<flag>".
        parts = ori_b_str.lower().split('|')
        if len(parts) != 2:
            continue
        b, is_eng = parts
        if b not in s_name:
            continue
        if is_eng == "0" and tool.is_own_eng(s_name):
            # Substring hit must be confirmed by english_brand_recognition.
            if self.english_brand_recognition(b, s_name) is None:
                continue
        # NOTE(review): the brand is appended once per id and only the last
        # id is kept in the map - preserved from the original; confirm intent.
        for tmp_bid in same_brand_id_lst:
            clean_brand_2_idx[ori_b_str] = tmp_bid
            clean_brand_lst.append(ori_b_str)

    # Remove containment relations among the recognised brands.
    clean_brand_lst = self.brand_inclusion_relation_dealing(clean_brand_lst)

    # Group the surviving brands by brand id. The original tested membership
    # in clean_brand_2_idx (the brand -> id map), so brands sharing an id
    # overwrote each other; group on clean_idx_2_brand instead.
    clean_idx_2_brand = {}
    for brand in clean_brand_lst:
        same_id_brands = clean_idx_2_brand.setdefault(
            clean_brand_2_idx[brand], [])
        if brand not in same_id_brands:
            same_id_brands.append(brand)

    if len(clean_brand_lst) == 0:
        return no_match

    # Filtering by hand-written rules.
    clean_brand_lst, clean_brand_id_lst = self.rule_opt(
        clean_brand_lst, clean_idx_2_brand,
        ori_product_name, cat1_id, cat1_name)
    if len(clean_brand_lst) == 0:
        return no_match

    if len(clean_brand_id_lst) == 1:
        b_id = clean_brand_id_lst[0]
        brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
        return (b_id, self.name_ori_brand_dict[b_id], "匹配到唯一标准品牌",
                brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)

    pre_brand_id, pre_brand, match_type = None, None, None
    if len(clean_brand_lst) == 1:
        pre_brand_id, pre_brand, match_type = self.same_cat1_gmv_strategy(
            cat1_id, clean_brand_lst, clean_brand_id_lst)
    else:
        # Group brands by string length and try the longest group first.
        len_brand_dict = {}
        for tmp in clean_brand_lst:
            len_brand_dict.setdefault(len(tmp), []).append(tmp)
        r_lst = sorted(len_brand_dict.items(),
                       key=lambda m: m[0], reverse=True)
        for _, tmp_clean_brand_lst in r_lst:
            pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                cat1_id, tmp_clean_brand_lst, clean_brand_id_lst)
            if pre_brand is None or pre_brand_id is None:
                # Same-category strategy found nothing for this group;
                # fall back to the same-length strategy.
                pre_brand_id, pre_brand, match_type = \
                    self.same_length_strategy(
                        tmp_clean_brand_lst, clean_brand_id_lst)
            if pre_brand is not None and match_type is not None:
                break

    if pre_brand_id is not None:
        brand_cat1_id, brand_cat1_name = self.getting_cat1_info(pre_brand_id)
    else:
        brand_cat1_id, brand_cat1_name = None, None
    return (pre_brand_id, pre_brand, match_type,
            brand_cat1_id, brand_cat1_name, cat1_id, cat1_name)