Пример #1
0
    def brand_reg(self, s_name, brand_lst, brand_id):
        '''
        Collect the brand names from ``brand_lst`` that occur in ``s_name``.

        A candidate is kept only when it is a substring of ``s_name``.  When
        ``tool.is_all_eng`` accepts the candidate and ``tool.is_own_eng``
        accepts ``s_name``, the match must additionally be confirmed by
        ``self.english_brand_recognition``: the value it returns is recorded
        in place of the candidate, and a ``None`` result rejects the
        candidate.

        :param s_name: product name to search in.
        :param brand_lst: iterable of candidate brand names.
        :param brand_id: brand id paired with every recognised name.
        :return: list of ``(recognised_brand_name, brand_id)`` tuples, in the
            order of ``brand_lst``.
        '''
        r_lst = []

        for ext_bname in brand_lst:
            # Every accepting branch requires the substring match, so test it
            # first; brands that are absent then never reach the helper calls.
            if ext_bname not in s_name:
                continue

            if tool.is_all_eng(ext_bname) and tool.is_own_eng(s_name):
                # The plain substring hit has to be confirmed by the
                # dedicated English matcher before it counts.
                reg_bname = self.english_brand_recognition(ext_bname, s_name)
                if reg_bname is None:
                    continue
            else:
                reg_bname = ext_bname

            r_lst.append((reg_bname, brand_id))
        return r_lst
Пример #2
0
    def brand_recognition(self, line_str):
        '''
        Recognise the standard brand of a single product record.

        The record is one text line made of four fields separated by the
        0x01 control character: product id, product name, second-level
        category id and second-level category name.  The second-level
        category values are used wherever a first-level category is needed.

        Steps: parse the line, normalise the product name, honour a pinned
        brand from ``fixed_point_func`` if there is one, collect every known
        brand that occurs in the name, drop brands contained in other
        brands, apply the manual rules (``rule_opt``) and finally pick one
        brand, trying the longest brand names first.

        (Original note: brand_recognition is updated by gcw in 2020.09.12.)

        :param line_str: raw record line.
        :return: 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``.  All seven elements are
            ``None`` when the line is blank or malformed, when
            ``no_brand_word_func`` rejects the name, or when the product is
            pinned without a brand id.  When no brand survives matching or
            rule filtering, only ``match_type`` is set.
        '''
        empty_result = (None, None, None, None, None, None, None)
        no_match_result = (None, None, "没有匹配到标准品牌", None, None, None, None)

        line = line_str.strip()
        if line == "":
            return empty_result
        fields = [tmp.strip() for tmp in line.split("\001")]
        if len(fields) != 4:
            return empty_result
        product_id, ori_product_name, cat2_id, cat2_name = fields
        # Important: the second-level category stands in for the first-level
        # category in everything below.
        cat1_id, cat1_name = cat2_id, cat2_name

        # Per the original comment s_name is lower-cased here -- presumably
        # done inside tool.s_name_dealing; confirm against that helper.
        s_name = tool.s_name_dealing(ori_product_name)
        if self.brand_rule_obj.no_brand_word_func(s_name):
            return empty_result
        # Strip the words the rule object wants removed from the name.
        s_name = self.brand_rule_obj.product_name_del_word_func(s_name)

        # A product can be pinned to a brand (or to "no brand") by id.
        fixed_flag, r_bid = self.brand_rule_obj.fixed_point_func(product_id)
        if fixed_flag:
            if r_bid is None:
                return empty_result
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(r_bid)
            return r_bid, self.name_ori_brand_dict[r_bid], "", \
                   brand_cat1_id, brand_cat1_name, cat1_id, cat1_name

        # clean_brand_lst: matched (lower-cased) brand keys.
        # clean_brand_2_idx: matched brand key -> brand id.
        clean_brand_2_idx = {}
        clean_brand_lst = []
        for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
            # Keys look like "<brand>|<flag>"; compare in lower case.
            b_str = ori_b_str.lower()
            parts = b_str.split('|')
            if len(parts) != 2:
                continue
            b, is_eng = parts
            # Every accepting branch needs the substring hit, so test it first.
            if b not in s_name:
                continue
            if is_eng == "0" and tool.is_own_eng(s_name):
                # Flag "0" brands inside names accepted by is_own_eng must
                # also be confirmed by the English matcher.
                if self.english_brand_recognition(b, s_name) is None:
                    continue
            for tmp_bid in same_brand_id_lst:
                clean_brand_lst.append(b_str)
                # NOTE(review): only the last id of same_brand_id_lst is
                # remembered for this brand key -- confirm this is intended.
                clean_brand_2_idx[b_str] = tmp_bid

        # Remove recognised brands that are contained in other ones.
        clean_brand_lst = self.brand_inclusion_relation_dealing(
            clean_brand_lst)

        # Group the surviving brand keys by brand id.  The original tested
        # membership in clean_brand_2_idx (keyed by brand) instead of the
        # dict being built, so brands sharing an id overwrote each other.
        # Repeated names are stored once per id.
        clean_idx_2_brand = {}
        for brand in clean_brand_lst:
            same_id_brands = clean_idx_2_brand.setdefault(
                clean_brand_2_idx[brand], [])
            if brand not in same_id_brands:
                same_id_brands.append(brand)

        if len(clean_brand_lst) == 0:
            return no_match_result
        # Filtering by the manually maintained rules.
        clean_brand_lst, clean_brand_id_lst = self.rule_opt(
            clean_brand_lst, clean_idx_2_brand,
            ori_product_name, cat1_id, cat1_name)

        if len(clean_brand_lst) == 0:
            return no_match_result

        pre_brand_id, pre_brand, match_type = None, None, None
        if len(clean_brand_id_lst) == 1:
            pre_brand_id = clean_brand_id_lst[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            match_type = "匹配到唯一标准品牌"
        else:
            # Bucket the candidates by name length and try the longest first.
            len_brand_dict = {}
            for tmp in clean_brand_lst:
                len_brand_dict.setdefault(len(tmp), []).append(tmp)

            for tmp_l in sorted(len_brand_dict, reverse=True):
                tmp_clean_brand_lst = len_brand_dict[tmp_l]
                pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                    cat1_id, tmp_clean_brand_lst, clean_brand_id_lst)
                if pre_brand is None or pre_brand_id is None:
                    # The same-first-level-category (high GMV) strategy found
                    # nothing, so fall back to the same-length strategy.
                    pre_brand_id, pre_brand, match_type = self.same_length_strategy(
                        tmp_clean_brand_lst, clean_brand_id_lst)

                if pre_brand is not None and match_type is not None:
                    break

        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None

        return pre_brand_id, pre_brand, match_type, \
               brand_cat1_id, brand_cat1_name, cat1_id, cat1_name
Пример #3
0
    def brand_same_cat1_recognition(self, line_str):
        '''
        Recognise the standard brand of one product record, using the
        product name together with its original brand word.

        The record is one text line of five fields separated by the 0x01
        control character: product id, product name, original brand word,
        first-level category id and first-level category name.

        (Original note: updated by gcw in 2020.09.12.)

        :param line_str: raw record line.
        :return: 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``.  All elements are
            ``None`` for a blank or malformed line; when no brand is matched
            or none survives ``rule_opt``, only ``match_type`` is set.
        '''
        try:
            nothing = (None, None, None, None, None, None, None)

            record = line_str.strip()
            if record == "":
                return nothing
            raw_fields = record.split("\001")
            if len(raw_fields) != 5:
                return nothing
            product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = [
                field.strip() for field in raw_fields]

            # Search text = product name + original brand word.  Per the
            # original comment it is lower-cased -- presumably inside
            # tool.s_name_dealing; confirm against that helper.
            s_name = tool.s_name_dealing(
                " ".join([ori_product_name, ori_brand_word]))
            s_name = self.product_name_del_word(s_name)

            matched_brands = []   # brand keys found in the search text
            matched_ids = []      # every brand id behind those keys
            for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
                # Keys look like "<brand>|<flag>"; compare in lower case.
                pieces = ori_b_str.lower().split('|')
                if len(pieces) != 2:
                    continue
                b, is_eng = pieces

                if is_eng == "0" and tool.is_own_eng(s_name) and b in s_name:
                    # The substring hit must be confirmed by the English
                    # matcher before it counts.
                    if self.english_brand_recognition(b, s_name) is None:
                        continue
                elif b not in s_name:
                    continue
                matched_brands.append(ori_b_str)
                matched_ids.extend(same_brand_id_lst)

            clean_brand_lst = list(set(matched_brands))
            clean_brand_id_lst = list(set(matched_ids))
            if len(clean_brand_lst) == 0:
                return None, None, "没有匹配到标准品牌", None, None, None, None

            # Filtering by the manually maintained rules.
            clean_brand_lst, clean_brand_id_lst = self.rule_opt(
                clean_brand_lst, ori_product_name, cat1_id, cat1_name)
            if len(clean_brand_lst) == 0:
                return None, None, "没有匹配到标准品牌", None, None, None, None

            if len(clean_brand_id_lst) == 1:
                # Exactly one brand id left: nothing to disambiguate.
                b_id = clean_brand_id_lst[0]
                brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
                only_brand = self.name_ori_brand_dict[b_id]
                return b_id, only_brand, "匹配到唯一标准品牌", \
                       brand_cat1_id, brand_cat1_name, cat1_id, cat1_name

            pre_brand_id, pre_brand, match_type = None, None, None
            if len(clean_brand_lst) == 1:
                pre_brand_id, pre_brand, match_type = self.same_cat1_gmv_strategy(
                    cat1_id, clean_brand_lst, clean_brand_id_lst)
            else:
                # Bucket candidates by name length; try the longest first.
                brands_by_len = {}
                for name in clean_brand_lst:
                    brands_by_len.setdefault(len(name), []).append(name)

                for name_len in sorted(brands_by_len, reverse=True):
                    pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                        cat1_id, brands_by_len[name_len], clean_brand_id_lst)
                    if pre_brand is not None and match_type is not None:
                        break

                # Last-resort fallback when no length bucket produced a brand.
                if pre_brand is None and match_type is None:
                    pre_brand_id, pre_brand, match_type = self.same_cat1_gmv_strategy(
                        cat1_id, clean_brand_lst, clean_brand_id_lst)

            brand_cat1_id, brand_cat1_name = None, None
            if pre_brand_id is not None:
                brand_cat1_id, brand_cat1_name = self.getting_cat1_info(pre_brand_id)
            return pre_brand_id, pre_brand, match_type, \
                   brand_cat1_id, brand_cat1_name, cat1_id, cat1_name
        except Exception as e:
            raise e
Пример #4
0
    def brand_reg_opt(self, reg_brand_dict, line_str):
        '''
        Recognise the standard brand of one product record against the
        caller-supplied brand table ``reg_brand_dict``.

        The record is one text line of five fields separated by the 0x01
        control character: product id, product name, original brand word,
        first-level category id and first-level category name.

        :param reg_brand_dict: mapping of ``"<brand>|<flag>"`` keys to the
            list of brand ids sharing that name.
        :param line_str: raw record line.
        :return: 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``.  All elements are
            ``None`` for a blank or malformed line; when no brand is matched
            or none survives ``rule_opt``, only ``match_type`` is set.
        '''
        try:
            nothing = (None, None, None, None, None, None, None)

            record = line_str.strip()
            if record == "":
                return nothing
            raw_fields = record.split("\001")
            if len(raw_fields) != 5:
                return nothing
            product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = [
                field.strip() for field in raw_fields]

            # Search text = product name + original brand word.  Per the
            # original comment it is lower-cased -- presumably inside
            # tool.s_name_dealing; confirm against that helper.
            s_name = tool.s_name_dealing(
                " ".join([ori_product_name, ori_brand_word]))
            s_name = self.product_name_del_word(s_name)

            # brand id -> brand key of every recognised brand.
            clean_idx_2_brand = {}

            # English phone brands (e.g. vivo, oppo) come from a dedicated rule.
            phone_brands = self.brand_rule_obj.english_phone_rule_func(
                s_name, cat1_id)
            for epb_id, epb_name in phone_brands:
                phone_key = epb_name + '|0'
                if phone_key in self.brand_idx_dict:
                    clean_idx_2_brand[epb_id] = phone_key

            for ori_b_str, same_brand_id_lst in reg_brand_dict.items():
                # Keys look like "<brand>|<flag>"; compare in lower case.
                pieces = ori_b_str.lower().split('|')
                if len(pieces) != 2:
                    continue
                b, is_eng = pieces

                if is_eng == "0" and tool.is_own_eng(s_name) and b in s_name:
                    # The substring hit must be confirmed by the English
                    # matcher before it counts.
                    if self.english_brand_recognition(b, s_name) is None:
                        continue
                elif b not in s_name:
                    continue
                for tmp_bid in same_brand_id_lst:
                    clean_idx_2_brand[tmp_bid] = ori_b_str

            clean_brand_lst = list(set(clean_idx_2_brand.values()))
            clean_brand_id_lst = list(set(clean_idx_2_brand.keys()))
            if len(clean_brand_lst) == 0:
                return None, None, "没有匹配到标准品牌", None, None, None, None

            # Filtering by the manually maintained rules.
            # NOTE(review): the id list (not the brand list) is passed as the
            # first argument here -- confirm against rule_opt's signature.
            clean_brand_lst, clean_brand_id_lst = self.rule_opt(
                clean_brand_id_lst, clean_idx_2_brand, ori_product_name,
                cat1_id, cat1_name)
            if len(clean_brand_lst) == 0:
                return None, None, "没有匹配到标准品牌", None, None, None, None

            if len(clean_brand_id_lst) == 1:
                # Exactly one brand id left: nothing to disambiguate.
                b_id = clean_brand_id_lst[0]
                brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
                only_brand = self.name_ori_brand_dict[b_id]
                return b_id, only_brand, "匹配到唯一标准品牌", \
                       brand_cat1_id, brand_cat1_name, cat1_id, cat1_name

            pre_brand_id, pre_brand, match_type = None, None, None
            if len(clean_brand_lst) == 1:
                pre_brand_id, pre_brand, match_type = self.same_cat1_gmv_strategy(
                    cat1_id, clean_brand_lst, clean_brand_id_lst)
            else:
                # Bucket candidates by name length; try the longest first.
                brands_by_len = {}
                for name in clean_brand_lst:
                    brands_by_len.setdefault(len(name), []).append(name)

                for name_len in sorted(brands_by_len, reverse=True):
                    same_len_brands = brands_by_len[name_len]
                    pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                        cat1_id, same_len_brands, clean_brand_id_lst)
                    if pre_brand is None or pre_brand_id is None:
                        # Category-based choice failed: fall back to the
                        # same-length strategy for this bucket.
                        pre_brand_id, pre_brand, match_type = self.same_length_strategy(
                            same_len_brands, clean_brand_id_lst)
                    if pre_brand is not None and match_type is not None:
                        break

            brand_cat1_id, brand_cat1_name = None, None
            if pre_brand_id is not None:
                brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                    pre_brand_id)

            return pre_brand_id, pre_brand, match_type, \
                   brand_cat1_id, brand_cat1_name, cat1_id, cat1_name
        except Exception as e:
            raise e
Пример #5
0
    def brand_recognition(self, line_str):
        '''
        Recognise the standard brand of a single product record.

        The record is one text line made of five fields separated by the
        0x01 control character: product id, product name, a third field that
        is ignored here, first-level category id and first-level category
        name.

        Steps: parse the line, normalise the product name, collect English
        phone brands from the dedicated rule plus every known brand that
        occurs in the name, drop brands contained in other brands, apply the
        manual rules (``rule_opt``) and finally pick one brand, trying the
        longest brand names first.

        (Original note: brand_recognition is updated by gcw in 2020.09.12.)

        :param line_str: raw record line.
        :return: 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``.  All elements are
            ``None`` for a blank or malformed line; when no brand is matched
            or none survives ``rule_opt``, only ``match_type`` is set.
        '''
        empty_result = (None, None, None, None, None, None, None)
        no_match_result = (None, None, "没有匹配到标准品牌", None, None, None, None)

        line = line_str.strip()
        if line == "":
            return empty_result
        fields = [tmp.strip() for tmp in line.split("\001")]
        if len(fields) != 5:
            return empty_result
        # Product id and the third field are not used by this method.
        _, ori_product_name, _, cat1_id, cat1_name = fields

        # Per the original comment s_name is lower-cased here -- presumably
        # done inside tool.s_name_dealing; confirm against that helper.
        s_name = tool.s_name_dealing(ori_product_name)
        s_name = self.product_name_del_word(s_name)

        # clean_brand_lst: matched brand keys.
        # clean_brand_2_idx: matched brand key -> brand id.
        clean_brand_2_idx = {}
        clean_brand_lst = []

        # English phone brands (e.g. vivo, oppo) come from a dedicated rule.
        english_phone_brand_lst = self.brand_rule_obj.english_phone_rule_func(
            s_name, cat1_id)
        for epb_id, epb_name in english_phone_brand_lst:
            epb_name = epb_name + '|0'
            if epb_name not in self.brand_idx_dict:
                continue
            clean_brand_2_idx[epb_name] = epb_id
            clean_brand_lst.append(epb_name)

        for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
            # Keys look like "<brand>|<flag>"; compare in lower case.
            parts = ori_b_str.lower().split('|')
            if len(parts) != 2:
                continue
            b, is_eng = parts
            # Every accepting branch needs the substring hit, so test it first.
            if b not in s_name:
                continue
            if is_eng == "0" and tool.is_own_eng(s_name):
                # Flag "0" brands inside names accepted by is_own_eng must
                # also be confirmed by the English matcher.
                if self.english_brand_recognition(b, s_name) is None:
                    continue
            for tmp_bid in same_brand_id_lst:
                # NOTE(review): only the last id of same_brand_id_lst is
                # remembered for this brand key -- confirm this is intended.
                clean_brand_2_idx[ori_b_str] = tmp_bid
                clean_brand_lst.append(ori_b_str)

        # Remove recognised brands that are contained in other ones.
        clean_brand_lst = self.brand_inclusion_relation_dealing(
            clean_brand_lst)

        # Group the surviving brand keys by brand id.  The original tested
        # membership in clean_brand_2_idx (keyed by brand) instead of the
        # dict being built, so brands sharing an id overwrote each other.
        # Repeated names are stored once per id.
        clean_idx_2_brand = {}
        for brand in clean_brand_lst:
            same_id_brands = clean_idx_2_brand.setdefault(
                clean_brand_2_idx[brand], [])
            if brand not in same_id_brands:
                same_id_brands.append(brand)

        if len(clean_brand_lst) == 0:
            return no_match_result
        # Filtering by the manually maintained rules.
        clean_brand_lst, clean_brand_id_lst = self.rule_opt(
            clean_brand_lst, clean_idx_2_brand,
            ori_product_name, cat1_id, cat1_name)

        if len(clean_brand_lst) == 0:
            return no_match_result

        if len(clean_brand_id_lst) == 1:
            # Exactly one brand id left: nothing to disambiguate.
            b_id = clean_brand_id_lst[0]
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
            return b_id, \
                   self.name_ori_brand_dict[b_id], \
                   "匹配到唯一标准品牌", brand_cat1_id, brand_cat1_name, cat1_id, cat1_name

        pre_brand_id, pre_brand, match_type = None, None, None
        if len(clean_brand_lst) == 1:
            pre_brand_id, pre_brand, match_type = \
                self.same_cat1_gmv_strategy(cat1_id, clean_brand_lst, clean_brand_id_lst)
        else:
            # Bucket the candidates by name length and try the longest first.
            len_brand_dict = {}
            for tmp in clean_brand_lst:
                len_brand_dict.setdefault(len(tmp), []).append(tmp)

            for tmp_l in sorted(len_brand_dict, reverse=True):
                tmp_clean_brand_lst = len_brand_dict[tmp_l]
                pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                    cat1_id, tmp_clean_brand_lst, clean_brand_id_lst)
                if pre_brand is None or pre_brand_id is None:
                    # Category-based choice failed: fall back to the
                    # same-length strategy for this bucket.
                    pre_brand_id, pre_brand, match_type = self.same_length_strategy(
                        tmp_clean_brand_lst, clean_brand_id_lst)

                if pre_brand is not None and match_type is not None:
                    break

        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None
        return pre_brand_id, pre_brand, match_type, \
               brand_cat1_id, brand_cat1_name, cat1_id, cat1_name