Пример #1
0
    def inc_data_brand_reg_(self):
        """Clean the incremental file and merge the result into the base file.

        Reads ``self._inc_p`` line by line. A valid line has the form
        ``sid<TAB>ori_name``. Records whose sid is already present in
        ``self._old_sid_dict`` are skipped; the others are pre-processed with
        ``tool.s_name_dealing`` and passed to ``self.brand_reg.brand_rewrite``.

        Files written, in order:
          * ``self._output_p + "_inc_data_tmp"``: only the newly cleaned records;
          * ``self._output_p``: ``self._old_data`` followed by the new records;
          * ``self._ori_p + "_" + <yy-mm-dd_HH>``: hourly backup of ``self._ori_p``;
          * ``self._ori_p``: overwritten with the content of ``self._output_p``.

        Raises:
            Exception: if ``self._inc_p`` does not exist. Any other error
                (I/O, cleaning) propagates unchanged.
        """
        if not os.path.exists(self._inc_p):
            raise Exception("%s does not exist!" % self._inc_p)

        self.logger.info("reading data %s" % self._inc_p)
        inc_data_lst = []
        # The context manager guarantees the input file is closed even when
        # the cleaning step raises halfway through the file.
        with open(self._inc_p) as inc_fn:
            for idx, line in enumerate(inc_fn, 1):
                if idx % 100000 == 0:
                    self.logger.info(idx)  # progress marker
                line = line.strip()
                if line == "":
                    continue

                lst1 = line.split("\t")
                if len(lst1) != 2:
                    self.logger.info("inc_data_brand_reg error data: %s" %
                                     line)
                    continue
                sid, ori_name = lst1
                sid = sid.strip()

                # Incremental cleaning: ids that already exist in the old
                # data were cleaned before and are not processed again.
                if sid in self._old_sid_dict:
                    continue

                # Pre-processed product name (noise removed by the helper).
                s_name = tool.s_name_dealing(ori_name)
                s_tmp = "\x01".join([sid, ori_name, s_name])
                r = self.brand_reg.brand_rewrite(s_tmp)
                if r is None:
                    self.logger.error("brand-reg error: %s" % lst1)
                    continue
                inc_data_lst.append(r)
        self.logger.info("inc_data len: %s" % len(inc_data_lst))

        # This file only holds the cleaning result of the incremental data.
        with open(self._output_p + "_inc_data_tmp", 'w') as f3:
            f3.write("\n".join(inc_data_lst))

        # This file holds the complete result: old data plus increment.
        tmp_lst = self._old_data + inc_data_lst
        self.logger.info("total data: %s" % len(tmp_lst))
        with open(self._output_p, 'w') as f3:
            f3.write("\n".join(tmp_lst))

        # Hour-level backup of the base file before it is replaced.
        self.logger.info(
            "changing base_file: dp_brand_result.txt.brandreg")
        time_str = time.strftime("%y-%m-%d_%H")
        backup_p = self._ori_p + "_" + time_str
        self.logger.info("%s -> %s" % (self._ori_p, backup_p))
        shutil.copyfile(self._ori_p, backup_p)

        # The freshly written result becomes the new base file.
        self.logger.info("%s -> %s" % (self._output_p, self._ori_p))
        shutil.copyfile(self._output_p, self._ori_p)
    def brand_rewrite(self, line):
        """Normalise the brand field of one SOH (0x01) separated record.

        The record must contain exactly three fields: id, original product
        name and brand. The brand is kept when it is found in
        ``self.real_brand_set`` (or when that set is empty); otherwise
        ``self._real_brand_reg`` is asked for a replacement based on the
        pre-processed product name. The result is then passed through
        ``self._brand_exchange``; if the exchanged brand is listed in
        ``self.del_brand_dict`` the pre-processed product name is used in
        its place.

        Returns:
            The record re-joined as ``id, original name, final brand`` with
            the SOH separator, or ``None`` for blank or malformed input.
        """
        line = line.strip()
        if line == "":
            self.logger.info("empty string!!")
            return None

        fields = line.split("\x01")
        if len(fields) != 3:
            self.logger.info("brand_rewrite error data: %s" % line)
            return None
        s_id, ori_name, s_brand = fields
        s_brand = s_brand.strip()

        s_name = tool.s_name_dealing(ori_name)

        # Default: keep the incoming brand. It is only replaced when a
        # reference set exists, the brand is not part of it, and the
        # matcher actually finds a brand in the product name.
        tmp_brand = s_brand
        if len(self.real_brand_set) > 0 and s_brand not in self.real_brand_set:
            ex_brand = self._real_brand_reg(s_name)
            if ex_brand is not None:
                tmp_brand = ex_brand

        # Brand correction.
        r_brand = self._brand_exchange(tmp_brand)
        # Wrong-brand detection: fall back to the cleaned product name.
        if r_brand in self.del_brand_dict:
            r_brand = s_name

        return "\x01".join([s_id, ori_name, r_brand])
Пример #3
0
    def deal_line_data(self, data_single):
        """Clean a batch of ``sid<TAB>ori_name`` lines.

        Prints the batch size, then walks through the lines. Blank lines,
        lines that do not split into exactly two tab-separated fields (logged)
        and records whose sid is already in ``self._old_sid_dict`` are skipped.
        Every remaining record is sent to ``self.brand_reg.brand_rewrite``;
        a ``None`` answer is logged as an error and dropped.

        Returns:
            list: the rewritten records, in input order.
        """
        print(len(data_single))
        cleaned = []
        for line_no, raw in enumerate(data_single, 1):
            if line_no % 50000 == 0:
                self.logger.info(line_no)  # progress marker

            record = raw.strip()
            if record == "":
                continue

            fields = record.split("\t")
            if len(fields) != 2:
                self.logger.info("inc_data_brand_reg error data: %s" % record)
                continue

            sid = fields[0].strip()
            ori_name = fields[1]
            # Incremental cleaning: ids known from the old data are skipped.
            if sid in self._old_sid_dict:
                continue

            # Pre-processed product name (noise removed by the helper),
            # joined with id and original name into one SOH-separated record.
            packed = "\x01".join(
                [sid, ori_name, tool.s_name_dealing(ori_name)])
            rewritten = self.brand_reg.brand_rewrite(packed)
            if rewritten is None:
                self.logger.error("brand-reg error: %s" % fields)
                continue
            cleaned.append(rewritten)
        return cleaned
Пример #4
0
    def inc_data_brand_reg(self):
        """Clean the incremental file with ``self._brand_reg`` and merge it.

        Reads ``self._inc_p`` line by line. A valid line has the form
        ``sid<TAB>ori_name``. Records whose sid is already present in
        ``self._old_sid_dict`` are skipped; for the others the pre-processed
        product name is passed to ``self._brand_reg`` and the record is
        stored as ``sid, ori_name, brand`` joined with SOH (0x01).

        Files written, in order:
          * ``self._output_p + "_inc_data_tmp"``: only the new records;
          * ``self._output_p``: ``self._old_data`` followed by the new records;
          * ``self._ori_p + "_" + <yy-mm-dd_HH>``: backup of ``self._ori_p``;
          * ``self._ori_p``: overwritten with the content of ``self._output_p``.

        Returns:
            int: always ``0`` on success.

        Raises:
            Exception: if ``self._inc_p`` does not exist. Any other error
                propagates unchanged.
        """
        if not os.path.exists(self._inc_p):
            raise Exception("%s does not exist!" % self._inc_p)

        inc_data_lst = []
        # The context manager guarantees the input file is closed even when
        # brand recognition raises halfway through the file.
        with open(self._inc_p) as inc_fn:
            for idx, line in enumerate(inc_fn, 1):
                if idx % 100000 == 0:
                    self.logger.info(idx)  # progress marker
                line = line.strip()
                if line == "":
                    continue

                lst1 = line.split("\t")
                if len(lst1) != 2:
                    self.logger.info("inc_data_brand_reg error data: %s" %
                                     line)
                    continue
                sid, ori_name = lst1
                sid = sid.strip()

                # Ids already present in the old data are not cleaned again.
                if sid in self._old_sid_dict:
                    continue

                s_name = tool.s_name_dealing(ori_name)
                # Recognise the brand from the cleaned incremental name.
                tmp_brand = self._brand_reg(s_name)
                if tmp_brand is None:
                    self.logger.error("brand-reg error: %s" % lst1)
                    continue
                inc_data_lst.append("\x01".join([sid, ori_name, tmp_brand]))

        self.logger.info("inc_data len: %s" % len(inc_data_lst))

        with open(self._output_p + "_inc_data_tmp", 'w') as f3:
            f3.write("\n".join(inc_data_lst))

        # Old data followed by the increment forms the complete result.
        tmp_lst = self._old_data + inc_data_lst
        self.logger.info("total data: %s" % len(tmp_lst))
        with open(self._output_p, 'w') as f3:
            f3.write("\n".join(tmp_lst))

        self.logger.info(
            "liwei-method changing base_file: dp_brand_result.txt.brandreg"
        )
        # Back up the base file (hour resolution), then replace it.
        time_str = time.strftime("%y-%m-%d_%H")
        shutil.copyfile(self._ori_p, self._ori_p + "_" + time_str)
        shutil.copyfile(self._output_p, self._ori_p)

        return 0
Пример #5
0
    def brand_recognition(self, line_str):
        """Recognise the standard brand of one product record.

        ``line_str`` must hold four SOH (0x01) separated fields: product id,
        original product name, second-level category id and second-level
        category name. The category pair is also used as the "cat1" pair
        that is handed to the strategies and returned.

        Returns:
            A 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``. All items are ``None``
            for blank or malformed input, for names rejected by
            ``no_brand_word_func`` and for fixed-point products without a
            brand id. When no candidate brand is found (or none survives the
            manual rules) only the third item is set, to a "no standard
            brand matched" message.
        """
        no_result = (None, None, None, None, None, None, None)
        no_match = (None, None, "没有匹配到标准品牌", None, None, None, None)

        line = line_str.strip()
        if line == "":
            return no_result
        raw_fields = line.split("\001")
        if len(raw_fields) != 4:
            return no_result
        product_id, ori_product_name, cat2_id, cat2_name = [
            field.strip() for field in raw_fields
        ]
        # Important: this variant treats the second-level category as cat1.
        cat1_id, cat1_name = cat2_id, cat2_name

        # Normalised product name used for all substring matching below.
        s_name = tool.s_name_dealing(ori_product_name)
        if self.brand_rule_obj.no_brand_word_func(s_name):
            return no_result
        # Remove the words configured for deletion.
        s_name = self.brand_rule_obj.product_name_del_word_func(s_name)

        # Products with a fixed-point rule bypass recognition entirely.
        fixed_flag, r_bid = self.brand_rule_obj.fixed_point_func(product_id)
        if fixed_flag:
            if r_bid is None:
                return no_result
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(r_bid)
            return r_bid, self.name_ori_brand_dict[r_bid], "", \
                   brand_cat1_id, brand_cat1_name, cat1_id, cat1_name

        # Collect every brand whose (lower-cased) name occurs in s_name.
        # Keys of brand_idx_dict have the form "<brand>|<flag>"; each key
        # maps to the ids sharing that cleaned brand name.
        clean_brand_2_idx = {}
        clean_brand_lst = []
        for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
            b_str = ori_b_str.lower()
            parts = b_str.split('|')
            if len(parts) != 2:
                continue
            b, is_eng = parts
            if is_eng == "0" and tool.is_own_eng(s_name) and b in s_name:
                # Flag "0" brands inside a name containing English text need
                # the dedicated English matcher to confirm the hit.
                if self.english_brand_recognition(b, s_name) is None:
                    continue
            elif b not in s_name:
                continue
            for tmp_bid in same_brand_id_lst:
                clean_brand_lst.append(b_str)
                clean_brand_2_idx[b_str] = tmp_bid

        # Drop brands that are contained in another recognised brand.
        clean_brand_lst = self.brand_inclusion_relation_dealing(
            clean_brand_lst)

        # Invert the mapping: brand id -> list of cleaned brand names.
        # (The membership test must be made against this id-keyed dict; the
        # previous code tested the name-keyed dict, so names sharing an id
        # were never accumulated.)
        clean_idx_2_brand = {}
        for brand_name in clean_brand_lst:
            clean_idx_2_brand.setdefault(
                clean_brand_2_idx[brand_name], []).append(brand_name)

        if len(clean_brand_lst) == 0:
            return no_match
        # Manually configured filtering rules.
        clean_brand_lst, clean_brand_id_lst = self.rule_opt(
            clean_brand_lst, clean_idx_2_brand,
            ori_product_name, cat1_id, cat1_name)
        if len(clean_brand_lst) == 0:
            return no_match

        pre_brand_id, pre_brand, match_type = None, None, None
        if len(clean_brand_id_lst) == 1:
            pre_brand_id = clean_brand_id_lst[0]
            pre_brand = self.name_ori_brand_dict[pre_brand_id]
            match_type = "匹配到唯一标准品牌"
        else:
            # Group the candidates by name length and try the longest first.
            len_brand_dict = {}
            for brand_name in clean_brand_lst:
                len_brand_dict.setdefault(len(brand_name), []).append(brand_name)

            for length in sorted(len_brand_dict, reverse=True):
                tmp_clean_brand_lst = len_brand_dict[length]
                pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                    cat1_id, tmp_clean_brand_lst, clean_brand_id_lst)
                if pre_brand is None or pre_brand_id is None:
                    # The same-category strategy found nothing; fall back
                    # to the same-length strategy.
                    pre_brand_id, pre_brand, match_type = self.same_length_strategy(
                        tmp_clean_brand_lst, clean_brand_id_lst)

                if pre_brand is not None and match_type is not None:
                    break

        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None

        return pre_brand_id, pre_brand, match_type, \
               brand_cat1_id, brand_cat1_name, cat1_id, cat1_name
Пример #6
0
    def brand_same_cat1_recognition(self, line_str):
        """Recognise the standard brand of one record using cat1 strategies.

        ``line_str`` must hold five SOH (0x01) separated fields: product id,
        original product name, original brand word, first-level category id
        and first-level category name. Product name and brand word are
        joined and normalised before matching.

        Returns:
            A 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``. All items are ``None``
            for blank or malformed input. When no candidate brand is found
            (or none survives the manual rules) only the third item is set,
            to a "no standard brand matched" message.
        """
        no_result = (None, None, None, None, None, None, None)
        no_match = (None, None, "没有匹配到标准品牌", None, None, None, None)

        line = line_str.strip()
        if line == "":
            return no_result
        raw_fields = line.split("\001")
        if len(raw_fields) != 5:
            return no_result
        product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = [
            field.strip() for field in raw_fields
        ]

        # Normalised "name + brand word" text used for substring matching.
        s_name = tool.s_name_dealing("%s %s" % (ori_product_name, ori_brand_word))
        s_name = self.product_name_del_word(s_name)

        # Collect every brand whose (lower-cased) name occurs in s_name.
        # Keys of brand_idx_dict have the form "<brand>|<flag>"; each key
        # maps to the ids sharing that cleaned brand name.
        clean_brand_lst = []
        clean_brand_id_lst = []
        for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
            parts = ori_b_str.lower().split('|')
            if len(parts) != 2:
                continue
            b, is_eng = parts
            if is_eng == "0" and tool.is_own_eng(s_name) and b in s_name:
                # Flag "0" brands inside a name containing English text need
                # the dedicated English matcher to confirm the hit.
                if self.english_brand_recognition(b, s_name) is None:
                    continue
            elif b not in s_name:
                continue
            clean_brand_lst.append(ori_b_str)
            clean_brand_id_lst += same_brand_id_lst

        clean_brand_lst = list(set(clean_brand_lst))
        clean_brand_id_lst = list(set(clean_brand_id_lst))
        if len(clean_brand_lst) == 0:
            return no_match
        # Manually configured filtering rules.
        clean_brand_lst, clean_brand_id_lst = self.rule_opt(clean_brand_lst, ori_product_name, cat1_id, cat1_name)
        if len(clean_brand_lst) == 0:
            return no_match

        if len(clean_brand_id_lst) == 1:
            # Exactly one brand id left: unique match.
            b_id = clean_brand_id_lst[0]
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
            return b_id, \
                   self.name_ori_brand_dict[b_id], \
                   "匹配到唯一标准品牌", brand_cat1_id, brand_cat1_name, cat1_id, cat1_name

        pre_brand_id, pre_brand, match_type = None, None, None
        if len(clean_brand_lst) == 1:
            # One brand name but several ids: let the GMV strategy decide.
            pre_brand_id, pre_brand, match_type = \
                self.same_cat1_gmv_strategy(cat1_id, clean_brand_lst, clean_brand_id_lst)
        else:
            # Group the candidates by name length and try the longest first.
            len_brand_dict = {}
            for brand_name in clean_brand_lst:
                len_brand_dict.setdefault(len(brand_name), []).append(brand_name)

            for length in sorted(len_brand_dict, reverse=True):
                pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                    cat1_id, len_brand_dict[length], clean_brand_id_lst)
                if pre_brand is not None and match_type is not None:
                    break
            # Last resort when no length group produced a brand.
            if pre_brand is None and match_type is None:
                pre_brand_id, pre_brand, match_type = \
                    self.same_cat1_gmv_strategy(cat1_id, clean_brand_lst, clean_brand_id_lst)

        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None
        return pre_brand_id, pre_brand, match_type, \
               brand_cat1_id, brand_cat1_name, cat1_id, cat1_name
Пример #7
0
    def brand_reg_opt(self, reg_brand_dict, line_str):
        """Recognise the standard brand of one record against ``reg_brand_dict``.

        Args:
            reg_brand_dict: mapping whose keys have the form
                ``"<brand>|<flag>"`` and whose values are the brand ids
                sharing that name; it is iterated with ``.items()``.
            line_str: five SOH (0x01) separated fields: product id, original
                product name, original brand word, first-level category id
                and first-level category name.

        Returns:
            A 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``. All items are ``None``
            for blank or malformed input. When no candidate brand is found
            (or none survives the manual rules) only the third item is set,
            to a "no standard brand matched" message.
        """
        no_result = (None, None, None, None, None, None, None)
        no_match = (None, None, "没有匹配到标准品牌", None, None, None, None)

        line = line_str.strip()
        if line == "":
            return no_result
        raw_fields = line.split("\001")
        if len(raw_fields) != 5:
            return no_result
        product_id, ori_product_name, ori_brand_word, cat1_id, cat1_name = [
            field.strip() for field in raw_fields
        ]

        # Normalised "name + brand word" text used for substring matching.
        s_name = tool.s_name_dealing("%s %s" %
                                     (ori_product_name, ori_brand_word))
        s_name = self.product_name_del_word(s_name)

        # brand id -> brand key of every candidate found so far.
        clean_idx_2_brand = {}

        # English phone brands (e.g. vivo, oppo) come from a dedicated rule;
        # only those known to brand_idx_dict are accepted.
        english_phone_brands = self.brand_rule_obj.english_phone_rule_func(
            s_name, cat1_id)
        for epb_id, epb_name in english_phone_brands:
            epb_name = epb_name + '|0'
            if epb_name not in self.brand_idx_dict:
                continue
            clean_idx_2_brand[epb_id] = epb_name

        # Every brand of reg_brand_dict whose lower-cased name is in s_name.
        for ori_b_str, same_brand_id_lst in reg_brand_dict.items():
            parts = ori_b_str.lower().split('|')
            if len(parts) != 2:
                continue
            b, is_eng = parts
            if is_eng == "0" and tool.is_own_eng(s_name) and b in s_name:
                # Flag "0" brands inside a name containing English text need
                # the dedicated English matcher to confirm the hit.
                if self.english_brand_recognition(b, s_name) is None:
                    continue
            elif b not in s_name:
                continue
            for tmp_bid in same_brand_id_lst:
                clean_idx_2_brand[tmp_bid] = ori_b_str

        clean_brand_lst = list(set(clean_idx_2_brand.values()))
        clean_brand_id_lst = list(set(clean_idx_2_brand.keys()))
        if len(clean_brand_lst) == 0:
            return no_match
        # Manually configured filtering rules.
        # NOTE(review): the id list is passed as first argument here, as in
        # the original code — confirm this matches rule_opt's signature.
        clean_brand_lst, clean_brand_id_lst = self.rule_opt(
            clean_brand_id_lst, clean_idx_2_brand, ori_product_name,
            cat1_id, cat1_name)
        if len(clean_brand_lst) == 0:
            return no_match

        if len(clean_brand_id_lst) == 1:
            # Exactly one brand id left: unique match.
            b_id = clean_brand_id_lst[0]
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
            return b_id, \
                   self.name_ori_brand_dict[b_id], \
                   "匹配到唯一标准品牌", brand_cat1_id, brand_cat1_name, cat1_id, cat1_name

        pre_brand_id, pre_brand, match_type = None, None, None
        if len(clean_brand_lst) == 1:
            # One brand name but several ids: let the GMV strategy decide.
            pre_brand_id, pre_brand, match_type = \
                self.same_cat1_gmv_strategy(cat1_id, clean_brand_lst, clean_brand_id_lst)
        else:
            # Group the candidates by name length and try the longest first.
            len_brand_dict = {}
            for brand_name in clean_brand_lst:
                len_brand_dict.setdefault(len(brand_name), []).append(brand_name)

            for length in sorted(len_brand_dict, reverse=True):
                tmp_clean_brand_lst = len_brand_dict[length]
                pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                    cat1_id, tmp_clean_brand_lst, clean_brand_id_lst)
                if pre_brand is None or pre_brand_id is None:
                    # Same-category strategy found nothing; fall back to
                    # the same-length strategy.
                    pre_brand_id, pre_brand, match_type = self.same_length_strategy(
                        tmp_clean_brand_lst, clean_brand_id_lst)

                if pre_brand is not None and match_type is not None:
                    break

        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None

        return pre_brand_id, pre_brand, match_type, \
               brand_cat1_id, brand_cat1_name, cat1_id, cat1_name
Пример #8
0
    def brand_recognition(self, line_str):
        """Recognise the standard brand of one product record.

        ``line_str`` must hold five SOH (0x01) separated fields: product id,
        original product name, a third field that is ignored here,
        first-level category id and first-level category name.

        Returns:
            A 7-tuple ``(brand_id, brand_name, match_type, brand_cat1_id,
            brand_cat1_name, cat1_id, cat1_name)``. All items are ``None``
            for blank or malformed input. When no candidate brand is found
            (or none survives the manual rules) only the third item is set,
            to a "no standard brand matched" message.
        """
        no_result = (None, None, None, None, None, None, None)
        no_match = (None, None, "没有匹配到标准品牌", None, None, None, None)

        line = line_str.strip()
        if line == "":
            return no_result
        raw_fields = line.split("\001")
        if len(raw_fields) != 5:
            return no_result
        product_id, ori_product_name, _, cat1_id, cat1_name = [
            field.strip() for field in raw_fields
        ]

        # Normalised product name used for all substring matching below.
        s_name = tool.s_name_dealing(ori_product_name)
        s_name = self.product_name_del_word(s_name)

        clean_brand_2_idx = {}  # brand key -> brand id
        clean_brand_lst = []

        # English phone brands (e.g. vivo, oppo) come from a dedicated rule;
        # only those known to brand_idx_dict are accepted.
        english_phone_brands = self.brand_rule_obj.english_phone_rule_func(
            s_name, cat1_id)
        for epb_id, epb_name in english_phone_brands:
            epb_name = epb_name + '|0'
            if epb_name not in self.brand_idx_dict:
                continue
            clean_brand_2_idx[epb_name] = epb_id
            clean_brand_lst.append(epb_name)

        # Every brand of brand_idx_dict whose lower-cased name is in s_name.
        # Keys have the form "<brand>|<flag>"; each key maps to the ids
        # sharing that cleaned brand name.
        for ori_b_str, same_brand_id_lst in self.brand_idx_dict.items():
            parts = ori_b_str.lower().split('|')
            if len(parts) != 2:
                continue
            b, is_eng = parts
            if is_eng == "0" and tool.is_own_eng(s_name) and b in s_name:
                # Flag "0" brands inside a name containing English text need
                # the dedicated English matcher to confirm the hit.
                if self.english_brand_recognition(b, s_name) is None:
                    continue
            elif b not in s_name:
                continue
            for tmp_bid in same_brand_id_lst:
                clean_brand_2_idx[ori_b_str] = tmp_bid
                clean_brand_lst.append(ori_b_str)

        # Drop brands that are contained in another recognised brand.
        clean_brand_lst = self.brand_inclusion_relation_dealing(
            clean_brand_lst)

        # Invert the mapping: brand id -> list of brand keys.
        # (The membership test must be made against this id-keyed dict; the
        # previous code tested the name-keyed dict, so names sharing an id
        # were never accumulated.)
        clean_idx_2_brand = {}
        for brand_name in clean_brand_lst:
            clean_idx_2_brand.setdefault(
                clean_brand_2_idx[brand_name], []).append(brand_name)

        if len(clean_brand_lst) == 0:
            return no_match
        # Manually configured filtering rules.
        clean_brand_lst, clean_brand_id_lst = self.rule_opt(
            clean_brand_lst, clean_idx_2_brand,
            ori_product_name, cat1_id, cat1_name)
        if len(clean_brand_lst) == 0:
            return no_match

        if len(clean_brand_id_lst) == 1:
            # Exactly one brand id left: unique match.
            b_id = clean_brand_id_lst[0]
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(b_id)
            return b_id, \
                   self.name_ori_brand_dict[b_id], \
                   "匹配到唯一标准品牌", brand_cat1_id, brand_cat1_name, cat1_id, cat1_name

        pre_brand_id, pre_brand, match_type = None, None, None
        if len(clean_brand_lst) == 1:
            # One brand name but several ids: let the GMV strategy decide.
            pre_brand_id, pre_brand, match_type = \
                self.same_cat1_gmv_strategy(cat1_id, clean_brand_lst, clean_brand_id_lst)
        else:
            # Group the candidates by name length and try the longest first.
            len_brand_dict = {}
            for brand_name in clean_brand_lst:
                len_brand_dict.setdefault(len(brand_name), []).append(brand_name)

            for length in sorted(len_brand_dict, reverse=True):
                tmp_clean_brand_lst = len_brand_dict[length]
                pre_brand_id, pre_brand, match_type = self.same_cat1_strategy(
                    cat1_id, tmp_clean_brand_lst, clean_brand_id_lst)
                if pre_brand is None or pre_brand_id is None:
                    # Same-category strategy found nothing; fall back to
                    # the same-length strategy.
                    pre_brand_id, pre_brand, match_type = self.same_length_strategy(
                        tmp_clean_brand_lst, clean_brand_id_lst)

                if pre_brand is not None and match_type is not None:
                    break

        if pre_brand_id is not None:
            brand_cat1_id, brand_cat1_name = self.getting_cat1_info(
                pre_brand_id)
        else:
            brand_cat1_id, brand_cat1_name = None, None
        return pre_brand_id, pre_brand, match_type, \
               brand_cat1_id, brand_cat1_name, cat1_id, cat1_name
Пример #9
0
    def brand_reg_main(self, ori_product_name, cat1_name, cat2_name, cat3_name):
        """Recognise a product's brand, walking up the category levels.

        Candidate brands are first looked up among the brands registered for
        the third-level category. If nothing is recognised, the second-level
        and then the first-level category are tried, each only when its
        switch (``self.is_cat2_brand_reg`` / ``self.is_cat1_brand_reg``) is
        truthy.

        Returns:
            ``(brand_id, brand_name, original_brand_name)``, or
            ``(None, None, None)`` when the name is rejected by
            ``no_brand_word_func`` or no brand is recognised. The original
            name is looked up in the cat3 table, then the cat2 table, and
            is ``''`` when found in neither.

        Raises:
            Exception: if the product name or ``cat3_name`` is blank, or if
                ``cat3_name`` is not a key of ``self.cat3_to_brandId_dict``.
        """
        ori_product_name = ori_product_name.strip()
        cat3_name = cat3_name.strip()
        if ori_product_name == "":
            raise Exception("ori_product_name is empty!")
        if cat3_name == '':
            raise Exception("cat3_name is empty!")
        if cat3_name not in self.cat3_to_brandId_dict:
            raise Exception("%s is not in self.cat3_to_brandId_dict" % cat3_name)

        # Sorted extended names of every cat3 brand known to both tables.
        cat3_ext_name = {
            bid: self._ext_name_sorted(self.cat3_ext_brandId_name_dict[bid])
            for bid in self.cat3_to_brandId_dict[cat3_name]
            if bid in self.cat3_ori_brandId_name_dict
            and bid in self.cat3_ext_brandId_name_dict
        }

        # Normalise the name and remove the words configured for deletion.
        s_name = self.brand_rule_obj.product_name_del_word_func(
            tool.s_name_dealing(ori_product_name))

        # Names containing a "no brand" word are not processed any further.
        if self.brand_rule_obj.no_brand_word_func(s_name):
            return None, None, None

        # Third-level category recognition.
        reg_bname_lst = []
        for brand_id, ext_names in cat3_ext_name.items():
            reg_bname_lst.extend(self.brand_reg(s_name, ext_names, brand_id))

        # Second-level category recognition (fallback).
        if not reg_bname_lst and self.is_cat2_brand_reg:
            cat2_ext_name = {
                bid: self._ext_name_sorted(self.cat2_ext_brandId_name_dict[bid])
                for bid in self.cat2_to_brandId_dict[cat2_name]
                if bid in self.cat2_ori_brandId_name_dict
                and bid in self.cat2_ext_brandId_name_dict
            }
            for brand_id, ext_names in cat2_ext_name.items():
                reg_bname_lst.extend(self.brand_reg(s_name, ext_names, brand_id))

        # First-level category recognition (fallback).
        if not reg_bname_lst and self.is_cat1_brand_reg:
            cat1_ext_name = {
                bid: self._ext_name_sorted(self.cat1_ext_brandId_name_dict[bid])
                for bid in self.cat1_to_brandId_dict[cat1_name]
                if bid in self.cat1_ori_brandId_name_dict
                and bid in self.cat1_ext_brandId_name_dict
            }
            for brand_id, ext_names in cat1_ext_name.items():
                reg_bname_lst.extend(self.brand_reg(s_name, ext_names, brand_id))

        if not reg_bname_lst:
            return None, None, None

        # Manually configured rules; each surviving item carries the brand
        # id at index 1.
        rule_opt_lst = self.rule_opt(s_name, reg_bname_lst, cat3_name)
        brand_id_set = {item[1] for item in rule_opt_lst}
        if not brand_id_set:
            return None, None, None

        if len(brand_id_set) == 1:
            reg_bname, reg_bid = rule_opt_lst[0]
        else:
            reg_bname, reg_bid = self._multi_brand_opt(rule_opt_lst)

        # Original brand name: cat3 table first, then cat2, else empty.
        r_ori_name = ''
        if reg_bid in self.cat3_ori_brandId_name_dict:
            r_ori_name = self.cat3_ori_brandId_name_dict[reg_bid]
        elif reg_bid in self.cat2_ori_brandId_name_dict:
            r_ori_name = self.cat2_ori_brandId_name_dict[reg_bid]
        return reg_bid, reg_bname, r_ori_name
Пример #10
0
#encon
from brand_reg_tool import BrandRegTool
import tool
import traceback
import configparser

# NOTE(review): the return value is discarded and no exception is being
# handled at this point, so this call has no visible effect - confirm
# whether it can be removed.
traceback.format_exc()
# Constructor signature kept for reference (presumably BrandRegTool's - verify):
# def __init__(self, standard_brand_file, del_brand_file=None, exchange_brand_file=None, rule_brand_file=None):
#
# Recognition tool built from four data/config files given as relative paths.
bReg = BrandRegTool("brand_recall_info.txt", "del_brand_info.txt",
                    "exchange_brand_info.txt", "rule_brand.cfg")

# Sample strings for manual experiments with tool.s_name_dealing.
s1 = " 行营"
s2 = tool.s_name_dealing('(买一送一)手工精绑主线组')
s3 = tool.s_name_dealing("行营")
# Debug output, disabled:
# print(tool.s_name_dealing(s2 + " " + s3))

# Debug output, disabled:
# print(bReg.brand_idx_dict)

# Module-level state; it is not used within the visible part of the file.
idx = 0
err_lst = []
flag = 0


def write_config(new_test, ori_config):
    config = configparser.ConfigParser()
    config.read(ori_config, encoding="utf-8")
    sec = config.sections()

    if new_test[0] not in sec:
        config.add_section(new_test[0])