Example #1
    def __init__(self, corpus_file, outfile):
        if not os.path.exists(corpus_file):
            raise FileNotFoundError("file {} not found.".format(corpus_file))
        self.clean_doc_obj = CleanDoc(stopwords_file="./data/stopwords.txt")
        self.corpus_file = corpus_file
        self.outfile = outfile
        self.get_train_format_file()
Example #2
    def __init__(self, file, outfile):
        self.file = file
        self.outfile = outfile
        self.clean_doc_obj = CleanDoc(stopwords_file="./data/stopwords.txt")
        file_path, file_extend = os.path.splitext(outfile)
        self.corpus_file = "{}_corpus{}".format(file_path, file_extend)
        self.error_file = "{}_error{}".format(file_path, file_extend)
        self.get_clean_data()
Example #3
    def __init__(self, corpus_file, out_dir):
        if not os.path.exists(corpus_file):
            raise FileNotFoundError("file {} not found.".format(corpus_file))
        self.clean_doc_obj = CleanDoc()
        self.corpus_file = corpus_file
        self.train_file = os.path.join(out_dir, "train.txt")
        self.dev_file = os.path.join(out_dir, "validation.txt")
        self.test_file = os.path.join(out_dir, "test.txt")
        # self.analysis_label_dist()
        self.get_category_corpus_file()
Example #4
def clean_text_demo():
    title = "搞笑GIF:聪明人一看就知道怎么做到的!"
    content = "媳妇我不练了,不练了! 哎呀,悲催了 大哥,看着点路 一个月了原来我的篮球在这里? 不懂艺术的人们 怪我喽? 白猫:你干嘛,干嘛,干嘛…… 真是人间美味啊! 妹子,还以为你衣服穿反了 你会手指打结吗? 美女,你这喉结过分了啊 聪明人一看就知道怎么做到的! 面这两人也是躺着中枪 辣条都让你们玩涨价了! 妹子砸车的威武霸气,真是惹不起 这手势太吓人了,我要下车! 妹子都进电梯了,你还拉她出来干啥 妹子这么勤快呢,桌子很干净了,休息会吧 看得出,教练确实很受打击 去车展的有几个是卖车的,我感觉更多的人是看美女的啊! 讲道理,现在什么都是智能,连沙袋都变成智能的了 都捏不碎,老丈人表示不服,非要亲自示范一下 跟老师们一起吃饭,感觉很开心! 爱她就背起她走回家,妥妥的真爱呐 对女孩子来说,头发就是的她的命 长不大的妹子,连鬼你都戏弄 好玩 大哥你配合得太专业了啊 知道篮球场汤为什么总是会有这么多大神呢?你这里比较有市场需求 兄弟,现在的日子过的是越来越好了啊 啊!我的眼睛"
    clean_doc_obj = CleanDoc(stopwords_file="./data/stopwords.txt")
    c_title, title_char, title_token = clean_doc_obj.get_char_and_token_feature(
        title)
    print(c_title)
    print(title_char)
    c_content, content_char, content_token = clean_doc_obj.get_char_and_token_feature(
        content)
    print(c_content)
    print(content_token)
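
A minimal runner for the demo above, assuming CleanDoc and the ./data/stopwords.txt file it references are available locally:

if __name__ == "__main__":
    clean_text_demo()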
Example #5
import os
import json
from collections import OrderedDict

from sklearn.model_selection import StratifiedKFold

# CleanDoc is this project's text-cleaning helper, defined elsewhere.


class PrePublicCorpus(object):
    """
    Convert a public dataset into model-training data.
    """
    def __init__(self, corpus_file, out_dir):
        if not os.path.exists(corpus_file):
            raise FileNotFoundError("file {} not found.".format(corpus_file))
        self.clean_doc_obj = CleanDoc()
        self.corpus_file = corpus_file
        self.train_file = os.path.join(out_dir, "train.txt")
        self.dev_file = os.path.join(out_dir, "validation.txt")
        self.test_file = os.path.join(out_dir, "test.txt")
        # self.analysis_label_dist()
        self.get_category_corpus_file()

    def get_category_corpus_file(self):
        print(">>>>> preprocess corpus")
        X, Y = self.get_clean_data(category_level=1)
        train, dev, test = self.stratified_sampling(X, Y, 0.2)
        self.write_txt_file(train, self.train_file)
        self.write_txt_file(dev, self.dev_file)
        self.write_txt_file(test, self.test_file)

    def get_clean_data(self, category_level=1):
        X = list()
        Y = list()
        _count = dict()
        for line in self.read_json_format_file(self.corpus_file):
            if line:
                (_id, channel, title, content), y = self._preline(line)
                label = y[category_level - 1]

                _, title_char_feature, title_token_feature = self.clean_doc_obj.get_char_and_token_feature(
                    title)
                _, content_char_feature, content_token_feature = self.clean_doc_obj.get_char_and_token_feature(
                    content)

                # char-level features from title and content
                char_feature = " ".join(
                    list(title_char_feature)) + " ### " + " ".join(
                        list(content_char_feature))
                # token-level features from title and content
                token_feature = " ".join(
                    list(title_token_feature)) + " ### " + " ".join(
                        list(content_token_feature))

                # keep docs whose title is long enough, capping each
                # label at roughly 10k samples
                if len(title) > 10:
                    if label in _count:
                        if _count[label] > 10000:
                            continue
                        _count[label] += 1
                    else:
                        _count[label] = 1
                    X.append(token_feature + "\t" + char_feature + "\t" +
                             channel)
                    Y.append(label)
        return X, Y

    def stratified_sampling(self, x, y, valid_portion, validation=True):
        """
        按标签类别个数分层切分训练集,验证集,测试集
        :param x:
        :param y:
        :param valid_portion:
        :return:
        """
        skf = StratifiedKFold(n_splits=int(1 / valid_portion))
        train = None
        dev = None
        test = None

        index = [(train_index, dev_test_index)
                 for train_index, dev_test_index in skf.split(x, y)]

        train_label_count = self._label_count([y[i] for i in index[0][0]])
        print("train_label_count: {}".format(
            json.dumps(train_label_count, indent=4, ensure_ascii=False)))
        train = [y[i] + "\t" + x[i] for i in index[0][0]]
        if validation:
            dev_test_skf = StratifiedKFold(n_splits=2)
            dev_test_x = [x[j] for j in index[0][1]]
            dev_test_y = [y[j] for j in index[0][1]]
            dev_test_index = [(dev_index, test_index)
                              for dev_index, test_index in dev_test_skf.split(
                                  dev_test_x, dev_test_y)]
            validation_label_count = self._label_count(
                [dev_test_y[i] for i in dev_test_index[0][0]])
            print("validation_label_count: {}".format(
                json.dumps(validation_label_count,
                           indent=4,
                           ensure_ascii=False)))
            test_label_count = self._label_count(
                [dev_test_y[j] for j in dev_test_index[0][1]])
            print("test_label_count: {}".format(
                json.dumps(test_label_count, indent=4, ensure_ascii=False)))
            dev = [
                dev_test_y[j] + "\t" + dev_test_x[j]
                for j in dev_test_index[0][0]
            ]
            test = [
                dev_test_y[j] + "\t" + dev_test_x[j]
                for j in dev_test_index[0][1]
            ]
        else:
            dev = [y[j] + "\t" + x[j] for j in index[0][1]]
            test = dev

        return train, dev, test

    def _label_dist_count(self,
                          level1=None,
                          level2=None,
                          level3=None,
                          dist_count=None):
        """
        统计标签分布计算
        :param level1:一级标签
        :param level2:二级标签
        :param level3:三级标签
        :param dist_count:标签分布字典
        :return:
        """
        if level1:
            if level1 in dist_count:
                dist_count[level1]["count"] += 1
            else:
                dist_count[level1] = dict()
                dist_count[level1]["count"] = 1
            if level2:
                if level2 in dist_count[level1]:
                    dist_count[level1][level2]["count"] += 1
                else:
                    dist_count[level1][level2] = dict()
                    dist_count[level1][level2]["count"] = 1
                if level3:
                    if level3 in dist_count[level1][level2]:
                        dist_count[level1][level2][level3] += 1
                    else:
                        dist_count[level1][level2][level3] = 1

    @staticmethod
    def dict_sort(result, limit_num=None):
        """
        字典排序, 返回有序字典
        :param result:
        :param limit_num:
        :return:
        """
        _result_sort = sorted(result.items(),
                              key=lambda x: x[1]["count"],
                              reverse=True)
        result_sort = OrderedDict()

        count_limit = 0
        domain_count = 0
        for i in _result_sort:
            if limit_num:
                if i[1]["count"] > limit_num:
                    result_sort[i[0]] = i[1]
                    domain_count += 1
                    count_limit += i[1]["count"]
            else:
                result_sort[i[0]] = i[1]
        return result_sort

    @staticmethod
    def _preline(line):
        """
        处理文件行(dict格式)
        """
        article_id = ""
        channel = "thuc_news"
        title = line.get("text", "")
        content = ""
        top_category = line.get("label", "")
        sub_category = ""
        third_category = ""
        return (article_id, channel, title,
                content), (top_category, sub_category, third_category)

    def _label_count(self, label_list):
        label_count = dict()
        for i in label_list:
            if label_count.get(i, None) is not None:
                label_count[i] += 1
            else:
                label_count[i] = 1
        return label_count

    @staticmethod
    def read_json_format_file(file):
        """
        读取每行为json格式的文本
        :param file: 文件名
        :return: 每行文本
        """
        if not os.path.exists(file):
            raise FileNotFoundError("file {} not found.".format(file))
        print(">>>>> reading file:{}".format(file))
        line_count = 0
        with open(file, 'r') as f:
            while True:
                _line = f.readline()
                line_count += 1
                if not _line:
                    break
                else:
                    line = json.loads(_line.strip())
                    # line = eval(_line.strip())
                    if line_count % 100000 == 0:
                        print(">>>>> read {} lines.".format(line_count))
                    yield line

    def write_txt_file(self, data, file):
        """
        Write data lines to a file.
        """
        print(">>>>> start writing file")
        with open(file, "w") as f:
            for line in data:
                f.write(line + "\n")
        print("<<<<< done writing file: {}".format(file))
Example #6
import os

# CleanDoc and read_json_format_file are this project's shared helpers,
# defined elsewhere in the module.


class PreTestCorpus(object):
    """
    Process the spot-check data separately as a test set.
    """
    def __init__(self, corpus_file, outfile):
        if not os.path.exists(corpus_file):
            raise FileNotFoundError("file {} not found.".format(corpus_file))
        self.clean_doc_obj = CleanDoc(stopwords_file="./data/stopwords.txt")
        self.corpus_file = corpus_file
        self.outfile = outfile
        self.get_train_format_file()

    def get_train_format_file(self):
        print(">>>>> preprocess corpus")
        data = self.get_train_format_data(category_level=1)
        self.write_txt_file(data, self.outfile)

    def get_train_format_data(self, category_level=1):
        data = list()
        for line in read_json_format_file(self.corpus_file):
            if line:
                (_id, channel, title, content), y = self._preline(line)
                label = y[category_level - 1]
                _, title_char_feature, title_token_feature = self.clean_doc_obj.get_char_and_token_feature(
                    title)
                _, content_char_feature, content_token_feature = self.clean_doc_obj.get_char_and_token_feature(
                    content)

                # char-level features from title and content
                char_feature = " ".join(
                    list(title_char_feature)) + " ### " + " ".join(
                        list(content_char_feature))
                # token-level features from title and content
                token_feature = " ".join(
                    list(title_token_feature)) + " ### " + " ".join(
                        list(content_token_feature))
                data.append(label + "\t" + token_feature + "\t" +
                            char_feature + "\t" + channel)
            else:
                continue
        return data

    @staticmethod
    def _preline(line):
        """
        处理文件行(dict格式)
        """
        article_id = line.get("id", "")
        channel = ""
        title = line.get("title", "")
        content = line.get("content", "")
        top_category = line.get("manual_tag-3.0", "")
        sub_category = line.get("manual_subtag-3.0", "")
        third_category = ""

        return (article_id, channel, title,
                content), (top_category, sub_category, third_category)

    def write_txt_file(self, data, file):
        """
        Write data lines to a file.
        """
        print(">>>>> start writing file")
        with open(file, "w") as f:
            for line in data:
                f.write(line + "\n")
        print("<<<<< done writing file: {}".format(file))
Example #7
import os
import json

# CleanDoc is this project's text-cleaning helper, defined elsewhere.


class PreCorpusV1(object):
    """
    Run a new data-cleaning pass over the historical
    data version from @刘欢.
    """
    def __init__(self, file, outfile):
        self.file = file
        self.outfile = outfile
        self.clean_doc_obj = CleanDoc(stopwords_file="./data/stopwords.txt")
        file_path, file_extend = os.path.splitext(outfile)
        self.corpus_file = "{}_corpus{}".format(file_path, file_extend)
        self.error_file = "{}_error{}".format(file_path, file_extend)
        self.get_clean_data()

    def get_clean_data(self):
        clean_data = []
        with open(self.file, "r", encoding="utf-8") as f, \
                open(self.error_file, "w", encoding="utf-8") as ef, \
                open(self.corpus_file, "w", encoding="utf-8") as cf:
            for line in f:
                line_str = line.strip().split('\t')
                corpus_line = dict()
                if len(line_str) == 4:
                    label = line_str[0]
                    if "#   #   #" in line_str[1]:
                        text = line_str[1].split("#   #   #")
                    else:
                        text = line_str[1].split("###")
                    if len(text) == 2:
                        _title, _content = text[0], text[1]
                    else:
                        _title, _content = "", text[0]
                    title = "".join(_title.split(" "))
                    content = "".join(_content.split(" "))
                    _, title_char_feature, title_token_feature = self.clean_doc_obj.get_char_and_token_feature(
                        title)
                    _, content_char_feature, content_token_feature = self.clean_doc_obj.get_char_and_token_feature(
                        content)

                    # char-level features from title and content
                    char_feature = " ".join(
                        list(title_char_feature)) + " ### " + " ".join(
                            list(content_char_feature))
                    # token-level features from title and content
                    token_feature = " ".join(
                        list(title_token_feature)) + " ### " + " ".join(
                            list(content_token_feature))
                    clean_data.append(label + "\t" + token_feature + "\t" +
                                      char_feature + "\t" + line_str[3])
                    corpus_line["title"] = title
                    corpus_line["content"] = content
                    corpus_line["topcategory"] = label
                    cf.write(
                        json.dumps(corpus_line, ensure_ascii=False) + "\n")
                else:
                    ef.write(line.strip() + "\n")
        self.rewrite_txt_file(clean_data, self.outfile)

    def rewrite_txt_file(self, data, file):
        """
        Write data lines to a file.
        """
        print(">>>>> start writing file")
        with open(file, "w") as f:
            for line in data:
                f.write(line + "\n")
        print("<<<<< done writing file: {}".format(file))
Example #8
import os
import json
from collections import OrderedDict

from sklearn.model_selection import StratifiedKFold

# CleanDoc and read_json_format_file are this project's shared helpers,
# defined elsewhere in the module.


class PreCorpus(object):
    """
    Build model-training data: data cleaning plus stratified splitting.
    """
    def __init__(self, corpus_file, out_dir):
        if not os.path.exists(corpus_file):
            raise FileNotFoundError("file {} not found.".format(corpus_file))
        self.clean_doc_obj = CleanDoc()
        self.corpus_file = corpus_file
        self.train_file = os.path.join(out_dir, "train.txt")
        self.dev_file = os.path.join(out_dir, "validate.txt")
        self.test_file = os.path.join(out_dir, "test.txt")
        # self.analysis_label_dist()
        self.get_category_corpus_file()

    def analysis_label_dist(self):
        """
        统计label分布
        """
        category_count = dict()
        for line in read_json_format_file(self.corpus_file):
            _, (top_category, _, _) = self._preline(line)
            self._label_dist_count(level1=top_category,
                                   dist_count=category_count)

        sorted_category_count = self.dict_sort(category_count)
        print(json.dumps(sorted_category_count, ensure_ascii=False, indent=4))

    def get_category_corpus_file(self):
        print(">>>>> preprocess corpus")
        X, Y = self.get_clean_data(category_level=1)
        train, dev = self.stratified_sampling(X, Y, 0.2)
        self.write_txt_file(train, self.train_file)
        self.write_txt_file(dev, self.dev_file)
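        # the dev split is reused as the test set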
        self.write_txt_file(dev, self.test_file)

    def get_clean_data(self, category_level=1):
        X = list()
        Y = list()
        _count = dict()
        for line in read_json_format_file(self.corpus_file):
            if line:
                (_id, channel, title, content), y = self._preline(line)
                label = y[category_level - 1]
                _, title_char_feature, title_token_feature = self.clean_doc_obj.get_char_and_token_feature(
                    title)
                _, content_char_feature, content_token_feature = self.clean_doc_obj.get_char_and_token_feature(
                    content)
                # char-level features from title and content
                char_feature = " ".join(
                    list(title_char_feature)) + " ### " + " ".join(
                        list(content_char_feature))
                # token-level features from title and content
                token_feature = " ".join(
                    list(title_token_feature)) + " ### " + " ".join(
                        list(content_token_feature))

                # keep docs whose content is long enough, capping each
                # label at roughly 30k samples
                if len(content) > 30:
                    if label in _count:
                        if _count[label] > 30000:
                            continue
                        _count[label] += 1
                    else:
                        _count[label] = 1
                    X.append(token_feature + "\t" + char_feature + "\t" +
                             channel)
                    Y.append(label)
        return X, Y

    def stratified_sampling(self, x, y, valid_portion):
        """
        按标签类别个数分层切分训练集和验证集
        :param x:
        :param y:
        :param valid_portion:
        :return:
        """
        skf = StratifiedKFold(n_splits=int(1 / valid_portion))
        train = None
        dev = None

        index = [(train_index, test_index)
                 for train_index, test_index in skf.split(x, y)]

        train_label_count = self._label_count([y[i] for i in index[0][0]])
        test_label_count = self._label_count([y[j] for j in index[0][1]])
        print("train_label_count: {}".format(
            json.dumps(train_label_count, indent=4, ensure_ascii=False)))
        print("test_label_count: {}".format(
            json.dumps(test_label_count, indent=4, ensure_ascii=False)))
        train = [y[i] + "\t" + x[i] for i in index[0][0]]
        dev = [y[j] + "\t" + x[j] for j in index[0][1]]

        return train, dev

    def _label_dist_count(self,
                          level1=None,
                          level2=None,
                          level3=None,
                          dist_count=None):
        """
        统计标签分布计算
        :param level1:一级标签
        :param level2:二级标签
        :param level3:三级标签
        :param dist_count:标签分布字典
        :return:
        """
        if level1:
            if level1 in dist_count:
                dist_count[level1]["count"] += 1
            else:
                dist_count[level1] = dict()
                dist_count[level1]["count"] = 1
            if level2:
                if level2 in dist_count[level1]:
                    dist_count[level1][level2]["count"] += 1
                else:
                    dist_count[level1][level2] = dict()
                    dist_count[level1][level2]["count"] = 1
                if level3:
                    if level3 in dist_count[level1][level2]:
                        dist_count[level1][level2][level3] += 1
                    else:
                        dist_count[level1][level2][level3] = 1

    @staticmethod
    def dict_sort(result, limit_num=None):
        """
        字典排序, 返回有序字典
        :param result:
        :param limit_num:
        :return:
        """
        _result_sort = sorted(result.items(),
                              key=lambda x: x[1]["count"],
                              reverse=True)
        result_sort = OrderedDict()

        count_limit = 0
        domain_count = 0
        for i in _result_sort:
            if limit_num:
                if i[1]["count"] > limit_num:
                    result_sort[i[0]] = i[1]
                    domain_count += 1
                    count_limit += i[1]["count"]
            else:
                result_sort[i[0]] = i[1]
        return result_sort

    @staticmethod
    def _preline(line):
        """
        处理文件行(dict格式)
        """
        article_id = line.get("articleid", "")
        channel = line.get("channel", "")
        title = line.get("title", "")
        content = line.get("content", "")
        tagging_data = eval(line.get("tagscoredata", ""))
        category_data = tagging_data["categoryList"][0]["data"]
        top_category = ""
        sub_category = ""
        third_category = ""
        for item in category_data:
            if item["level"] == 1:
                top_category = str(item["name"])
            if item["level"] == 2:
                if "name" in item:
                    sub_category = str(item["name"].split("_")[-1])
            if item["level"] == 3:
                if "name" in item:
                    third_category = str(item["name"].split("_")[-1])
        return (article_id, channel, title,
                content), (top_category, sub_category, third_category)

    def _label_count(self, label_list):
        label_count = dict()
        for i in label_list:
            if label_count.get(i, None) is not None:
                label_count[i] += 1
            else:
                label_count[i] = 1
        return label_count

    def write_txt_file(self, data, file):
        """
        Write data lines to a file.
        """
        print(">>>>> start writing file")
        with open(file, "w") as f:
            for line in data:
                f.write(line + "\n")
        print("<<<<< done writing file: {}".format(file))