Example #1
    def get_data_example_to_file(self, file, out_file):
        """
        从原始数据中每个样本抽取1000到文件
        :param file: 原始文件
        :param out_file: example文件
        :return:
        """
        lines = read_json_format_file(file)
        all_data = dict()

        for line in lines:
            if not line:
                continue
            title = line["title"]
            label = line["label"]
            all_data.setdefault(label, []).append(title)

        with open(out_file, "w", encoding="utf-8") as f:
            for k, v in all_data.items():
                random.shuffle(v)
                i = 0
                for text in v:
                    if 5 < len(text) < 50:
                        out = {"label": k, "text": text}
                        f.write(json.dumps(out, ensure_ascii=False) + "\n")
                        i += 1
                        if i >= 1000:
                            break
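
All of these examples lean on a `read_json_format_file` helper that is never shown. A minimal sketch, assuming the input is a JSON-lines file (one JSON object per line) and that malformed lines should simply be skipped:

import json

def read_json_format_file(file):
    """Yield one dict per line from a JSON-lines file (hypothetical sketch)."""
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                continue  # skip malformed lines
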
Example #2
def write_embed_file(corpus_data_path):
    """
    处理为embed训练文件,总文件过大无法全部加入进行shuffle,
    所以采取分层抽取1/4数据写入文件
    :return:
    """
    embed_file = os.path.join(corpus_data_path, "word_embed.txt")
    ef = open(embed_file, "w", encoding="utf-8")
    _doc_count = 0
    for _file in os.listdir(corpus_data_path):
        if _file.startswith("part"):
            file = os.path.join(corpus_data_path, _file)
            for doc in read_json_format_file(file):
                _doc_count += 1
                if _doc_count % 100000 == 0:
                    print(">>>>> 已处理{}篇文档".format(_doc_count))
                if _doc_count % 4 != 0:
                    continue  # keep every 4th document (the 1/4 stratified sample)
                if "title" in doc and "content" in doc:
                    title = doc["title"].strip().replace("\t", " ").replace("\n", " ").replace("\r", " ")
                    content = doc["content"].strip()
                    text = title + " " + content
                    word_list = split_text(text)
                    out_line = " ".join(word_list)
                    ef.write(out_line + "\n")
    ef.close()
    print("<<<<< 【{}】embed文件已生成".format(embed_file))
Example #3
 def pre_corpus(self, ori_file, label_file, corpus_file, url_file):
     """
     将语料处理为带label
     :param ori_file:
     :param label_file:
     :param corpus_file:
     :param url_file:
     :return:
     """
     lines = read_json_format_file(ori_file)
     with open(label_file, "r", encoding="utf-8") as f:
         url2label_dict = json.load(f)
     labels = dict()
     with open(corpus_file, "w",
               encoding="utf-8") as cf, open(url_file,
                                             "w",
                                             encoding="utf-8") as uf:
         for line in lines:
             url = line["url"]
             url_parse = urlparse(url)
             netloc = url_parse.netloc
             label = self.extract_label(netloc, url2label_dict)
             if label != "其他":  # "其他" = the catch-all "other" label
                 line["category"] = label
                 if label != "":
                     cf.write(json.dumps(line, ensure_ascii=False) + "\n")
                     labels[label] = labels.get(label, 0) + 1
                 else:
                     uf.write(url + "\n")
         print(labels)
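
`extract_label` is referenced but not shown. One plausible sketch, assuming `url2label_dict` maps domains (or domain tokens) to labels and that unknown domains map to the empty string, which the caller routes to `url_file`:

    def extract_label(self, netloc, url2label_dict):
        """Map a domain to a label (hypothetical sketch)."""
        if netloc in url2label_dict:
            return url2label_dict[netloc]
        # fall back to the dot-separated tokens of the domain
        for token in netloc.split("."):
            if token in url2label_dict:
                return url2label_dict[token]
        return ""
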
Example #4
def test_read_data():
    corpus_data_path = "/data/in_hi_news/raw_data/raw_data"
    file = os.path.join(corpus_data_path, "part-00000-69676dc0-8d50-4410-864d-79709f3f4960-c000.json")
    _doc_count = 0
    for doc in read_json_format_file(file):
        _doc_count += 1
        print(doc)
        if _doc_count == 10:
            break
Example #5
 def get_pred_result_from_file(self):
     y_true = list()
     y_pred = list()
     self.log.info("Fetching results from the prediction file...")
     for line in read_json_format_file(self.pred_file):
         true_category = str(line[self.kn]).lower().strip()
         y_true.append(true_category)
         pred_category = str(line['predict_{}'.format(self.kn)]).lower().strip()
         y_pred.append(pred_category)
     return y_true, y_pred
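
With `y_true` and `y_pred` in hand, the usual next step is a per-class report. A sketch using scikit-learn; `evaluator` is a hypothetical instance of the class above:

from sklearn.metrics import classification_report

y_true, y_pred = evaluator.get_pred_result_from_file()
print(classification_report(y_true, y_pred, digits=4))
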
Example #6
def get_embed_from_rawfile(file):
    """
    直接从原始文件生成词向量训练语料
    :param file: 原始数据文件
    :return:
    """
    print(">>>>> 正在获取分词list语料")
    doc_word_list = list()
    _doc_count = 0
    for doc in read_json_format_file(file):
        _doc_count += 1
        if _doc_count % 100000 == 0:
            print(">>>>> 已处理{}篇文档".format(_doc_count))
        clean_doc = clean_text(doc)
        if clean_doc:
            doc_word_list.append(clean_doc)
    return doc_word_list
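
`clean_text` is also undefined here. A sketch mirroring the inline logic of Example #10, assuming it returns a space-joined token string (Example #8 writes its result straight to a file) and `None` when the document lacks the needed fields:

def clean_text(doc):
    """Merge title and content into one tokenized line (hypothetical sketch)."""
    if "title" not in doc or "content" not in doc:
        return None
    title = doc["title"].strip().replace("\t", " ").replace("\n", " ").replace("\r", " ")
    content = doc["content"].strip()
    word_list = split_text(title + " " + content)
    return " ".join(word_list) if word_list else None
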
Example #7
    def get_data_example_to_file(self, file, out_file):
        """
        从原始数据中每个样本抽取1000到文件
        :param file: 原始文件
        :param out_file: example文件
        :return:
        """
        lines = read_json_format_file(file)
        all_data = dict()

        for line in lines:
            if not line:
                continue
            title = line["title"]
            label = line["category"]
            all_data.setdefault(label, []).append(title)

        with open(out_file, "w", encoding="utf-8") as f:
            for k, v in all_data.items():
                if k in [
                        "日本", "海南", "俄罗斯", "重庆", "韩国", "福建", "江苏", "广西", "安徽",
                        "四川", "世界", "青海", "地方", "潍坊", "陕西", "天气", "评论", "读书",
                        "媒体", "访谈", "奥运", "视频", "人民日报", "考试"
                ]:
                    continue
                random.shuffle(v)
                print("label:{}  size:{}".format(k, len(v)))
                i = 0
                for text in v:
                    if 5 < len(text) < 50:
                        out = {"label": k, "text": text}
                        f.write(json.dumps(out, ensure_ascii=False) + "\n")
                        i += 1
                        if i >= 1000:
                            break
Example #8
def write_embed_file(corpus_data_path):
    """
    处理为embed训练文件,清洗文本
    所以采取分层抽取1/4数据写入文件
    :return:
    """
    raw_file = os.path.join(corpus_data_path, "raw_data")
    embed_file = os.path.join(corpus_data_path, "word_embed.txt")
    ef = open(embed_file, "w", encoding="utf-8")
    _doc_count = 0
    for doc in read_json_format_file(raw_file):
        _doc_count += 1
        if _doc_count % 100000 == 0:
            print(">>>>> 已处理{}篇文档".format(_doc_count))
        clean_doc = clean_text(doc)
        if clean_doc:
            ef.write(clean_doc + "\n")
    ef.close()
    print("<<<<< 【{}】embed文件已生成".format(embed_file))
Example #9
    def analysis_data(self, corpus_file):
        """
        分析数据分布情况,
        :param corpus_file:
        :return:
        """
        label_count = dict()
        lines = read_json_format_file(corpus_file)
        data_len = list()
        for line in lines:
            label = line["label"]
            title = line["title"]
            data_len.append(len(title))
            label_count[label] = label_count.get(label, 0) + 1

        print("数据 label 分布情况:")
        print(json.dumps(label_count, ensure_ascii=False, indent=4))
        self.plot_text_length(data_len)
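
`plot_text_length` is not shown either. A minimal sketch with matplotlib, assuming a plain histogram of title lengths is what is wanted:

    def plot_text_length(self, data_len):
        """Plot a histogram of text lengths (hypothetical sketch)."""
        import matplotlib.pyplot as plt
        plt.hist(data_len, bins=50)
        plt.xlabel("title length (characters)")
        plt.ylabel("count")
        plt.show()
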
Example #10
def get_embed_from_rawfile(file):
    """
    直接从原始文件生成词向量训练语料
    :param file: 原始数据文件
    :return:
    """
    print(">>>>> 正在获取分词list语料")
    doc_word_list = list()
    _doc_count = 0
    for doc in read_json_format_file(file):
        _doc_count += 1
        if _doc_count % 100000 == 0:
            print(">>>>> 已处理{}篇文档".format(_doc_count))
        if "title" in doc and "content" in doc:
            title = doc["title"].strip().replace("\t", " ").replace("\n", " ").replace("\r", " ")
            content = doc["content"].strip()
            text = title + " " + content
            word_list = split_text(text)
            doc_word_list.append(word_list)
    return doc_word_list
Example #11
    def analysis_url(self, file, outfile1, outfile2):
        """
        分析url,查看可提供的label
        :param file:
        :param outfile1:
        :param outfile2:
        :return:
        """
        lines = read_json_format_file(file)
        scheme_dict = dict()
        domain_dict = dict()
        label_dict = dict()
        for line in lines:
            url = line["url"]
            url_parse = urlparse(url)
            scheme = url_parse.scheme
            scheme_dict[scheme] = scheme_dict.get(scheme, 0) + 1
            netloc = url_parse.netloc
            domain_dict[netloc] = domain_dict.get(netloc, 0) + 1
            labels = netloc.split(".")
            for label in labels:
                label_dict[label] = label_dict.get(label, 0) + 1

        print(scheme_dict)
        print(domain_dict)
        print(label_dict)
        with open(outfile1, "w") as f:
            f.writelines(json.dumps(domain_dict, indent=4))
        with open(outfile2, "w") as f:
            f.writelines(json.dumps(label_dict, indent=4))
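
For reference, `urlparse` from the standard library's urllib.parse splits a URL into the pieces used above (the URL here is a made-up example):

from urllib.parse import urlparse

parsed = urlparse("https://news.example.com/world/article.html")
print(parsed.scheme)              # https
print(parsed.netloc)              # news.example.com
print(parsed.netloc.split("."))   # ['news', 'example', 'com']
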
Example #12
 def get_data(self):
     X = list()
     Y = list()
     _count = dict()
     for line in read_json_format_file(self.f1):
         if not line:
             continue
         # result = self._preline(line)
         x, y = self.preline(line)
         if len(x) <= 30:
             continue
         # cap each label at 50000 samples
         if y not in _count:
             _count[y] = 1
         elif _count[y] > 50000:
             continue
         else:
             _count[y] += 1
         X.append(x)
         Y.append(y)
     return X, Y
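
A typical consumer of `get_data` would split the capped, filtered samples for training. A sketch with scikit-learn; `loader` is a hypothetical instance of the class above:

from sklearn.model_selection import train_test_split

X, Y = loader.get_data()
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
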