import json

from flask import request


def tokenize():
    # Flask view; TextGraph, time_count, the corpus graph `cg`, and the
    # jieba/thulac checkers are module-level objects elsewhere in the project.
    if request.method == 'GET':
        tg = TextGraph()
        sentence = "没有输入"

        # 从参数获取待分词句子
        if request.args.get('sentence', '') != "":
            sentence = request.args.get('sentence', '')
        tg.build([sentence])
        tg.fill_edge(cg)

        # For now, only a single sentence is tokenized, so take the first result.
        time_count(print_to_console=False)
        result = tg.cut()[0]
        time_count("分词完毕")
        check_jieba = jieba_checker.check(sentence, result)
        time_count("jieba分词完毕")
        check_thulac = thulac_checker.check(sentence, result)
        time_count("thulac分词完毕")

        # jieba's segmentation and its overlap ratio with our result
        jieba_result = check_jieba["jieba_result"]
        jieba_overlap = check_jieba["overlap"]

        # thulac's segmentation and its overlap ratio with our result
        thulac_result = check_thulac["thulac_result"]
        thulac_overlap = check_thulac["overlap"]
        res = json.dumps(
            {
                "graph": tg.make_json(cg, path=None),
                "result": result,
                "jieba": {
                    "words": jieba_result,
                    "overlap": "%.2f" % jieba_overlap
                },
                "thulac": {
                    "words": thulac_result,
                    "overlap": "%.2f" % thulac_overlap
                }
            },
            ensure_ascii=False)
        # print("json dumping")
        # res = json.dumps(
        #     {"graph": tg.make_json(cg, path=None), "result": result,
        #      "jieba": jieba_result, "jieba_overlap": jieba_overlap,
        #      },
        #     ensure_ascii=False)
        print("server returned")
        return res
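To show how this view might be exercised, here is a minimal sketch using Flask's test client; the route path '/tokenize' and the app wiring are assumptions, since the snippet above shows only the view body.

# Hypothetical wiring for the view above; the route path and app setup are
# assumptions, only `tokenize` itself comes from the project.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/tokenize', 'tokenize', tokenize, methods=['GET'])

with app.test_client() as client:
    resp = client.get('/tokenize', query_string={'sentence': '今天天气不错'})
    # JSON payload with the graph, our result, and jieba/thulac overlaps
    print(resp.get_data(as_text=True))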
Example #2
import jieba


def compare_time_cost(size):
    # Warm-up call so jieba loads its dictionary before any timing starts
    # ("先加载词典" means "load the dictionary first").
    list(jieba.cut("先加载词典"))
    time_count("init", print_to_console=False)

    sentences = TextIO().get_text_from_mongo(isRandom=False, limit=size)

    # time_count("get sentences")

    cg = CorpusGraph()
    cg.load_from_json()
    time_count("build corpus graph")

    tg = TextGraph()
    tg.build(sentences, cg)
    time_count(print_to_console=False)
    rs = tg.cut()
    time_count("time cost")

    # get_text_from_mongo returns a generator, and the one above has already
    # been consumed, so fetch the sentences again for the jieba run.
    sentences = TextIO().get_text_from_mongo(isRandom=False, limit=size)
    time_count(print_to_console=False)
    jieba_rs = [list(jieba.cut(s)) for s in sentences]

    time_count("jieba time cost")
Example #3
    def make_json(self, corpus, path='./data/text.json'):
        time_count("make_json", print_to_console=False)
        text_json = {}

        # First pass: for each character node, record the edge weight to the
        # following character and its sorted successors in the corpus graph.
        for i, (start_id, nbr) in enumerate(self.text.adj.items()):
            start_char = self.id_char_map[start_id]
            end_char = self.id_char_map[start_id + 1] if start_id + 1 in nbr else None
            out_weight = nbr[start_id + 1]['weight'] if start_id + 1 in nbr else 0
            nbr_out = corpus.get_sorted_neighbour(start_char, end_char)
            text_json[i] = {
                "char": start_char,
                "outWeight": out_weight,
                "neighbour_out": nbr_out,
                "neighbour_in": None,
            }
        time_count("获取后接词")

        # Second pass: reverse the corpus graph to collect each character's
        # sorted predecessors, then restore the original edge direction.
        corpus.reverse()
        for i, (start_id, nbr) in enumerate(self.text.adj.items()):
            start_char = self.id_char_map[start_id]
            end_char = self.id_char_map[start_id + 1] if start_id + 1 in nbr else None
            text_json[i]["neighbour_in"] = corpus.get_sorted_neighbour(start_char, end_char)
        corpus.reverse()
        time_count("获取前接词")

        if path is not None:
            # Write with a context manager so the file handle is closed properly.
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(text_json, f, ensure_ascii=False, indent=4)
            print("text json ready at: " + path)
        return text_json
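A short usage sketch for make_json, following the construction calls from the earlier examples; the sample sentence is an arbitrary choice, not from the project.

# Hedged usage sketch; the CorpusGraph/TextGraph setup mirrors the calls in
# tokenize() and compare_time_cost() above.
cg = CorpusGraph()
cg.load_from_json()

tg = TextGraph()
tg.build(["今天天气不错"])
tg.fill_edge(cg)

# path=None returns the dict without writing a file; each entry holds the
# character, its out-edge weight, and sorted corpus neighbours in both directions.
node_info = tg.make_json(cg, path=None)
print(node_info[0]["char"], node_info[0]["outWeight"])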