def tokenize():
    """Flask GET handler: segment one sentence and compare with jieba/thulac.

    Reads the sentence from the ``sentence`` query parameter (falling back to
    the placeholder "没有输入" when absent or empty), cuts it with the
    text-graph model, runs the jieba and thulac checkers against our result,
    and returns a JSON string with the graph, our segmentation, and each
    baseline's words plus its overlap ratio with our result.

    Returns:
        str: JSON payload (non-ASCII preserved). Implicitly ``None`` for
        non-GET methods, matching the original behavior.
    """
    if request.method == 'GET':
        tg = TextGraph()
        # Fetch the parameter once; empty/missing falls back to the placeholder.
        sentence = request.args.get('sentence', '') or "没有输入"
        tg.build([sentence])
        tg.fill_edge(cg)
        # Only a single sentence is segmented for now.
        time_count(print_to_console=False)
        result = tg.cut()[0]
        time_count("分词完毕")
        check_jieba = jieba_checker.check(sentence, result)
        time_count("jieba分词完毕")
        check_thulac = thulac_checker.check(sentence, result)
        time_count("thulac分词完毕")
        res = json.dumps(
            {
                "graph": tg.make_json(cg, path=None),
                "result": result,
                "jieba": {
                    "words": check_jieba["jieba_result"],
                    # Overlap ratios are formatted to two decimals for the UI.
                    "overlap": "%.2f" % check_jieba["overlap"]
                },
                "thulac": {
                    "words": check_thulac["thulac_result"],
                    "overlap": "%.2f" % check_thulac["overlap"]
                }
            },
            ensure_ascii=False)
        print("server returned")
        return res
def compare_time_cost(size):
    """Benchmark the text-graph segmenter against jieba on *size* sentences.

    Pulls sentences from Mongo once, segments them with the corpus-backed
    text graph, then with jieba, printing elapsed times via ``time_count``.

    :param size: number of sentences to fetch and segment.
    """
    # Warm up jieba so dictionary loading is excluded from its timing.
    list(jieba.cut("先加载词典"))
    time_count("init", print_to_console=False)
    # Materialize the generator once: the sentences are needed for both
    # segmenters, and the original code issued a second full Mongo query
    # because the first generator had been consumed.
    sentences = list(TextIO().get_text_from_mongo(isRandom=False, limit=size))
    # Reset the timer so the Mongo fetch is not charged to the next interval.
    time_count(print_to_console=False)
    cg = CorpusGraph()
    cg.load_from_json()
    time_count("build corpus graph")
    tg = TextGraph()
    tg.build(sentences, cg)
    time_count(print_to_console=False)
    rs = tg.cut()
    time_count("time cost")
    time_count(print_to_console=False)
    # jieba baseline over the same sentences.
    jieba_rs = [list(jieba.cut(s)) for s in sentences]
    time_count("jieba time cost")
def make_json(self, corpus, path='./data/text.json'): time_count("make_json", print_to_console=False) text_json = {} i = 0 for start_id, nbr in self.text.adj.items(): start_char = self.id_char_map[start_id] end_char = self.id_char_map[start_id + 1] if start_id + 1 in nbr else None out_weight = nbr[start_id + 1]['weight'] if start_id + 1 in nbr else 0 # print(start_char, nbr[start_id+1]['weight'] if start_id + 1 in nbr else 0, out_weight) nbr_out = corpus.get_sorted_neighbour(start_char, end_char) # nbr_in = corpus.get_sorted_neighbour(start_char, end_char, reverse=True) text_json[i] = {"char": start_char, "outWeight": out_weight, "neighbour_out": nbr_out, "neighbour_in": None} i += 1 time_count("获取后接词") i = 0 corpus.reverse() for start_id, nbr in self.text.adj.items(): start_char = self.id_char_map[start_id] end_char = self.id_char_map[start_id + 1] if start_id + 1 in nbr else None # out_weight = nbr[start_id + 1]['weight'] if start_id + 1 in nbr else 0 # print(start_char, nbr[start_id+1]['weight'] if start_id + 1 in nbr else 0, out_weight) # nbr_out = corpus.get_sorted_neighbour(start_char, end_char) nbr_in = corpus.get_sorted_neighbour(start_char, end_char) text_json[i]["neighbour_in"] = nbr_in i += 1 corpus.reverse() time_count("获取前接词") # def get_next(item): # global i # global text_json # start_id = item[0] # nbr = item[1] # start_char = self.id_char_map[start_id] # end_char = self.id_char_map[start_id + 1] if start_id + 1 in nbr else None # out_weight = nbr[start_id + 1]['weight'] if start_id + 1 in nbr else 0 # nbr_out = corpus.get_sorted_neighbour(start_char, end_char) # # nbr_in = corpus.get_sorted_neighbour(start_char, end_char, reverse=True) # text_json[i] = {"char": start_char, "outWeight": out_weight, "neighbour_out": nbr_out, # "neighbour_in": ""} # i += 1 # # def get_previous(item): # global text_json # start_id = item[0] # nbr = item[1] # start_char = self.id_char_map[start_id] # end_char = self.id_char_map[start_id + 1] if start_id + 1 in nbr else 
None # nbr_in = corpus.get_sorted_neighbour(start_char, end_char, reverse=True) # text_json[i]["neighbour_in"] = nbr_in # # items = self.text.adj.items() # map(get_next, items) # time_count("获取后接字") # corpus.reverse() # map(get_previous, items) # corpus.reverse() # time_count("获取前接字") if path is not None: json.dump(text_json, open(path, 'w', encoding='utf-8'), ensure_ascii=False, indent=4) print("text json ready at: " + path) return text_json