Exemplo n.º 1
0
def compare_time_cost(size):
    """Benchmark the graph-based segmenter against jieba on the same corpus.

    Args:
        size: number of sentences to fetch from MongoDB for the comparison.

    Side effects: prints timing checkpoints via time_count(); no return value.
    """
    # Warm up jieba so its dictionary-loading cost is excluded from the timing.
    list(jieba.cut("先加载词典"))
    time_count("init", print_to_console=False)

    sentences = TextIO().get_text_from_mongo(isRandom=False, limit=size)

    cg = CorpusGraph()
    cg.load_from_json()
    time_count("build corpus graph")

    tg = TextGraph()
    tg.build(sentences, cg)
    time_count(print_to_console=False)
    rs = tg.cut()
    time_count("time cost")

    # get_text_from_mongo returns a generator; the one above was consumed by
    # tg.build(), so fetch a fresh one for the jieba run.
    sentences = TextIO().get_text_from_mongo(isRandom=False, limit=size)
    time_count(print_to_console=False)

    # Segment every sentence with jieba (comprehension replaces append loop).
    jieba_rs = [list(jieba.cut(s)) for s in sentences]

    time_count("jieba time cost")
Exemplo n.º 2
0
    def __init__(self):
        """Initialize an empty text graph and its MongoDB-backed I/O helper."""
        # Directed graph whose nodes are characters; edges link consecutive chars.
        self.text = nx.DiGraph()
        self.text_io = TextIO()
        # node id -> the character that node represents
        self.id_char_map = {}
        self.sentence_cnt = 0
        # Node ids that begin each sentence.
        self.headers = []
Exemplo n.º 3
0
class TextGraph:
    """Character-level directed graph over sentences, used for word segmentation.

    Every character becomes a node (id = sentence base index + char position;
    sentences are spaced 10000 ids apart so they never overlap), with an edge
    between each pair of consecutive characters. Edge weights come from a
    corpus graph, and cut() splits words where the weights drop.
    """

    def __init__(self):
        self.text_io = TextIO()
        self.text = nx.DiGraph()
        # node id -> the character that node represents
        self.id_char_map = {}
        self.sentence_cnt = 0

        # Node ids that begin each sentence; cut() starts one walk per header.
        self.headers = []

    def get_sentences(self, isRandom=True):
        """Fetch sentences from MongoDB via the text I/O helper."""
        ss = self.text_io.get_text_from_mongo(isRandom=isRandom)
        return ss

    def build(self, sentences):
        """Build the character graph from a list of sentences.

        Args:
            sentences: list of sentence strings.

        Raises:
            Exception: if sentences is not a list.
        """
        sentence_index = 10000
        # isinstance is the idiomatic type check (was: type(sentences) != list).
        if not isinstance(sentences, list):
            raise Exception("输入应是句子列表")
        for s in sentences:
            s = s.strip()
            s_size = len(s)
            is_header = True
            for char_index in range(s_size):
                char = s[char_index]
                # node_id instead of "id" to avoid shadowing the builtin id().
                node_id = sentence_index + char_index
                self.text.add_node(node_id)
                if is_header:
                    self.headers.append(node_id)
                    is_header = False

                self.id_char_map[node_id] = char
                # Link consecutive characters within the same sentence.
                if char_index < s_size - 1:
                    self.text.add_edge(node_id, node_id + 1)
            # Next sentence gets its own 10000-wide id range.
            sentence_index += 10000

    def fill_edge(self, corpus):
        """Weight every edge with the corpus bigram weight of its two characters."""
        for start, end in self.text.edges():
            char_start = self.id_char_map[start]
            char_end = self.id_char_map[end]
            weight = corpus.get_edge_weight(char_start, char_end)
            self.text[start][end]['weight'] = weight

    def make_json(self, corpus, path='./data/text.json'):
        """Collect per-character neighbour data and optionally dump it to JSON.

        Args:
            corpus: corpus graph providing get_sorted_neighbour() / reverse().
            path: output file path, or None to skip writing.

        Returns:
            dict keyed by running index with char / outWeight / neighbour info.
        """
        time_count("make_json", print_to_console=False)
        text_json = {}
        i = 0

        # First pass: following-neighbour lists from the forward corpus.
        for start_id, nbr in self.text.adj.items():
            start_char = self.id_char_map[start_id]
            end_char = self.id_char_map[start_id + 1] if start_id + 1 in nbr else None
            out_weight = nbr[start_id + 1]['weight'] if start_id + 1 in nbr else 0
            nbr_out = corpus.get_sorted_neighbour(start_char, end_char)
            text_json[i] = {"char": start_char, "outWeight": out_weight, "neighbour_out": nbr_out, "neighbour_in": None}
            i += 1
        time_count("获取后接词")

        # Second pass: preceding-neighbour lists, via the reversed corpus.
        i = 0
        corpus.reverse()
        for start_id, nbr in self.text.adj.items():
            start_char = self.id_char_map[start_id]
            end_char = self.id_char_map[start_id + 1] if start_id + 1 in nbr else None
            nbr_in = corpus.get_sorted_neighbour(start_char, end_char)
            text_json[i]["neighbour_in"] = nbr_in
            i += 1
        corpus.reverse()  # restore the corpus orientation for later callers
        time_count("获取前接词")

        if path is not None:
            # with-statement closes the file (original leaked the handle).
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(text_json, f, ensure_ascii=False, indent=4)
            print("text json ready at: " + path)
        return text_json

    # Segment words by cutting edges against weight-ratio thresholds.
    def cut(self):
        """Walk each sentence and split words where edge weights change sharply.

        Returns:
            list of word lists, one per sentence (header).
        """
        adj = self.text.adj
        rs = []
        for header in self.headers:
            current = header
            pre_weight = 0
            buffer_word = ""
            words = []
            while current in adj:
                current_char = self.id_char_map[current]

                # Weight of the current character's outgoing edge (0 at sentence end).
                current_weight = self.text[current][current + 1]['weight'] if current + 1 in adj else 0

                # A zero outgoing weight means the current char ends a word.
                if current_weight == 0:
                    buffer_word += str(current_char)
                    if is_chinese(buffer_word):
                        words.append(buffer_word)
                    buffer_word = ""
                else:
                    # Thresholds are tunable; pre_weight is the incoming edge weight.
                    if pre_weight / current_weight < 0.7:
                        # Weight jumps up: start a new word at this char.
                        if is_chinese(buffer_word):
                            words.append(buffer_word)
                        buffer_word = current_char
                    elif pre_weight / current_weight > 1.4:
                        # Weight drops: this char finishes the current word.
                        buffer_word += current_char
                        if is_chinese(buffer_word):
                            words.append(buffer_word)
                        buffer_word = ""
                    else:
                        # Comparable weights: keep extending the current word.
                        buffer_word += current_char

                pre_weight = current_weight
                current += 1
            rs.append(words)
        return rs
Exemplo n.º 4
0
class TextGraph:
    """Character-level directed graph over sentences (simpler variant without
    sentence headers or a cut() step).

    Every character becomes a node (id = sentence base index + char position;
    sentences are spaced 10000 ids apart), with an edge between each pair of
    consecutive characters.
    """

    def __init__(self):
        self.text_io = TextIO()
        self.text = nx.DiGraph()
        # node id -> the character that node represents
        self.id_char_map = {}
        self.sentence_cnt = 0

    def get_sentences(self, isRandom=True):
        """Fetch sentences from MongoDB via the text I/O helper."""
        ss = self.text_io.get_text_from_mongo(isRandom=isRandom)
        return ss

    def build(self, sentences):
        """Build the character graph from an iterable of sentences."""
        sentence_index = 10000
        for s in sentences:
            s = s.strip()
            s_size = len(s)
            for char_index in range(s_size):
                char = s[char_index]
                # node_id instead of "id" to avoid shadowing the builtin id().
                node_id = sentence_index + char_index
                self.text.add_node(node_id)
                self.id_char_map[node_id] = char
                # Link consecutive characters within the same sentence.
                if char_index < s_size - 1:
                    self.text.add_edge(node_id, node_id + 1)
            # Next sentence gets its own 10000-wide id range.
            sentence_index += 10000

    def fill_edge(self, corpus):
        """Weight every edge with the corpus bigram weight of its two characters."""
        for start, end in self.text.edges():
            char_start = self.id_char_map[start]
            char_end = self.id_char_map[end]
            weight = corpus.get_edge_weight(char_start, char_end)
            self.text[start][end]['weight'] = weight

    def make_json(self, corpus, path='./data/text.json'):
        """Dump per-character neighbour data to a JSON file at *path*."""
        text_json = {}
        # enumerate replaces the manual counter from the original.
        for i, (start_id, nbr) in enumerate(self.text.adj.items()):
            start_char = self.id_char_map[start_id]
            end_char = self.id_char_map[start_id +
                                        1] if start_id + 1 in nbr else None
            out_weight = nbr[start_id +
                             1]['weight'] if start_id + 1 in nbr else 0
            # Distinct name: the original reassigned (shadowed) the loop
            # variable nbr here, which made the code hard to follow.
            neighbour = corpus.get_sorted_neighbour(start_char, end_char)
            text_json[i] = {
                "char": start_char,
                "outWeight": out_weight,
                "neighbour": neighbour
            }
        # with-statement closes the file (original leaked the handle).
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(text_json, f, ensure_ascii=False, indent=4)
        print("text json ready at: " + path)

    def draw(self):
        """Render the graph with matplotlib using a shell layout."""
        nx.draw_shell(self.text, font_family='SimHei', node_color='black')
        plt.show()