示例#1
0
def make_local_mongo():
    """Instantiate a ``CorpusIO``; this excerpt does nothing further with it."""
    corpus_io = CorpusIO()
示例#2
0
 def __init__(self):
     """Set up an empty directed graph, an unset reversal cache and a ``CorpusIO``."""
     # The corpus itself: an empty nx.DiGraph to be filled later.
     self.corpus = nx.DiGraph()
     # No reversed copy of the graph has been computed yet.
     self.reversed_corpus_cache = None
     # Helper object of the project-defined CorpusIO type.
     self.corpus_io = CorpusIO()
示例#3
0
class CorpusGraph:
    """Weighted directed graph of corpus edges held in an ``nx.DiGraph``.

    Each edge ``start -> end`` carries a ``weight`` attribute. Reading from
    MongoDB and JSON (de)serialisation to disk are delegated to ``CorpusIO``.
    """

    def __init__(self):
        self.corpus = nx.DiGraph()
        # Filled lazily by reverse(); after a swap it holds the graph that is
        # NOT currently active in self.corpus.
        self.reversed_corpus_cache = None
        self.corpus_io = CorpusIO()

    def build_corpus(self):
        """Populate the graph from ``CorpusIO.read_from_mongo`` (needs MongoDB).

        Every yielded record is indexed as ``(start, end, weight)``.
        """
        for edge in self.corpus_io.read_from_mongo(limit=None):
            self.corpus.add_edge(edge[0], edge[1], weight=edge[2])
        # The active graph changed, so any cached reversal is out of date.
        self.reversed_corpus_cache = None

    def to_json(self):
        """Return the graph as a nested dict via ``nx.to_dict_of_dicts``."""
        return nx.to_dict_of_dicts(self.corpus)

    def save_as_json(self, path='./data/corpus.json'):
        """Write the dict-of-dicts form of the graph to ``path`` through ``CorpusIO``."""
        self.corpus_io.save_as_json(self.to_json(), path)

    def load_from_json(self, path='./data/corpus_in_use.json'):
        """Load the graph from the JSON file at ``path``.

        The data is read through ``CorpusIO.load_as_json`` and handed to
        ``nx.from_dict_of_dicts`` with the current graph as ``create_using``.
        """
        print("loading corpus json file: " + str(path))
        json_obj = self.corpus_io.load_as_json(path)
        print("loaded")
        self.corpus = nx.from_dict_of_dicts(json_obj, create_using=self.corpus)
        # The active graph changed, so any cached reversal is out of date.
        self.reversed_corpus_cache = None

    def get_edge_weight(self, start, end):
        """Return the weight of edge ``start -> end``, or 0 if there is no such edge."""
        try:
            return self.corpus[start][end]['weight']
        except KeyError:
            # Unknown node or missing edge/attribute counts as weight 0.
            return 0

    def reverse(self):
        """Swap the active graph with its reversed counterpart.

        The reversed graph is computed with ``corpus.reverse()`` only when no
        cached one exists; afterwards each call just exchanges the two graphs.
        """
        if self.reversed_corpus_cache is None:
            self.reversed_corpus_cache = self.corpus.reverse()
        self.corpus, self.reversed_corpus_cache = (
            self.reversed_corpus_cache,
            self.corpus,
        )

    def get_sorted_neighbour(self, key, exclude=None, K=6):
        """Return the heaviest successors of ``key`` plus a summary of the rest.

        Args:
            key: Node (a character, per the original comment) to look up.
            exclude: Optional successor to leave out of both the listed
                entries and the summary.
            K: Size budget of the result; at most ``K - 1`` successors are
                listed individually, followed by one summary entry.

        Returns:
            ``[]`` when ``key`` is not in the graph. Otherwise a list of
            ``(successor, weight)`` tuples in descending weight order,
            followed by a final ``("+<n>", total_weight)`` tuple that
            aggregates the ``n`` successors that were not listed.
        """
        adjacency = self.corpus.adj
        if key not in adjacency:
            return []

        # Drop the excluded successor up front so it can never shift the
        # boundary between the listed entries and the aggregated remainder
        # (previously a skipped entry made one successor count twice).
        candidates = [
            (node, attrs['weight'])
            for node, attrs in adjacency[key].items()
            if node != exclude
        ]
        # NOTE: only the top K-1 entries are needed individually; a heap-based
        # selection could replace the full sort if this becomes a hot spot.
        candidates.sort(key=lambda pair: pair[1], reverse=True)

        head_size = max(K - 1, 0)
        head = candidates[:head_size]
        tail = candidates[head_size:]

        summary = ("+" + str(len(tail)), sum(weight for _, weight in tail))
        return head + [summary]
示例#4
0
def make_local_mongo():
    """Create a ``CorpusIO`` and call its ``fetch_sentences_from_remote`` method."""
    CorpusIO().fetch_sentences_from_remote()
 def __init__(self):
     """Set up an empty directed graph and a ``CorpusIO`` helper."""
     # The corpus itself: an empty nx.DiGraph to be filled later.
     self.corpus = nx.DiGraph()
     # Helper object of the project-defined CorpusIO type.
     self.corpus_io = CorpusIO()
class CorpusGraph:
    """Weighted directed graph of corpus edges held in an ``nx.DiGraph``.

    Each edge ``start -> end`` carries a ``weight`` attribute. Reading from
    MongoDB and JSON (de)serialisation to disk are delegated to ``CorpusIO``.
    """

    def __init__(self):
        self.corpus = nx.DiGraph()
        self.corpus_io = CorpusIO()

    def build_corpus(self):
        """Populate the graph from ``CorpusIO.read_from_mongo``.

        Every yielded record is indexed as ``(start, end, weight)``.
        """
        for edge in self.corpus_io.read_from_mongo(limit=None):
            self.corpus.add_edge(edge[0], edge[1], weight=edge[2])

    def draw(self):
        """Draw the graph with ``nx.draw_networkx`` and display it via ``plt.show()``.

        Labels use the ``SimHei`` font family and nodes are filled white.
        """
        nx.draw_networkx(self.corpus, font_family='SimHei', node_color='white')
        plt.show()

    def to_json(self):
        """Return the graph as a nested dict via ``nx.to_dict_of_dicts``."""
        return nx.to_dict_of_dicts(self.corpus)

    def save_as_json(self, path='./data/corpus.json'):
        """Write the dict-of-dicts form of the graph to ``path`` through ``CorpusIO``."""
        self.corpus_io.save_as_json(self.to_json(), path)

    def load_from_json(self, path='./data/corpus.json'):
        """Load the graph from the JSON file at ``path``.

        The data is read through ``CorpusIO.load_as_json`` and handed to
        ``nx.from_dict_of_dicts`` with the current graph as ``create_using``.
        """
        json_obj = self.corpus_io.load_as_json(path)
        self.corpus = nx.from_dict_of_dicts(json_obj, create_using=self.corpus)

    def get_edge_weight(self, start, end):
        """Return the weight of edge ``start -> end``, or 0 if there is no such edge."""
        try:
            return self.corpus[start][end]['weight']
        except KeyError:
            # Unknown node or missing edge/attribute counts as weight 0.
            return 0

    def get_sorted_neighbour(self, key, exclude=None, K=6):
        """Return the heaviest successors of ``key`` plus a summary of the rest.

        Args:
            key: Node to look up.
            exclude: Optional successor to leave out of both the listed
                entries and the summary.
            K: Size budget of the result; at most ``K - 1`` successors are
                listed individually, followed by one summary entry.

        Returns:
            ``[]`` when ``key`` is not in the graph. Otherwise a list of
            ``(successor, weight)`` tuples in descending weight order,
            followed by a final ``("+<n>", total_weight)`` tuple that
            aggregates the ``n`` successors that were not listed.
        """
        adjacency = self.corpus.adj
        if key not in adjacency:
            return []

        # Drop the excluded successor up front. This removes the need to
        # index past it by hand (which raised IndexError when fewer than
        # K - 1 successors existed) and keeps it from shifting the boundary
        # between listed entries and the aggregated remainder.
        candidates = [
            (node, attrs['weight'])
            for node, attrs in adjacency[key].items()
            if node != exclude
        ]
        # NOTE: only the top K-1 entries are needed individually; a heap-based
        # selection could replace the full sort if this becomes a hot spot.
        candidates.sort(key=lambda pair: pair[1], reverse=True)

        head_size = max(K - 1, 0)
        head = candidates[:head_size]
        tail = candidates[head_size:]

        summary = ("+" + str(len(tail)), sum(weight for _, weight in tail))
        return head + [summary]