# Example no. 1 (score: 0)
    def test_2_sentence_processing(self):
        """Sentence processing should yield the expected per-token global-id counts."""
        processor = SentenceProcessing(
            dicts_folder=self.dicts_folder,
            output_folder=self.edges_folder,
            max_window_size=self.max_window_size,
            local_dict_extension=config['graph']['local_dict_extension'],
        )
        word_count_all = processor.apply(data_folder=self.dicts_folder, process_num=self.process_num)

        # The merged dictionary maps token (str) -> global id (int).
        merged_dict = util.read_two_columns_file_to_build_dictionary_type_specified(
            file=self.dicts_folder + 'dict_merged.txt', key_type=str, value_type=int)

        # test the merged word count (with global id)
        for token, expected_count in (('on', 2), ('00', 3), (',', 5)):
            self.assertEqual(word_count_all[merged_dict[token]], expected_count)
# Example no. 2 (score: 0)
    def test_4_convert_encoded_edges_count_for_undirected_graph(self):
        """Each undirected edge count must equal the sum of the two directed counts."""

        def encoded(word1, word2):
            # Edge keys are (source_id, target_id) tuples of *string* ids.
            return (str(word2id[word1]), str(word2id[word2]))

        def equal_test(word1, word2):
            # Sum the directed counts in both directions. Using .get(key, 0)
            # is equivalent to the old truthiness cascade for non-negative
            # counts, and avoids the KeyError the original direct indexing
            # raised on a plain dict when one direction was absent.
            sum_count = directed.get(encoded(word1, word2), 0) \
                        + directed.get(encoded(word2, word1), 0)

            if encoded(word1, word2) in undirected:
                self.assertEqual(sum_count, undirected[encoded(word1, word2)])
            elif encoded(word2, word1) in undirected:
                self.assertEqual(sum_count, undirected[encoded(word2, word1)])
            else:
                print('No direct edge between ' + word1 + ' and ' + word2)
                self.assertEqual(sum_count, 0)

        wpp = WordPairsProcessing(max_vocab_size=None, min_count=self.min_count,
                                  dicts_folder=self.dicts_folder, window_size=50,
                                  edges_folder=self.edges_folder, graph_folder=self.graph_folder,
                                  safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
        directed = wpp.apply(process_num=self.process_num)

        word2id = util.read_two_columns_file_to_build_dictionary_type_specified(
            file=self.dicts_folder + 'dict_merged.txt', key_type=str,
            value_type=int)

        undirected = wpp.convert_encoded_edges_count_for_undirected_graph(
            old_encoded_edges_count_path=
            self.graph_folder + "encoded_edges_count_window_size_" + str(self.max_window_size) + ".txt")

        equal_test('and', ',')
        equal_test('the', '.')
        equal_test('and', '.')
        equal_test('and', 'of')
        equal_test('in', 'of')
        equal_test('.', 'in')
        equal_test('.', ',')  # . and , are not directly connected.
# Example no. 3 (score: 0)
    def test_3_word_pairs_processing(self):
        """Validate vocabulary filtering and window-based word-pair counts."""

        def make_wpp(max_vocab_size, min_count):
            # All configurations share the same folders and window size.
            return WordPairsProcessing(
                max_vocab_size=max_vocab_size, min_count=min_count,
                dicts_folder=self.dicts_folder, window_size=self.max_window_size,
                edges_folder=self.edges_folder, graph_folder=self.graph_folder,
                safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])

        # test valid vocabulary under several (max_vocab_size, min_count) settings
        for max_vocab, min_count, expected_size in (
                (None, 1, 94),
                (None, 3, 9),
                (self.max_vocab_size, self.min_count, self.max_vocab_size),
                (None, self.min_count, 6)):
            valid_vocab = make_wpp(max_vocab, min_count).write_valid_vocabulary()
            self.assertEqual(len(valid_vocab), expected_size)

        # test word pairs of a specific window size
        result = make_wpp(None, self.min_count).apply(process_num=self.process_num)
        word2id = util.read_two_columns_file_to_build_dictionary_type_specified(
            file=self.dicts_folder + 'dict_merged.txt', key_type=str, value_type=int)

        def pair_count(source, target):
            # Edge keys are (source_id, target_id) tuples of string ids.
            return result[(str(word2id[source]), str(word2id[target]))]

        expected_pairs = {
            ('and', ','): 2, ('and', '.'): 2, ('and', 'the'): 1,
            ('the', 'of'): 6, ('the', '.'): 2, ('the', 'and'): 3,
            ('the', 'in'): 1, ('the', ','): 2,
            ('of', '.'): 3, ('of', 'the'): 2, ('of', 'and'): 3,
            ('of', 'in'): 2, ('of', ','): 1,
            ('in', '.'): 1, ('in', 'the'): 5, ('in', 'and'): 1,
            ('in', ','): 1,
            (',', 'and'): 2, (',', 'in'): 1, (',', 'the'): 1,
        }
        for (source, target), count in expected_pairs.items():
            self.assertEqual(pair_count(source, target), count)

        self.assertEqual(len(result), 20 + 3)  # 3 self loops

        # With a capped vocabulary only the most frequent tokens survive.
        result = make_wpp(self.max_vocab_size, self.min_count).apply(process_num=self.process_num)
        self.assertEqual(pair_count('and', 'the'), 1)

        self.assertEqual(pair_count('the', 'of'), 6)
        self.assertEqual(pair_count('the', 'and'), 3)

        self.assertEqual(pair_count('of', 'the'), 2)
        self.assertEqual(pair_count('of', 'and'), 3)

        self.assertEqual(len(result), 5 + 2)  # 2 self loops
# Example no. 4 (score: 0)
class TestGraphBuilder(unittest.TestCase):
    """ ATTENTION
    Normally, data_folder and output_folder should be user defined paths (absolute paths).
    For unittest, as input and output folders locations are fixed, these two paths are exceptionally relative paths.
    """
    # Fixture folder shared by all graph-builder tests.
    data_folder = '../test/output/keep/'

    # undirected paths
    # Encoded edge counts produced with window size 6 and an uncapped (None) vocabulary.
    encoded_edges_count_undirected_file = 'encoded_edges_count_window_size_6_vocab_size_none_undirected_for_unittest.txt'
    merged_dict_undirected_file = 'dict_merged_undirected_for_unittest.txt'
    # token (str) -> wordId (int) mapping, loaded once at class-definition time.
    word2wordId_undirected = read_two_columns_file_to_build_dictionary_type_specified(
        file=data_folder + merged_dict_undirected_file, key_type=str, value_type=int)
    valid_vocabulary_undirected_file = 'valid_vocabulary_min_count_5_undirected.txt'

    @staticmethod
    def get_matrix_value_by_token_xy(matrix, nodes, word2wordId, token_x, token_y):
        """Return the matrix cell addressed by two token names.

        ``nodes`` lists the wordId of each matrix row/column; tokens are
        translated token -> wordId -> matrix position before indexing.
        """
        node_ids = list(nodes)
        row = node_ids.index(word2wordId[token_x])
        col = node_ids.index(word2wordId[token_y])
        return matrix[row, col]

    def test_NoGraph_get_stochastic_matrix(self):
        """Spot-check transition probabilities in the NoGraph stochastic matrix."""
        no_graph = gb.NoGraph(self.data_folder + self.encoded_edges_count_undirected_file,
                              valid_vocabulary_path=self.data_folder + self.valid_vocabulary_undirected_file)
        matrix = no_graph.get_stochastic_matrix(remove_self_loops=True)

        # (token_x, token_y, expected value): each expectation is the edge
        # weight divided by the total weight of token_x's edges.
        expectations = (
            ('.', 'the', 2 / (2 + 2 + 3 + 1)),
            ('and', 'the', 4 / (2 + 1 + 4 + 3 + 4)),
            ('the', ',', 3 / (3 + 4 + 2 + 6 + 8)),
            (',', '.', 0),
            ('in', ',', 2 / (2 + 2 + 6 + 1 + 1)),
        )
        for token_x, token_y, probability in expectations:
            cell = self.get_matrix_value_by_token_xy(
                matrix, no_graph.graph_index2wordId, self.word2wordId_undirected,
                token_x, token_y)
            self.assertTrue(cell == probability)

    def test_NXGraph_get_stochastic_matrix(self):
        """Spot-check transition probabilities in the NXGraph stochastic matrix."""
        # Undirected
        graph = gb.NXGraph.from_encoded_edges_count_file(
            path=self.data_folder + self.encoded_edges_count_undirected_file, directed=False)
        graph.print_graph_information()

        matrix = graph.get_stochastic_matrix(remove_self_loops=True)
        graph_index2wordId = graph.graph.nodes()

        # check weight based transition probability:
        # (token_x, token_y, expected value) where the expectation is the edge
        # weight divided by the total weight of token_x's edges.
        expectations = (
            ('.', 'the', 2 / (2 + 2 + 3 + 1)),
            ('and', 'the', 4 / (2 + 1 + 4 + 3 + 4)),
            ('the', ',', 3 / (3 + 4 + 2 + 6 + 8)),
            (',', '.', 0),
            ('in', ',', 2 / (2 + 2 + 6 + 1 + 1)),
        )
        for token_x, token_y, probability in expectations:
            cell = self.get_matrix_value_by_token_xy(
                matrix, graph_index2wordId, self.word2wordId_undirected, token_x, token_y)
            self.assertTrue(cell == probability)

    def test_NoGraph_t_step_random_walk(self):
        """Check composition and row-normalisation of NoGraph t-step random walks."""
        no_graph = gb.NoGraph(self.data_folder + self.encoded_edges_count_undirected_file,
                              valid_vocabulary_path=self.data_folder + self.valid_vocabulary_undirected_file)
        # t=1 step random walk (= the stochastic/transition matrix itself)
        _, one_step = no_graph.get_t_step_random_walk_stochastic_matrix(t=1, remove_self_loops=True)
        # t=2 steps random walk
        _, two_step = no_graph.get_t_step_random_walk_stochastic_matrix(t=2, remove_self_loops=True)

        # Cell [3, 5] of the 2-step matrix is row 3 of the 1-step matrix
        # dotted with column 5 of the transition matrix.
        value_sum = 0
        for k in range(6):
            value_sum += one_step[3, k] * one_step[k, 5]
        np.testing.assert_array_almost_equal(value_sum, two_step[3, 5])

        # t=3 steps random walk
        _, three_step = no_graph.get_t_step_random_walk_stochastic_matrix(t=3, remove_self_loops=True)

        # Every row of a stochastic matrix must sum to 1.
        for row in range(three_step.shape[0]):
            np.testing.assert_array_almost_equal(np.sum(three_step[row]), 1)
        # 3-step cell = 2-step row composed with the 1-step transition matrix.
        value_sum = 0
        for k in range(6):
            value_sum += two_step[3, k] * one_step[k, 5]
        np.testing.assert_array_almost_equal(value_sum, three_step[3, 5])

    def test_NXGraph_t_step_random_walk(self):
        """Check composition and row-normalisation of NXGraph t-step random walks."""
        graph = gb.NXGraph.from_encoded_edges_count_file(
            path=self.data_folder + self.encoded_edges_count_undirected_file, directed=False)
        # t=1 step random walk (the transition matrix itself)
        _, matrix1 = graph.get_t_step_random_walk_stochastic_matrix(t=1, remove_self_loops=True)
        # t=2 steps random walk
        _, matrix2 = graph.get_t_step_random_walk_stochastic_matrix(t=2, remove_self_loops=True)

        # check the calculation of cell value.
        value_sum = 0
        for i in range(6):
            value_sum += matrix1[3, i] * matrix1[i, 5]
        # Tolerant float comparison: exact equality (the previous `==`) is
        # brittle for accumulated float products, and this matches the
        # NoGraph version of this test.
        np.testing.assert_array_almost_equal(value_sum, matrix2[3, 5])

        # t=3 steps random walk
        _, matrix3 = graph.get_t_step_random_walk_stochastic_matrix(t=3, remove_self_loops=True)

        # check the sum of each line in matrix equals to 1
        for i in range(0, matrix3.shape[0]):
            np.testing.assert_array_almost_equal(np.sum(matrix3[i]), 1)
        # check the calculation of cell value.
        value_sum = 0
        for i in range(6):
            value_sum += matrix2[3, i] * matrix1[i, 5]  # matrix1 is the transition matrix
        np.testing.assert_array_almost_equal(value_sum, matrix3[3, 5])