Example No. 1
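A unit-test method: WordsGenerator streams co-occurring word pairs out of a text corpus, and GraphToolWrapper accumulates them as edges of a graph-tool graph.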
    def test_graph_tool_wrapper(self):
        start_time = time.time()
        # Stream co-occurring word pairs straight out of the raw corpus.
        wg = wordpair_generator.WordsGenerator(window_size=self.max_window_size, file_parser='txt',
                                               xml_node_path=None, word_tokenizer='', wtokenizer=Tokenizer.mytok,
                                               remove_numbers=False, remove_punctuations=False,
                                               stem_word=False, lowercase=False)

        # Each pair becomes an edge in a graph-tool graph.
        gtw = graph_tool_wrapper.GraphToolWrapper('Test')
        for w1, w2 in wg(self.data_folder + '/tmp_dir/'):
            gtw.addPairs(w1, w2)
        print(util.count_time(start_time))
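The method reads self.max_window_size and self.data_folder from its test class. A minimal harness sketch, assuming a plain unittest.TestCase; the class name, setUp values, and corpus2graph import paths are all assumptions, not confirmed by the snippet:

import time
import unittest

from corpus2graph import util, Tokenizer, wordpair_generator   # assumed import path
from corpus2graph.applications import graph_tool_wrapper       # assumed import path


class GraphToolWrapperTest(unittest.TestCase):  # hypothetical class name
    def setUp(self):
        self.max_window_size = 5    # assumed window size
        self.data_folder = 'data'   # assumed corpus location

    # ... test_graph_tool_wrapper from above goes here ...


if __name__ == '__main__':
    unittest.main()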
Example No. 2
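The full pipeline driven by a config mapping: word processing, then sentence processing, then word-pair processing, and finally loading the resulting edge file into graph-tool.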
start_time = time.time()
wp = WordProcessing(output_folder=dicts_folder,  # head of the truncated call, restored from Example No. 3
                    word_tokenizer='',
                    wtokenizer=Tokenizer.mytok,
                    remove_numbers=False,
                    remove_punctuations=False,
                    stem_word=False,
                    lowercase=False)
merged_dict = wp.apply(data_folder=data_folder, process_num=process_num)

sp = SentenceProcessing(
    dicts_folder=dicts_folder,
    output_folder=edges_folder,
    max_window_size=max_window_size,
    local_dict_extension=config['graph']['local_dict_extension'])
word_count_all = sp.apply(data_folder=dicts_folder, process_num=process_num)

wpp = WordPairsProcessing(max_vocab_size=max_vocab_size,
                          min_count=min_count,
                          dicts_folder=dicts_folder,
                          window_size=max_window_size,
                          edges_folder=edges_folder,
                          graph_folder=graph_folder,
                          safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
result = wpp.apply(process_num=process_num)

# igt = networkx_wrapper.IGraphWrapper('Test')
# igt.add_edges_from_file(path=graph_folder+'encoded_edges_count_window_size_5.txt')
print('[corpus2graph] time in seconds:', util.count_time(start_time))
gtw = graph_tool_wrapper.GraphToolWrapper('Test')
gtw.addEdgesFromFile(path=graph_folder + 'encoded_edges_count_window_size_5.txt')
print('[corpus2graph] time in seconds:', util.count_time(start_time))
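The snippet reads two keys from a config mapping. A minimal sketch of the structure those lookups imply; only the two key names come from the code above, the values are assumptions:

config = {
    'graph': {
        'local_dict_extension': '.dicloc',       # extension used literally in Example No. 3
        'safe_files_number_per_processor': 200,  # assumed value
    }
}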
Example No. 3
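A docopt-based command-line entry point: run the whole pipeline with all, or one stage at a time with wordprocessing, sentenceprocessing, or wordpairsprocessing.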
# Imports this entry point needs; the corpus2graph import paths below are
# assumptions inferred from how the names are used, not confirmed by the snippet.
import time

from docopt import docopt

from corpus2graph import util, Tokenizer
from corpus2graph import WordProcessing, SentenceProcessing, WordPairsProcessing


def main():
    arguments = docopt(__doc__, version='1.0.0')

    data_folder = arguments['<data_dir>']
    if not data_folder.endswith('/'):
        data_folder += '/'
    output_folder = arguments['<output_dir>']
    if not output_folder.endswith('/'):
        output_folder += '/'
    dicts_folder = output_folder + 'dicts_and_encoded_texts/'
    edges_folder = output_folder + 'edges/'
    graph_folder = output_folder + 'graph/'

    util.mkdir_p(output_folder)
    util.mkdir_p(dicts_folder)
    util.mkdir_p(edges_folder)
    util.mkdir_p(graph_folder)

    max_window_size = int(arguments['--max_window_size'])
    process_num = int(arguments['--process_num'])
    min_count = int(arguments['--min_count'])
    max_vocab_size = int(arguments['--max_vocab_size'])
    safe_files_number_per_processor = int(
        arguments['--safe_files_number_per_processor'])

    if arguments['all']:
        start_time = time.time()
        wp = WordProcessing(output_folder=dicts_folder,
                            word_tokenizer='',
                            wtokenizer=Tokenizer.mytok,
                            remove_numbers=False,
                            remove_punctuations=False,
                            stem_word=False,
                            lowercase=False)
        merged_dict = wp.apply(data_folder=data_folder,
                               process_num=process_num)
        sp = SentenceProcessing(dicts_folder=dicts_folder,
                                output_folder=edges_folder,
                                max_window_size=max_window_size,
                                local_dict_extension='.dicloc')
        word_count_all = sp.apply(data_folder=dicts_folder,
                                  process_num=process_num)
        wpp = WordPairsProcessing(
            max_vocab_size=max_vocab_size,
            min_count=min_count,
            dicts_folder=dicts_folder,
            window_size=max_window_size,
            edges_folder=edges_folder,
            graph_folder=graph_folder,
            safe_files_number_per_processor=safe_files_number_per_processor)
        result = wpp.apply(process_num=process_num)
        # wpp.multiprocessing_merge_edges_count_of_a_specific_window_size(process_num=process_num, already_existed_window_size=4)
        print('time in seconds:', util.count_time(start_time))

    if arguments['wordprocessing']:
        start_time = time.time()
        wp = WordProcessing(output_folder=dicts_folder,
                            word_tokenizer='',
                            wtokenizer=Tokenizer.mytok,
                            remove_numbers=False,
                            remove_punctuations=False,
                            stem_word=False,
                            lowercase=False)
        merged_dict = wp.apply(data_folder=data_folder,
                               process_num=process_num)
        print('time for word processing in seconds:',
              util.count_time(start_time))

    if arguments['sentenceprocessing']:
        start_time = time.time()
        sp = SentenceProcessing(dicts_folder=dicts_folder,
                                output_folder=edges_folder,
                                max_window_size=max_window_size,
                                local_dict_extension='.dicloc')
        word_count_all = sp.apply(data_folder=dicts_folder,
                                  process_num=process_num)
        print('time for sentence processing in seconds:',
              util.count_time(start_time))

    if arguments['wordpairsprocessing']:
        start_time = time.time()
        wpp = WordPairsProcessing(
            max_vocab_size=max_vocab_size,
            min_count=min_count,
            dicts_folder=dicts_folder,
            window_size=max_window_size,
            edges_folder=edges_folder,
            graph_folder=graph_folder,
            safe_files_number_per_processor=safe_files_number_per_processor)
        result = wpp.apply(process_num=process_num)
        # wpp.multiprocessing_merge_edges_count_of_a_specific_window_size(process_num=process_num, already_existed_window_size=4)
        print('time for word pairs processing in seconds:',
              util.count_time(start_time))
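docopt builds the parser from the module docstring, so __doc__ must declare every command and option read above. A hypothetical usage string consistent with those lookups; the command and option names come from the code, the defaults are assumptions:

"""corpus2graph.

Usage:
  corpus2graph (all | wordprocessing | sentenceprocessing | wordpairsprocessing)
               <data_dir> <output_dir> [options]

Options:
  --max_window_size=<n>                  Largest co-occurrence window.  [default: 5]
  --process_num=<n>                      Number of worker processes.  [default: 4]
  --min_count=<n>                        Minimum word frequency kept.  [default: 5]
  --max_vocab_size=<n>                   Vocabulary size cap.  [default: 10000]
  --safe_files_number_per_processor=<n>  Files per worker batch.  [default: 200]
"""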
Example No. 4
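The word-pair stage run on its own, with the earlier sentence-processing stage and the follow-up analysis (undirected-edge conversion, loading into NoGraph, stochastic matrix) kept as commented-out code.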
# Commented-out sentence-processing stage (opening line restored from Example No. 2):
# sp = SentenceProcessing(dicts_folder=dicts_folder, output_folder=edges_folder,
#                         max_window_size=max_window_size, local_dict_extension=config['graph']['local_dict_extension'])
# word_count_all = sp.apply(data_folder=dicts_folder, process_num=process_num)
# print('time in seconds:', util.count_time(start_time))

start_time = time.time()
wpp = WordPairsProcessing(max_vocab_size=max_vocab_size,
                          min_count=min_count,
                          dicts_folder=dicts_folder,
                          window_size=max_window_size,
                          edges_folder=edges_folder,
                          graph_folder=graph_folder,
                          safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
result = wpp.apply(process_num=process_num)
# wpp.multiprocessing_merge_edges_count_of_a_specific_window_size(process_num=process_num, already_existed_window_size=4)
print('time in seconds:', util.count_time(start_time))

# # convert edges for undirected graph
# wpp.convert_encoded_edges_count_for_undirected_graph(
#     old_encoded_edges_count_path=graph_folder + 'encoded_edges_count_window_size_5.txt')

# from corpus2graph.applications import graph_builder as gb
#
# # load into NoGraph and calculate stochastic matrix
# start_time = time.time()
# no_graph = gb.NoGraph(graph_folder + 'encoded_edges_count_window_size_5_undirected.txt',
#                       valid_vocabulary_path=dicts_folder + 'valid_vocabulary_min_count_5_vocab_size_10000.txt')
# print('[load into NoGraph] time in seconds:', util.count_time(start_time))
# start_time = time.time()
# matrix = no_graph.get_stochastic_matrix(remove_self_loops=False)
# print('[calculate stochastic matrix] time in seconds:', util.count_time(start_time))
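If the NoGraph block is uncommented, the stochastic matrix turns co-occurrence counts into row-normalized transition probabilities. A sketch of one random-walk step over it, assuming get_stochastic_matrix returns a row-stochastic 2-D array; whether it is dense or sparse is not confirmed by the snippet:

import numpy as np

def random_walk_step(matrix, distribution):
    # One step of a random walk: the new distribution over nodes is p @ P,
    # where P is row-stochastic (each row sums to 1).
    return distribution @ matrix

# Illustration only, with `matrix` standing for get_stochastic_matrix()'s result:
# n = matrix.shape[0]
# p0 = np.zeros(n); p0[0] = 1.0       # start at node 0
# p1 = random_walk_step(matrix, p0)   # distribution after one step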