def test_graph_tool_wrapper(self):
    start_time = time.time()
    wg = wordpair_generator.WordsGenerator(window_size=self.max_window_size, file_parser='txt',
                                           xml_node_path=None, word_tokenizer='',
                                           wtokenizer=Tokenizer.mytok, remove_numbers=False,
                                           remove_punctuations=False, stem_word=False,
                                           lowercase=False)
    gtw = graph_tool_wrapper.GraphToolWrapper('Test')
    # Stream co-occurring word pairs from the corpus and add them as graph edges.
    for w1, w2 in wg(self.data_folder + '/tmp_dir/'):
        gtw.addPairs(w1, w2)
    print(util.count_time(start_time))
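# For reference, a minimal sketch of what GraphToolWrapper.addPairs is assumed
# to do with graph-tool directly (assumptions: one vertex per distinct word,
# one directed edge per co-occurring pair; add_pair is a hypothetical helper):
#
#     import graph_tool.all as gt
#
#     g = gt.Graph(directed=True)
#     word2vertex = {}
#
#     def add_pair(w1, w2):
#         for w in (w1, w2):
#             if w not in word2vertex:
#                 word2vertex[w] = g.add_vertex()  # one vertex per new word
#         g.add_edge(word2vertex[w1], word2vertex[w2])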
# Run the full pipeline (words -> sentences -> word pairs), then load the
# resulting edges into graph-tool.
start_time = time.time()
wp = WordProcessing(output_folder=dicts_folder, word_tokenizer='',
                    wtokenizer=Tokenizer.mytok, remove_numbers=False,
                    remove_punctuations=False, stem_word=False, lowercase=False)
merged_dict = wp.apply(data_folder=data_folder, process_num=process_num)
sp = SentenceProcessing(dicts_folder=dicts_folder, output_folder=edges_folder,
                        max_window_size=max_window_size,
                        local_dict_extension=config['graph']['local_dict_extension'])
word_count_all = sp.apply(data_folder=dicts_folder, process_num=process_num)
wpp = WordPairsProcessing(max_vocab_size=max_vocab_size, min_count=min_count,
                          dicts_folder=dicts_folder, window_size=max_window_size,
                          edges_folder=edges_folder, graph_folder=graph_folder,
                          safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
result = wpp.apply(process_num=process_num)
# igt = networkx_wrapper.IGraphWrapper('Test')
# igt.add_edges_from_file(path=graph_folder + 'encoded_edges_count_window_size_5.txt')
print('[corpus2graph] time in seconds:', util.count_time(start_time))
gtw = graph_tool_wrapper.GraphToolWrapper('Test')
gtw.addEdgesFromFile(path=graph_folder + 'encoded_edges_count_window_size_5.txt')
print('[corpus2graph] time in seconds:', util.count_time(start_time))
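# The config dict used above comes from the project configuration, which this
# fragment does not show; a minimal sketch of the keys the test relies on
# (values are illustrative assumptions, except '.dicloc', which main() below
# passes literally):
#
#     config = {
#         'graph': {
#             'local_dict_extension': '.dicloc',
#             'safe_files_number_per_processor': 200,  # assumed value
#         }
#     }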
def main():
    arguments = docopt(__doc__, version='1.0.0')

    data_folder = arguments['<data_dir>']
    if not data_folder.endswith('/'):
        data_folder += '/'
    output_folder = arguments['<output_dir>']
    if not output_folder.endswith('/'):
        output_folder += '/'

    dicts_folder = output_folder + 'dicts_and_encoded_texts/'
    edges_folder = output_folder + 'edges/'
    graph_folder = output_folder + 'graph/'
    util.mkdir_p(output_folder)
    util.mkdir_p(dicts_folder)
    util.mkdir_p(edges_folder)
    util.mkdir_p(graph_folder)

    max_window_size = int(arguments['--max_window_size'])
    process_num = int(arguments['--process_num'])
    min_count = int(arguments['--min_count'])
    max_vocab_size = int(arguments['--max_vocab_size'])
    safe_files_number_per_processor = int(arguments['--safe_files_number_per_processor'])

    if arguments['all']:
        # Full pipeline: word processing -> sentence processing -> word-pair processing.
        start_time = time.time()
        wp = WordProcessing(output_folder=dicts_folder, word_tokenizer='',
                            wtokenizer=Tokenizer.mytok, remove_numbers=False,
                            remove_punctuations=False, stem_word=False, lowercase=False)
        merged_dict = wp.apply(data_folder=data_folder, process_num=process_num)
        sp = SentenceProcessing(dicts_folder=dicts_folder, output_folder=edges_folder,
                                max_window_size=max_window_size,
                                local_dict_extension='.dicloc')
        word_count_all = sp.apply(data_folder=dicts_folder, process_num=process_num)
        wpp = WordPairsProcessing(max_vocab_size=max_vocab_size, min_count=min_count,
                                  dicts_folder=dicts_folder, window_size=max_window_size,
                                  edges_folder=edges_folder, graph_folder=graph_folder,
                                  safe_files_number_per_processor=safe_files_number_per_processor)
        result = wpp.apply(process_num=process_num)
        # wpp.multiprocessing_merge_edges_count_of_a_specific_window_size(process_num=process_num, already_existed_window_size=4)
        print('time in seconds:', util.count_time(start_time))

    if arguments['wordprocessing']:
        start_time = time.time()
        wp = WordProcessing(output_folder=dicts_folder, word_tokenizer='',
                            wtokenizer=Tokenizer.mytok, remove_numbers=False,
                            remove_punctuations=False, stem_word=False, lowercase=False)
        merged_dict = wp.apply(data_folder=data_folder, process_num=process_num)
        print('time for word processing in seconds:', util.count_time(start_time))

    if arguments['sentenceprocessing']:
        start_time = time.time()
        sp = SentenceProcessing(dicts_folder=dicts_folder, output_folder=edges_folder,
                                max_window_size=max_window_size,
                                local_dict_extension='.dicloc')
        word_count_all = sp.apply(data_folder=dicts_folder, process_num=process_num)
        print('time for sentence processing in seconds:', util.count_time(start_time))

    if arguments['wordpairsprocessing']:
        start_time = time.time()
        wpp = WordPairsProcessing(max_vocab_size=max_vocab_size, min_count=min_count,
                                  dicts_folder=dicts_folder, window_size=max_window_size,
                                  edges_folder=edges_folder, graph_folder=graph_folder,
                                  safe_files_number_per_processor=safe_files_number_per_processor)
        result = wpp.apply(process_num=process_num)
        # wpp.multiprocessing_merge_edges_count_of_a_specific_window_size(process_num=process_num, already_existed_window_size=4)
        print('time for word pairs processing in seconds:', util.count_time(start_time))
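# docopt builds the command-line interface from the module docstring, so
# __doc__ must declare every command and option read in main(). The project's
# actual docstring is not part of this fragment; an illustrative sketch
# consistent with the arguments parsed above:
#
#     """corpus2graph.
#
#     Usage:
#       corpus2graph (all | wordprocessing | sentenceprocessing | wordpairsprocessing)
#                    <data_dir> <output_dir>
#                    [--max_window_size=<n>] [--process_num=<n>] [--min_count=<n>]
#                    [--max_vocab_size=<n>] [--safe_files_number_per_processor=<n>]
#     """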
#                         max_window_size=max_window_size,
#                         local_dict_extension=config['graph']['local_dict_extension'])
# word_count_all = sp.apply(data_folder=dicts_folder, process_num=process_num)
# print('time in seconds:', util.count_time(start_time))

start_time = time.time()
wpp = WordPairsProcessing(max_vocab_size=max_vocab_size, min_count=min_count,
                          dicts_folder=dicts_folder, window_size=max_window_size,
                          edges_folder=edges_folder, graph_folder=graph_folder,
                          safe_files_number_per_processor=config['graph']['safe_files_number_per_processor'])
result = wpp.apply(process_num=process_num)
# wpp.multiprocessing_merge_edges_count_of_a_specific_window_size(process_num=process_num, already_existed_window_size=4)
print('time in seconds:', util.count_time(start_time))

# Convert edges for an undirected graph:
# wpp.convert_encoded_edges_count_for_undirected_graph(
#     old_encoded_edges_count_path=graph_folder + 'encoded_edges_count_window_size_5.txt')

# Load into NoGraph and calculate the stochastic matrix:
# from corpus2graph.applications import graph_builder as gb
#
# start_time = time.time()
# no_graph = gb.NoGraph(graph_folder + 'encoded_edges_count_window_size_5_undirected.txt',
#                       valid_vocabulary_path=dicts_folder + 'valid_vocabulary_min_count_5_vocab_size_10000.txt')
# print('[load into NoGraph] time in seconds:', util.count_time(start_time))
# start_time = time.time()
# matrix = no_graph.get_stochastic_matrix(remove_self_loops=False)
# print('[calculate stochastic matrix] time in seconds:', util.count_time(start_time))
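# A quick sanity check on the produced edges file, assuming (not confirmed by
# this fragment) the common whitespace-separated "source target count" layout,
# could use networkx's weighted-edgelist reader:
#
#     import networkx as nx
#
#     g = nx.read_weighted_edgelist(
#         graph_folder + 'encoded_edges_count_window_size_5_undirected.txt',
#         nodetype=int)
#     print(g.number_of_nodes(), g.number_of_edges())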