def process_refine(news_route_list):
    news_refine = []
    for news_route in news_route_list:
        refine_data = refining.refin_new_day(news_route)
        if refine_data is not None and len(refine_data) > 0:
            news_refine.append(refine_data)
    # news_refine starts as a list, so it can never be None;
    # check for an empty result instead.
    if not news_refine:
        print_log('refine result is empty. please check refine process '
                  'or check news directory.')
        return None
    return news_refine
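# A minimal usage sketch for process_refine(). The directory paths below are
# hypothetical examples, not real project layout; each route is expected to
# point at one day's news directory:
if __name__ == '__main__':
    example_routes = ['./news/day1', './news/day2']  # hypothetical paths
    refined_days = process_refine(example_routes)
    if refined_days is not None:
        print_log('refined {} day(s) of news'.format(len(refined_days)))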
def news_process(self):
    # Reload Settings: multiprocessing workers do not share memory, so each
    # process loads its own copy (path hardcoded in this version).
    Settings.settings(r'../files/settings.csv')

    # refine: normalize one day's worth of raw news
    start_read_news = time.time()
    refine_data = refining.refin_new_day(self.route)
    refine_data = json.dumps(refine_data)

    # recoding: apply the replacement dictionary before parsing
    recode_dict = recoding.load_recode_dict(Settings.recoding_dict_route)
    recoding_data = recoding.replace_all(recode_dict, refine_data)
    self.news = knlp.read_news(newsdata=recoding_data)
    end_read_news = time.time()

    log_read_news = 'os{} : {} - read_news() - {}'.format(
        os.getpid(), self.name, str(end_read_news - start_read_news))
    print(log_read_news)

    # konlp: extract nouns, then flatten the per-article lists
    start_konlpy = time.time()
    self.nouns = knlp.get_KoNLP(self.news,
                                Settings.konlp_class,
                                Settings.konlp_function,
                                userDict=Settings.user_dict_route)
    self.nouns = list(itertools.chain.from_iterable(self.nouns))
    end_konlp = time.time()

    log_nouns = 'os{} : {} - Twitter.nouns() - {}'.format(
        os.getpid(), self.name, str(end_konlp - start_konlpy))
    print(log_nouns)

    self.write_nouns(Settings.result_route + '_nouns')

    # count: build a word/count frame; sort_index(axis=1) only orders the
    # columns ('word' before 'count'), it does not sort the rows
    start_count = time.time()
    count_data = count.get_unique_count(self.nouns)
    self.counts = pandas.DataFrame({'word': list(count_data.keys()),
                                    'count': list(count_data.values())})
    self.counts = self.counts.sort_index(axis=1, ascending=False)
    end_count = time.time()

    log_count = 'os{} : {} - count.get_unique_count() - {}'.format(
        os.getpid(), self.name, str(end_count - start_count))
    print(log_count)

    # filter: apply the word filter from Settings.filter_route
    start_filter = time.time()
    self.result = filter.filter_count(self.counts, Settings.filter_route)
    end_filter = time.time()

    log_filter = 'os{} : {} - filter.filter_count() - {}'.format(
        os.getpid(), self.name, str(end_filter - start_filter))
    print(log_filter)

    self.process_log = '\n'.join(
        [log_read_news, log_nouns, log_count, log_filter])
    self.write_result(Settings.result_route)
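# The start/stop timing and log formatting above repeats for every stage; a
# small context manager is one way to factor it out. A minimal sketch, not
# part of the original code -- stage_timer is a hypothetical helper name:
import contextlib
import os
import time


@contextlib.contextmanager
def stage_timer(name, stage, log_list=None):
    # Times the enclosed block and emits the same
    # 'os<pid> : <name> - <stage> - <seconds>' line the methods build by hand.
    start = time.time()
    yield
    line = 'os{} : {} - {} - {}'.format(os.getpid(), name, stage,
                                        time.time() - start)
    print(line)
    if log_list is not None:
        log_list.append(line)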
def news_process(self):
    # Reload Settings: multiprocessing workers do not share memory,
    # so each process has to load its own copy.
    Settings.settings(settings_route)
    log_list = []

    # start
    start_news = time.time()
    log_start_news = self.get_news_debug_string('- start news process')
    print(log_start_news)
    log_list.append(log_start_news)

    # refine
    start_refine_news = time.time()
    refine_data = refining.refin_new_day(self.route)
    refine_data = json.dumps(refine_data)
    end_refine_news = time.time()

    log_refine_news = self.get_news_debug_string('- refine - {}'.format(
        str(end_refine_news - start_refine_news)))
    print(log_refine_news)
    log_list.append(log_refine_news)

    # recoding
    start_recoding_news = time.time()
    recode_dict = recoding.load_recode_dict(Settings.recoding_dict_route)
    recoding_data = recoding.replace_all(recode_dict, refine_data)
    self.news = knlp.read_news(newsdata=recoding_data)
    end_recoding_news = time.time()

    log_recoding_news = self.get_news_debug_string(
        '- recoding - {}'.format(
            str(end_recoding_news - start_recoding_news)))
    print(log_recoding_news)
    log_list.append(log_recoding_news)

    # konlp
    start_konlpy = time.time()
    self.nouns = knlp.get_KoNLP(self.news,
                                Settings.konlp_class,
                                Settings.konlp_function,
                                userDict=Settings.user_dict_route)
    self.nouns = list(itertools.chain.from_iterable(self.nouns))
    end_konlp = time.time()

    log_nouns = self.get_news_debug_string('- Twitter.nouns - {}'.format(
        str(end_konlp - start_konlpy)))
    print(log_nouns)
    log_list.append(log_nouns)

    self.write_nouns(Settings.result_route + '_nouns')

    # count
    start_count = time.time()
    count_data = count.get_unique_count(self.nouns)
    self.counts = pandas.DataFrame({'word': list(count_data.keys()),
                                    'count': list(count_data.values())})
    self.counts = self.counts.sort_index(axis=1, ascending=False)
    end_count = time.time()

    log_count = self.get_news_debug_string('- count - {}'.format(
        str(end_count - start_count)))
    print(log_count)
    log_list.append(log_count)

    # filter
    start_filter = time.time()
    self.result = filter.filter_count(self.counts, Settings.filter_route)
    end_filter = time.time()

    log_filter = self.get_news_debug_string('- filter - {}'.format(
        str(end_filter - start_filter)))
    print(log_filter)
    log_list.append(log_filter)

    end_news = time.time()
    log_end_news = self.get_news_debug_string(
        '- end news process - {}'.format(str(end_news - start_news)))
    print(log_end_news)
    log_list.append(log_end_news)

    # save log: join with newlines so each stage stays on its own line
    self.process_log = '\n'.join(log_list)

    # save result
    self.write_csv(self.result, Settings.result_route, modifier='result')
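# A sketch of how news_process might be driven per day across processes,
# which is why it reloads Settings at the top. The worker class name
# (NewsWorker) and the day routes are hypothetical, not from the original
# code:
import multiprocessing


def _run_day(route):
    # Each child process builds its own worker and runs the full pipeline;
    # nothing is shared with the parent process.
    worker = NewsWorker(route)  # hypothetical constructor
    worker.news_process()
    return worker.process_log


if __name__ == '__main__':
    day_routes = ['./news/day1', './news/day2']  # hypothetical paths
    with multiprocessing.Pool() as pool:
        logs = pool.map(_run_day, day_routes)
    print('\n'.join(logs))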