def __init__(self, n_results=100, document_from='2020-01-01', document_end='2021-03-31', target_from='2021-04-01', target_end='2021-04-15'):
    """Initialise external helpers, fetch both news windows, and create
    empty accumulator tables for the TF-IDF pipeline.

    Args:
        n_results: number of top keywords to keep in the final output.
        document_from/document_end: date range of the background corpus.
        target_from/target_end: date range of the articles to score.
    """
    # External helpers (DB access, Korean tokenizer, word normaliser).
    self.magazine_db = MagazineDB()
    self.okt = Okt()
    self.text_cleaner = TextCleaner()
    # Background corpus (IDF source) and the target window to extract from.
    self.total_news = self.magazine_db.get_news(from_date=document_from, end_date=document_end)
    self.target_news = self.magazine_db.get_news(from_date=target_from, end_date=target_end)
    # Accumulators, filled step by step by the pipeline.
    self.inverse_dict: INVERSE_DICT = defaultdict(set)
    self.inverse_dict_target: INVERSE_DICT = defaultdict(set)
    self.tf_dict_per_id: DefaultDict[DOC_ID, DefaultDict[WORD, TF]] = defaultdict(lambda: defaultdict(int))
    self.tf_dict: DefaultDict[WORD, TF] = defaultdict(int)
    self.idf_dict = {}
    self.tf_idf_dict = {}
    self.n_results = n_results
class BaseNewsTfIdf(BaseTfIdf):
    """Base TF store builder over news documents.

    Fetches news rows from MagazineDB for a date range, lazily builds and
    pickles a per-document term-frequency cache (``all_tf_by_doc``), and
    offers helpers to derive an inverted index / TF tables from any
    iterable of ``(doc_id, text)`` pairs.

    Fixes vs. previous revision:
      * ``load_data`` no longer uses a mutable default argument.
      * ``pipeline`` now takes ``self`` (calling it as a method previously
        raised TypeError instead of the intended NotImplementedError).
    """

    def __init__(self, document_from='2020-01-01', document_end='2021-03-31', data_root_path='data', n_results=300, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.text_cleaner = TextCleaner()
        self.magazine_db = MagazineDB()
        # Make sure the on-disk cache directory exists before any save.
        os.makedirs(data_root_path, exist_ok=True)
        self.data_root_path = data_root_path
        self.n_results = n_results
        self.total_news = self.magazine_db.get_news(from_date=document_from, end_date=document_end)
        # Cache of {doc_id: {word: tf}}; populated lazily by get_docs_by_word.
        self.all_tf_by_doc: TF_STORE = None
        # Column positions of (id, content) within a raw news row.
        self.news_id_idx = 0
        self.news_content_idx = 3

    def load_data(self, files=None):
        """Restore pickled attributes from ``data_root_path``.

        ``all_tf_by_doc`` is always attempted; ``files`` may name extra
        attributes. A missing pickle leaves the attribute set to None.

        Args:
            files: optional list of additional attribute names to load.
        """
        # None sentinel instead of a mutable default list.
        for name in ['all_tf_by_doc', *(files or [])]:
            file_name = f"{os.path.join(self.data_root_path, name)}.pkl"
            if os.path.isfile(file_name):
                with open(file_name, 'rb') as f:
                    self.__dict__[name] = pickle.load(f)
            else:
                self.__dict__[name] = None

    def get_docs_by_word(self, docs: Iterable[Tuple[DOC_ID, str]], desc):
        """Build an inverted index and per-doc TF table for ``docs``.

        Documents already present in the ``all_tf_by_doc`` cache are reused;
        new ones are tokenised (``get_nouns`` — defined on the base class),
        cleaned, counted, and added to the cache, which is re-pickled.

        Returns:
            (docs_by_word, tf_by_id): word -> {doc ids}, and
            doc id -> {word: tf} as a plain dict.
        """
        alls = self.all_tf_by_doc
        # self.ddint is presumably a picklable int-defaultdict factory from
        # the base class (lambdas can't be pickled) — TODO confirm.
        alls = alls if alls is not None else defaultdict(self.ddint)
        tf_by_id = defaultdict(self.ddint)
        docs_by_word = defaultdict(set)
        for doc_id, txt in tqdm(docs, desc=desc):
            if doc_id in alls:
                # Cache hit: replay the stored counts.
                for noun, count in alls[doc_id].items():
                    docs_by_word[noun].add(doc_id)
                    tf_by_id[doc_id][noun] = count
            else:
                # Cache miss: tokenise, clean and count, updating the cache.
                nouns = self.get_nouns(txt)
                for noun in self.text_cleaner.get_clean_words(nouns):
                    docs_by_word[noun].add(doc_id)
                    tf_by_id[doc_id][noun] += 1
                    alls[doc_id][noun] += 1
        self.all_tf_by_doc = alls
        # Persist the (possibly grown) cache for the next run.
        self.save_pickle(path='all_tf_by_doc', target=self.all_tf_by_doc)
        return docs_by_word, dict(tf_by_id)

    def get_id_content_from_news(self, news):
        """Yield (id, content) pairs projected out of raw news rows."""
        return map(lambda x: (x[self.news_id_idx], x[self.news_content_idx]), news)

    def pipeline(self):
        """Subclasses must implement the end-to-end processing pipeline."""
        raise NotImplementedError("In Pipeline")
def __init__(self, document_from='2020-01-01', document_end='2021-03-31', data_root_path='data', n_results=300, *args, **kwargs):
    """Prepare the data directory, DB/cleaner helpers, the background news
    corpus, and the (initially empty) per-document TF cache.

    Args:
        document_from/document_end: date range of the news to fetch.
        data_root_path: directory where pickled caches are stored.
        n_results: number of results kept by downstream steps.
    """
    super().__init__(*args, **kwargs)
    # Guarantee the cache directory exists before anything writes to it.
    os.makedirs(data_root_path, exist_ok=True)
    self.data_root_path = data_root_path
    self.n_results = n_results
    self.text_cleaner = TextCleaner()
    self.magazine_db = MagazineDB()
    self.total_news = self.magazine_db.get_news(from_date=document_from, end_date=document_end)
    # Lazy cache of {doc_id: {word: tf}}; None until first built/loaded.
    self.all_tf_by_doc: TF_STORE = None
    # Column positions of (id, content) within a raw news row.
    self.news_id_idx = 0
    self.news_content_idx = 3
def __init__(self, n_results=300, document_from='2020-01-01', document_end='2021-03-31', target_from='2021-04-01', target_end='2021-04-15'):
    """Create DB/tokenizer helpers, fetch both news windows, and reset
    every lookup table used by the TF-IDF pipeline to an empty dict.

    Args:
        n_results: number of top keywords kept in the final output.
        document_from/document_end: background-corpus date range (IDF).
        target_from/target_end: target-window date range (TF).
    """
    self.n_results = n_results
    self.magazine_db = MagazineDB()
    self.okt = Okt()
    self.text_cleaner = TextCleaner()
    self.total_news = self.magazine_db.get_news(
        from_date=document_from, end_date=document_end)
    self.target_news = self.magazine_db.get_news(
        from_date=target_from, end_date=target_end)
    # Empty lookup tables; each pipeline step fills one of them.
    for table in ('inverse_dict', 'idf_dict', 'tf_dict_per_id',
                  'inverse_dict_target', 'tf_dict', 'tf_idf_dict'):
        setattr(self, table, {})
# --- Script fragment: flatten keywords, then count document frequency over recent news ---
# NOTE(review): this span begins mid-loop — `k`, `v` and `keywords` come from
# lines not visible here (presumably iterating a keyword mapping's items).
if isinstance(v, list):
    keywords += v
else:
    keywords.append(v)
keywords.append(str(k))


def get_clean_word(s):
    """Keep only alphanumeric characters of *s* and lowercase the result."""
    return ''.join(filter(str.isalnum, s)).lower()


# ================================================================
clean_keywords = set(map(get_clean_word, keywords))
okt = Okt()
magazine_db = MagazineDB()
total_news = magazine_db.get_news(from_date='2021-04-12', end_date='2021-05-11')
not_in_keywords = set()
doc_frequency = defaultdict(int)  # keyword -> number of articles containing it
url_dict = defaultdict(list)      # keyword -> urls of the articles containing it
for id, news_date, news_title, news_content, news_url in tqdm(total_news):
    nouns = okt.morphs(news_content, norm=True, stem=True)
    # Deduplicate per article so each article counts at most once per word.
    nouns = set(nouns)  # for Document Frequency
    for noun in nouns:
        if noun in clean_keywords:
            doc_frequency[noun] += 1
            url_dict[noun].append(news_url)
        else:
            # NOTE(review): else-branch is cut off in this view —
            # presumably `not_in_keywords.add(noun)`; confirm against the full file.
class TF_IDF_GENERATOR():
    """End-to-end TF-IDF keyword extractor over news articles (plain-dict variant).

    Workflow: build (or load) an inverted index over a background corpus
    (total_news), derive IDF values from it, count term frequencies over a
    target window (target_news), score words, and save the top keywords
    together with the target articles that contain them. Every intermediate
    table is pickled under data/ so steps can be skipped on re-runs.
    """

    def __init__(self, n_results=300, document_from='2020-01-01', document_end='2021-03-31', target_from='2021-04-01', target_end='2021-04-15'):
        self.magazine_db = MagazineDB()
        self.okt = Okt()
        self.text_cleaner = TextCleaner()
        # Background corpus used only for IDF statistics.
        self.total_news = self.magazine_db.get_news(
            from_date=document_from, end_date=document_end)
        # Target window whose keywords we want to surface.
        self.target_news = self.magazine_db.get_news(
            from_date=target_from, end_date=target_end)
        self.inverse_dict = {}          # word -> set of doc ids (background)
        self.idf_dict = {}              # word -> idf value
        self.tf_dict_per_id = {}        # target doc id -> {word: tf}
        self.inverse_dict_target = {}   # word -> set of doc ids (target)
        self.tf_dict = {}               # word -> summed tf over target docs
        self.tf_idf_dict = {}           # word -> final score
        self.n_results = n_results

    def load_data(self):
        """Restore the background inverted index from its pickle.

        NOTE(review): get_clean_words() is applied to the unpickled object,
        but build_inverse_dict() saves a {word: set(ids)} mapping — confirm
        get_clean_words handles a dict, not just a word sequence.
        """
        with open('data/inverse_dict.pkl', 'rb') as f:
            self.inverse_dict = self.text_cleaner.get_clean_words(
                pickle.load(f))
        # with open('data/idf_dict.pkl', 'rb') as f:
        #     self.idf_dict = pickle.load(f)
        # with open('data/tf_dict_per_id.pkl', 'rb') as f:
        #     self.tf_dict_per_id = pickle.load(f)
        # with open('data/inverse_dict_target.pkl', 'rb') as f:
        #     self.inverse_dict_target = pickle.load(f)
        # with open('data/tf_dict.pkl', 'rb') as f:
        #     self.tf_dict = pickle.load(f)
        # with open('data/tf_idf_dict.pkl', 'rb') as f:
        #     self.tf_idf_dict = pickle.load(f)

    def build_inverse_dict(self, save_result=True, save_path='data/inverse_dict.pkl'):
        """Build word -> {doc ids} over the background corpus and pickle it."""
        for id, news_date, news_title, news_content, news_url in tqdm(self.total_news):
            # nouns = self.okt.nouns(news_content)  # morpheme extraction
            nouns = self.okt.morphs(news_content, norm=True, stem=True)
            for noun in self.text_cleaner.get_clean_words(nouns):
                if self.inverse_dict.get(noun) is not None:
                    self.inverse_dict[noun].add(id)
                else:
                    self.inverse_dict[noun] = {id}
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.inverse_dict, f)

    def calculate_idf(self, save_result=True, save_path='data/idf_dict.pkl'):
        """Compute idf = log(N / (df + 1)) per word; '__default_value__' = log(N)
        acts as the fallback for words never seen in the background corpus."""
        for k, v in self.inverse_dict.items():
            self.idf_dict[k] = math.log(len(self.total_news) / (len(v) + 1))
        self.idf_dict['__default_value__'] = math.log(len(self.total_news))
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.idf_dict, f)

    def build_target_dict(self, save_result=True, save_path_tf='data/tf_dict_per_id.pkl', save_path_inv='data/inverse_dict_target.pkl'):
        """Count per-document term frequencies over the target window and
        build the target-side inverted index; pickle both."""
        for id, news_date, news_title, news_content, news_url in tqdm(self.target_news):
            # nouns = self.okt.nouns(news_content)
            nouns = self.okt.morphs(news_content, norm=True, stem=True)
            for noun in self.text_cleaner.get_clean_words(nouns):
                if self.tf_dict_per_id.get(id) is None:
                    self.tf_dict_per_id[id] = {}
                if self.tf_dict_per_id[id].get(noun) is not None:
                    self.tf_dict_per_id[id][noun] += 1
                else:
                    self.tf_dict_per_id[id][noun] = 1
                if self.inverse_dict_target.get(noun) is not None:
                    self.inverse_dict_target[noun].add(id)
                else:
                    self.inverse_dict_target[noun] = {id}
        if save_result:
            with open(save_path_tf, 'wb') as f:
                pickle.dump(self.tf_dict_per_id, f)
            with open(save_path_inv, 'wb') as f:
                pickle.dump(self.inverse_dict_target, f)

    def aggregate_tf_dict(self, save_result=True, save_path='data/tf_dict.pkl'):
        """Sum the per-document TFs into a corpus-wide TF per word."""
        for v in self.tf_dict_per_id.values():
            for k in v.keys():
                if self.tf_dict.get(k) is not None:
                    self.tf_dict[k] += v[k]
                else:
                    self.tf_dict[k] = v[k]
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.tf_dict, f)

    def build_tf_idf_dict(self, save_result=True, save_path='data/tf_idf_dict.pkl'):
        """Score each word. This variant uses IDF only as a rarity cut-off:
        log-TF is kept when idf > 3.0, otherwise the score is zeroed."""
        for k, v in self.tf_dict.items():
            idf_val = self.idf_dict.get(k, self.idf_dict['__default_value__'])
            tf_val = math.log(1 + v)
            # self.tf_idf_dict[k] = tf_val * idf_val
            self.tf_idf_dict[k] = tf_val if idf_val > 3.0 else 0
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.tf_idf_dict, f)

    def get_keywords_from_news(self, news_id):
        """Return the 5 highest tf*idf words of a single target article."""
        tf_dict = self.tf_dict_per_id[news_id]
        tf_idf_dict = {}
        for k, v in tf_dict.items():
            idf_val = self.idf_dict.get(k, self.idf_dict['__default_value__'])
            tf_val = math.log(1 + v)
            tf_idf_dict[k] = tf_val * idf_val
        sorted_tf_idf = sorted(tf_idf_dict.items(),
                               key=lambda x: x[1], reverse=True)
        keywords = [x[0] for x in sorted_tf_idf[:5]]
        return keywords

    def build_final_result(self, save_result=True, save_path='data/final_results.pkl', n_results=100):
        """For the top n_results keywords, gather the target articles that
        contain each one, attach the article's TF for that keyword, sort by
        that TF and keep the 10 most frequent; pickle the mapping."""
        print("n_results: ", n_results)
        tf_idf_sorted = sorted(self.tf_idf_dict.items(),
                               key=lambda x: x[1], reverse=True)
        tf_idf_sorted = tf_idf_sorted[:n_results]
        keyword_id_dict = {}
        final_results = {}
        news_dict = {}
        id_list = []
        for keyword, tf_idf_val in tf_idf_sorted:
            keyword_id_dict[keyword] = self.inverse_dict_target[keyword]
            id_list = id_list + list(keyword_id_dict[keyword])
        # One DB round-trip for every article referenced by any keyword.
        news_list = self.magazine_db.get_news_by_id(id_list)
        for id, news_date, news_title, news_content, news_url in news_list:
            news_dict[id] = (news_date, news_title, news_content, news_url)
        for keyword, ids in keyword_id_dict.items():
            final_results[keyword] = []
            for each_id in ids:
                final_results[keyword].append(
                    news_dict[each_id] + (self.tf_dict_per_id[each_id][keyword],))
            # Index 4 is the appended keyword TF: most frequent first, top 10 kept.
            final_results[keyword].sort(key=lambda x: x[4], reverse=True)
            final_results[keyword] = final_results[keyword][:10]
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(final_results, f)

    def pipeline(self):
        """Run all steps in order. build_inverse_dict is skipped here — its
        pickle is expected to exist already and is loaded by load_data."""
        self.load_data()
        # self.build_inverse_dict(save_result=True, save_path='data/inverse_dict.pkl')
        self.calculate_idf(save_result=True, save_path='data/idf_dict.pkl')
        self.build_target_dict(save_result=True,
                               save_path_tf='data/tf_dict_per_id.pkl',
                               save_path_inv='data/inverse_dict_target.pkl')
        self.aggregate_tf_dict(save_result=True, save_path='data/tf_dict.pkl')
        self.build_tf_idf_dict(
            save_result=True, save_path='data/tf_idf_dict.pkl')
        self.build_final_result(
            save_result=True, save_path='data/final_results.pkl', n_results=self.n_results)

    def test_result(self):
        """Interactively dump the saved results; waits for Enter per keyword."""
        with open('data/final_results.pkl', 'rb') as f:
            dat = pickle.load(f)
        for k, v in dat.items():
            print('\n\n---------------------------------------')
            print(k)
            print('\n')
            for i in v:
                # Tuple layout: (date, title, content, url, keyword tf).
                print('\t', i[0])
                print('\t', i[1])
                print('\t', i[3])
                print('\t', i[4])
            print('---------------------------------------')
            input()

    def write_csv(self):
        """Export the saved results as CSV rows: keyword, date, title, url, tf.
        utf-8-sig makes the file open correctly in Excel."""
        with open('data/final_results.pkl', 'rb') as f:
            dat = pickle.load(f)
        with open('data/final_results.csv', 'w', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            for k, v in dat.items():
                for i in v:
                    writer.writerow([k, i[0], i[1], i[3], i[4]])
class TF_IDF_GENERATOR():
    """End-to-end TF-IDF keyword extractor over news articles (defaultdict variant).

    Workflow: build (or load) an inverted index over a background corpus
    (total_news), derive IDF values from it, count term frequencies over a
    target window (target_news), score words with tf*idf, and save the top
    keywords together with the target articles containing them (each article
    annotated with its own top-5 keywords). Intermediate tables are pickled
    under data/ so steps can be skipped on re-runs.
    """

    def __init__(self, n_results=100, document_from='2020-01-01', document_end='2021-03-31', target_from='2021-04-01', target_end='2021-04-15'):
        self.magazine_db = MagazineDB()
        self.okt = Okt()
        self.text_cleaner = TextCleaner()
        # Background corpus used only for IDF statistics.
        self.total_news = self.magazine_db.get_news(from_date=document_from,
                                                    end_date=document_end)
        # Target window whose keywords we want to surface.
        self.target_news = self.magazine_db.get_news(from_date=target_from,
                                                     end_date=target_end)
        self.inverse_dict: INVERSE_DICT = defaultdict(set)            # word -> ids (background)
        self.idf_dict = {}                                            # word -> idf
        self.tf_dict_per_id: DefaultDict[DOC_ID, DefaultDict[
            WORD, TF]] = defaultdict(lambda: defaultdict(int))        # doc id -> {word: tf}
        self.inverse_dict_target: INVERSE_DICT = defaultdict(set)     # word -> ids (target)
        self.tf_dict: DefaultDict[WORD, TF] = defaultdict(int)        # word -> summed tf
        self.tf_idf_dict = {}                                         # word -> score
        self.n_results = n_results

    def load_data(self):
        """Restore the background inverted index from its pickle.

        NOTE(review): get_clean_words() is applied to the unpickled object,
        but build_inverse_dict() saves a {word: set(ids)} mapping — confirm
        get_clean_words handles a dict, not just a word sequence.
        """
        with open('data/inverse_dict.pkl', 'rb') as f:
            self.inverse_dict = self.text_cleaner.get_clean_words(
                pickle.load(f))
        # with open('data/idf_dict.pkl', 'rb') as f:
        #     self.idf_dict = pickle.load(f)
        # with open('data/tf_dict_per_id.pkl', 'rb') as f:
        #     self.tf_dict_per_id = pickle.load(f)
        # with open('data/inverse_dict_target.pkl', 'rb') as f:
        #     self.inverse_dict_target = pickle.load(f)
        # with open('data/tf_dict.pkl', 'rb') as f:
        #     self.tf_dict = pickle.load(f)
        # with open('data/tf_idf_dict.pkl', 'rb') as f:
        #     self.tf_idf_dict = pickle.load(f)

    def build_inverse_dict(self, save_result=True, save_path='data/inverse_dict.pkl'):
        """Build word -> {doc ids} over the background corpus and pickle it."""
        for id, news_date, news_title, news_content, news_url in tqdm(
                self.total_news, desc="Build Total News"):
            # nouns = self.okt.nouns(news_content)
            nouns = self.okt.morphs(news_content, norm=True, stem=True)
            for noun in self.text_cleaner.get_clean_words(nouns):
                self.inverse_dict[noun].add(id)
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.inverse_dict, f)

    def calculate_idf(self, save_result=True, save_path='data/idf_dict.pkl'):
        """Compute idf = log(N / (df + 1)) per word; '__default_value__' = log(N)
        acts as the fallback for words never seen in the background corpus."""
        for k, v in self.inverse_dict.items():
            self.idf_dict[k] = math.log(len(self.total_news) / (len(v) + 1))
        self.idf_dict['__default_value__'] = math.log(len(self.total_news))
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.idf_dict, f)

    def build_target_dict(self, save_result=True, save_path_tf='data/tf_dict_per_id.pkl', save_path_inv='data/inverse_dict_target.pkl'):
        """Count per-document term frequencies over the target window and
        build the target-side inverted index; pickle both."""
        for id, news_date, news_title, news_content, news_url in tqdm(
                self.target_news, desc="Build Target News"):
            # nouns = self.okt.nouns(news_content)
            nouns = self.okt.morphs(news_content, norm=True, stem=True)
            for noun in self.text_cleaner.get_clean_words(nouns):
                self.tf_dict_per_id[id][noun] += 1
                self.inverse_dict_target[noun].add(id)
        if save_result:
            with open(save_path_tf, 'wb') as f:
                # Cast to plain dict: the lambda default_factory can't be pickled.
                pickle.dump(dict(self.tf_dict_per_id), f)
            with open(save_path_inv, 'wb') as f:
                pickle.dump(self.inverse_dict_target, f)

    def aggregate_tf_dict(self, save_result=True, save_path='data/tf_dict.pkl'):
        """Sum the per-document TFs into a corpus-wide TF per word."""
        for v in self.tf_dict_per_id.values():
            for k in v.keys():
                self.tf_dict[k] += v[k]
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.tf_dict, f)

    def build_tf_idf_dict(self, save_result=True, save_path='data/tf_idf_dict.pkl'):
        """Score each word as log(1 + tf) * idf and pickle the result."""
        for k, v in self.tf_dict.items():
            idf_val = self.idf_dict.get(k, self.idf_dict['__default_value__'])
            tf_val = math.log(1 + v)
            self.tf_idf_dict[k] = tf_val * idf_val
            # Alternative (disabled): use idf only as a rarity cut-off.
            # self.tf_idf_dict[k] = tf_val if idf_val > 3.0 else 0
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.tf_idf_dict, f)

    def get_keywords_from_news(self, news_id):
        """Return the 5 highest tf*idf words of a single target article."""
        tf_dict = self.tf_dict_per_id[news_id]
        tf_idf_dict = {}
        for k, v in tf_dict.items():
            idf_val = self.idf_dict.get(k, self.idf_dict['__default_value__'])
            tf_val = math.log(1 + v)
            tf_idf_dict[k] = tf_val * idf_val
        sorted_tf_idf = sorted(tf_idf_dict.items(),
                               key=lambda x: x[1], reverse=True)
        keywords = [x[0] for x in sorted_tf_idf[:5]]
        return keywords

    def build_final_result(self, save_result=True, save_path='data/final_results_default.pkl', n_results=10):
        """For the top n_results keywords, gather the target articles that
        contain each one, annotating every article with its own top-5
        keywords; pickle the keyword -> article list mapping."""
        tf_idf_sorted = sorted(self.tf_idf_dict.items(),
                               key=lambda x: x[1], reverse=True)
        tf_idf_sorted = tf_idf_sorted[:n_results]
        keyword_id_dict: Dict[WORD, Set] = {}
        final_results = defaultdict(list)
        news_dict = {}
        id_list = []
        for keyword, tf_idf_val in tf_idf_sorted:
            # Map each top-scored noun key to its target-document id set.
            keyword_id_dict[keyword] = self.inverse_dict_target[keyword]
            id_list += list(keyword_id_dict[keyword])
        # One DB round-trip for every article referenced by any keyword.
        news_list = self.magazine_db.get_news_by_id(id_list)
        for id, news_date, news_title, news_content, news_url in news_list:
            # The highest-impact keywords within this particular article.
            news_keywords = self.get_keywords_from_news(id)
            news_dict[id] = (news_date, news_title,
                             news_content, news_url, news_keywords)
        for keyword, ids in keyword_id_dict.items():
            for each_id in ids:
                final_results[keyword].append(news_dict[each_id])
        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(final_results, f)

    def pipeline(self):
        """Run all steps in order. build_inverse_dict is skipped here — its
        pickle is expected to exist already and is loaded by load_data."""
        self.load_data()
        # self.build_inverse_dict(save_result=True, save_path='data/inverse_dict.pkl')
        self.calculate_idf(save_result=True, save_path='data/idf_dict.pkl')
        self.build_target_dict(save_result=True,
                               save_path_tf='data/tf_dict_per_id.pkl',
                               save_path_inv='data/inverse_dict_target.pkl')
        self.aggregate_tf_dict(save_result=True, save_path='data/tf_dict.pkl')
        self.build_tf_idf_dict(save_result=True,
                               save_path='data/tf_idf_dict.pkl')
        self.build_final_result(save_result=True,
                                save_path='data/final_results_default.pkl',
                                n_results=self.n_results)

    def test_result(self):
        """Interactively dump the saved results; waits for Enter per keyword."""
        with open('data/final_results_default.pkl', 'rb') as f:
            dat = pickle.load(f)
        for k, v in dat.items():
            print('\n\n---------------------------------------')
            print(k)
            print('\n')
            for i in v:
                # Tuple layout: (date, title, content, url, article keywords).
                print('\t', i[0])
                print('\t', i[1])
                print('\t', i[3])
                print('\t', i[4])
            print('---------------------------------------')
            input()

    def write_csv(self):
        """Export results as CSV rows: keyword, date, title, url, article keywords.
        utf-8-sig makes the file open correctly in Excel."""
        with open('data/final_results_default.pkl', 'rb') as f:
            dat = pickle.load(f)
        with open('data/final_results_default.csv', 'w', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            for k, v in dat.items():
                for i in v:
                    writer.writerow([k, i[0], i[1], i[3], i[4]])