Example #1
File: main.py Project: seongpil0948/tf-idf
    def __init__(self,
                 n_results=100,
                 document_from='2020-01-01',
                 document_end='2021-03-31',
                 target_from='2021-04-01',
                 target_end='2021-04-15'):
        self.magazine_db = MagazineDB()
        self.okt = Okt()
        self.text_cleaner = TextCleaner()

        self.total_news = self.magazine_db.get_news(from_date=document_from,
                                                    end_date=document_end)
        self.target_news = self.magazine_db.get_news(from_date=target_from,
                                                     end_date=target_end)

        self.inverse_dict: INVERSE_DICT = defaultdict(set)
        self.idf_dict = {}

        self.tf_dict_per_id: DefaultDict[DOC_ID, DefaultDict[
            WORD, TF]] = defaultdict(lambda: defaultdict(int))
        self.inverse_dict_target: INVERSE_DICT = defaultdict(set)

        self.tf_dict: DefaultDict[WORD, TF] = defaultdict(int)
        self.tf_idf_dict = {}
        self.n_results = n_results
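
The annotations above use type aliases defined elsewhere in main.py. A plausible reconstruction, inferred purely from how the aliases are used across these examples (the project's actual definitions may differ):

from typing import DefaultDict, Set

WORD = str
DOC_ID = int  # assumed; the database may equally well use string ids
TF = int      # a raw term count
INVERSE_DICT = DefaultDict[WORD, Set[DOC_ID]]
TF_STORE = DefaultDict[DOC_ID, DefaultDict[WORD, TF]]  # used in Example #2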
Example #2
# Standard-library and third-party imports used by this example; BaseTfIdf,
# MagazineDB, TextCleaner and the TF_STORE/DOC_ID aliases are project-local.
import os
import pickle
from collections import defaultdict
from typing import Iterable, Optional, Tuple

from tqdm import tqdm


class BaseNewsTfIdf(BaseTfIdf):
    def __init__(self,
                 document_from='2020-01-01',
                 document_end='2021-03-31',
                 data_root_path='data',
                 n_results=300,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.text_cleaner = TextCleaner()
        self.magazine_db = MagazineDB()
        os.makedirs(data_root_path, exist_ok=True)
        self.data_root_path = data_root_path
        self.n_results = n_results
        self.total_news = self.magazine_db.get_news(from_date=document_from,
                                                    end_date=document_end)
        self.all_tf_by_doc: Optional[TF_STORE] = None
        self.news_id_idx = 0
        self.news_content_idx = 3

    def load_data(self, files=()):
        # default to an immutable tuple to avoid the mutable-default pitfall
        for i in ['all_tf_by_doc', *files]:
            file_name = f"{os.path.join(self.data_root_path, i)}.pkl"
            if os.path.isfile(file_name):
                with open(file_name, 'rb') as f:
                    setattr(self, i, pickle.load(f))
            else:
                setattr(self, i, None)

    def get_docs_by_word(self, docs: Iterable[Tuple[DOC_ID, str]], desc):
        alls = self.all_tf_by_doc
        alls = alls if alls is not None else defaultdict(self.ddint)
        tf_by_id = defaultdict(self.ddint)
        docs_by_word = defaultdict(set)

        for doc_id, txt in tqdm(docs, desc=desc):
            if doc_id in alls:
                for noun, count in alls[doc_id].items():
                    docs_by_word[noun].add(doc_id)
                    tf_by_id[doc_id][noun] = count
            else:
                nouns = self.get_nouns(txt)
                for noun in self.text_cleaner.get_clean_words(nouns):
                    docs_by_word[noun].add(doc_id)
                    tf_by_id[doc_id][noun] += 1
                    alls[doc_id][noun] += 1

        self.all_tf_by_doc = alls
        self.save_pickle(path='all_tf_by_doc', target=self.all_tf_by_doc)

        return docs_by_word, dict(tf_by_id)

    def get_id_content_from_news(self, news):
        return map(lambda x: (x[self.news_id_idx], x[self.news_content_idx]),
                   news)

    def pipeline(self):
        raise NotImplementedError("In Pipeline")
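
Example #2 calls three helpers that live on the BaseTfIdf base class and are not shown: ddint, get_nouns, and save_pickle. A minimal sketch of plausible implementations, inferred from the call sites rather than taken from the repository (the okt attribute is an assumption):

import os
import pickle
from collections import defaultdict


class BaseTfIdf:
    def ddint(self):
        # named default_factory for nested term-frequency maps
        # (a lambda here would break pickling of all_tf_by_doc)
        return defaultdict(int)

    def get_nouns(self, txt):
        # assumed to tokenize into normalized, stemmed morphemes, matching
        # the okt.morphs(..., norm=True, stem=True) calls in the other examples
        return self.okt.morphs(txt, norm=True, stem=True)

    def save_pickle(self, path, target):
        file_name = f"{os.path.join(self.data_root_path, path)}.pkl"
        with open(file_name, 'wb') as f:
            pickle.dump(target, f)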
Example #3
        # (excerpt begins mid-loop: flattening a keyword mapping into a list)
        if isinstance(v, list):
            keywords += v
        else:
            keywords.append(v)
        keywords.append(str(k))


def get_clean_word(s):
    return ''.join(filter(str.isalnum, s)).lower()


# ================================================================

clean_keywords = set(map(get_clean_word, keywords))
okt = Okt()
magazine_db = MagazineDB()
total_news = magazine_db.get_news(from_date='2021-04-12',
                                  end_date='2021-05-11')

not_in_keywords = set()
doc_frequency = defaultdict(int)
url_dict = defaultdict(list)

for id, news_date, news_title, news_content, news_url in tqdm(total_news):
    nouns = okt.morphs(news_content, norm=True, stem=True)
    nouns = set(nouns)  # for Document Frequency
    for noun in nouns:
        if noun in clean_keywords:
            doc_frequency[noun] += 1
            url_dict[noun].append(news_url)
        else:
            not_in_keywords.add(noun)
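
A hypothetical follow-up, not part of the project, that reports the most document-frequent keywords together with one sample article URL each:

# print the 20 keywords that appear in the most documents
for noun, freq in sorted(doc_frequency.items(),
                         key=lambda x: x[1], reverse=True)[:20]:
    print(noun, freq, url_dict[noun][0])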
Example #4
# Imports this example relies on; MagazineDB and TextCleaner are
# project-local helpers from the tf-idf repository.
import csv
import math
import pickle

from konlpy.tag import Okt
from tqdm import tqdm


class TF_IDF_GENERATOR():
    def __init__(self, n_results=300, document_from='2020-01-01', document_end='2021-03-31', target_from='2021-04-01', target_end='2021-04-15'):
        self.magazine_db = MagazineDB()
        self.okt = Okt()
        self.text_cleaner = TextCleaner()

        self.total_news = self.magazine_db.get_news(
            from_date=document_from, end_date=document_end)
        self.target_news = self.magazine_db.get_news(
            from_date=target_from, end_date=target_end)

        self.inverse_dict = {}
        self.idf_dict = {}

        self.tf_dict_per_id = {}
        self.inverse_dict_target = {}

        self.tf_dict = {}
        self.tf_idf_dict = {}
        self.n_results = n_results

    def load_data(self):
        with open('data/inverse_dict.pkl', 'rb') as f:
            self.inverse_dict = self.text_cleaner.get_clean_words(
                pickle.load(f))
        # with open('data/idf_dict.pkl', 'rb') as f:
        #     self.idf_dict = pickle.load(f)
        # with open('data/tf_dict_per_id.pkl', 'rb') as f:
        #     self.tf_dict_per_id = pickle.load(f)
        # with open('data/inverse_dict_target.pkl', 'rb') as f:
        #     self.inverse_dict_target = pickle.load(f)
        # with open('data/tf_dict.pkl', 'rb') as f:
        #     self.tf_dict = pickle.load(f)
        # with open('data/tf_idf_dict.pkl', 'rb') as f:
        #     self.tf_idf_dict = pickle.load(f)

    def build_inverse_dict(self, save_result=True, save_path='data/inverse_dict.pkl'):
        for id, news_date, news_title, news_content, news_url in tqdm(self.total_news):
            # nouns = self.okt.nouns(news_content)
            # extract morphemes (normalized and stemmed)
            nouns = self.okt.morphs(news_content, norm=True, stem=True)
            for noun in self.text_cleaner.get_clean_words(nouns):
                if self.inverse_dict.get(noun) is not None:
                    self.inverse_dict[noun].add(id)
                else:
                    self.inverse_dict[noun] = {id}

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.inverse_dict, f)

    def calculate_idf(self, save_result=True, save_path='data/idf_dict.pkl'):
        for k, v in self.inverse_dict.items():
            self.idf_dict[k] = math.log(len(self.total_news) / (len(v) + 1))
        # fallback IDF for words not seen in the corpus; set once, after the loop
        self.idf_dict['__default_value__'] = math.log(len(self.total_news))

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.idf_dict, f)

    def build_target_dict(self, save_result=True, save_path_tf='data/tf_dict_per_id.pkl', save_path_inv='data/inverse_dict_target.pkl'):
        for id, news_date, news_title, news_content, news_url in tqdm(self.target_news):
            # nouns = self.okt.nouns(news_content)
            nouns = self.okt.morphs(news_content, norm=True, stem=True)
            for noun in self.text_cleaner.get_clean_words(nouns):
                if self.tf_dict_per_id.get(id) is None:
                    self.tf_dict_per_id[id] = {}

                if self.tf_dict_per_id[id].get(noun) is not None:
                    self.tf_dict_per_id[id][noun] += 1
                else:
                    self.tf_dict_per_id[id][noun] = 1

                if self.inverse_dict_target.get(noun) is not None:
                    self.inverse_dict_target[noun].add(id)
                else:
                    self.inverse_dict_target[noun] = {id}

        if save_result:
            with open(save_path_tf, 'wb') as f:
                pickle.dump(self.tf_dict_per_id, f)
            with open(save_path_inv, 'wb') as f:
                pickle.dump(self.inverse_dict_target, f)

    def aggregate_tf_dict(self, save_result=True, save_path='data/tf_dict.pkl'):
        for v in self.tf_dict_per_id.values():
            for k in v.keys():
                if self.tf_dict.get(k) is not None:
                    self.tf_dict[k] += v[k]
                else:
                    self.tf_dict[k] = v[k]

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.tf_dict, f)

    def build_tf_idf_dict(self, save_result=True, save_path='data/tf_idf_dict.pkl'):
        for k, v in self.tf_dict.items():
            idf_val = self.idf_dict.get(k, self.idf_dict['__default_value__'])
            tf_val = math.log(1 + v)
            # self.tf_idf_dict[k] = tf_val * idf_val
            self.tf_idf_dict[k] = tf_val if idf_val > 3.0 else 0

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.tf_idf_dict, f)

    def get_keywords_from_news(self, news_id):
        tf_dict = self.tf_dict_per_id[news_id]
        tf_idf_dict = {}
        for k, v in tf_dict.items():
            idf_val = self.idf_dict.get(k, self.idf_dict['__default_value__'])
            tf_val = math.log(1 + v)
            tf_idf_dict[k] = tf_val * idf_val

        sorted_tf_idf = sorted(tf_idf_dict.items(),
                               key=lambda x: x[1], reverse=True)
        keywords = [x[0] for x in sorted_tf_idf[:5]]

        return keywords

    def build_final_result(self, save_result=True, save_path='data/final_results.pkl', n_results=100):
        print("n_results: ", n_results)
        tf_idf_sorted = sorted(self.tf_idf_dict.items(),
                               key=lambda x: x[1], reverse=True)
        tf_idf_sorted = tf_idf_sorted[:n_results]
        keyword_id_dict = {}
        final_results = {}
        news_dict = {}
        id_list = []
        for keyword, tf_idf_val in tf_idf_sorted:
            keyword_id_dict[keyword] = self.inverse_dict_target[keyword]
            id_list = id_list + list(keyword_id_dict[keyword])

        news_list = self.magazine_db.get_news_by_id(id_list)
        for id, news_date, news_title, news_content, news_url in news_list:
            news_dict[id] = (news_date, news_title, news_content, news_url)

        for keyword, ids in keyword_id_dict.items():
            final_results[keyword] = []
            for each_id in ids:
                final_results[keyword].append(
                    news_dict[each_id] + (self.tf_dict_per_id[each_id][keyword],))
            final_results[keyword].sort(key=lambda x: x[4], reverse=True)
            final_results[keyword] = final_results[keyword][:10]

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(final_results, f)

    def pipeline(self):
        self.load_data()
        # self.build_inverse_dict(save_result=True, save_path='data/inverse_dict.pkl')
        self.calculate_idf(save_result=True, save_path='data/idf_dict.pkl')
        self.build_target_dict(save_result=True, save_path_tf='data/tf_dict_per_id.pkl',
                               save_path_inv='data/inverse_dict_target.pkl')
        self.aggregate_tf_dict(save_result=True, save_path='data/tf_dict.pkl')
        self.build_tf_idf_dict(
            save_result=True, save_path='data/tf_idf_dict.pkl')
        self.build_final_result(
            save_result=True, save_path='data/final_results.pkl', n_results=self.n_results)

    def test_result(self):
        with open('data/final_results.pkl', 'rb') as f:
            dat = pickle.load(f)

        for k, v in dat.items():
            print('\n\n---------------------------------------')
            print(k)
            print('\n')
            for i in v:
                print('\t', i[0])
                print('\t', i[1])
                print('\t', i[3])
                print('\t', i[4])
            print('---------------------------------------')

            input()

    def write_csv(self):
        with open('data/final_results.pkl', 'rb') as f:
            dat = pickle.load(f)

        with open('data/final_results.csv', 'w', newline='',
                  encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            for k, v in dat.items():
                for i in v:
                    writer.writerow([k, i[0], i[1], i[3], i[4]])
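
For reference, the generators score a word from tf = log(1 + count) and idf = log(N / (df + 1)), with log(N) as the fallback idf for unseen words (this variant keeps tf only when idf exceeds 3.0). A quick sanity check with illustrative numbers, not project data:

import math

N, df, count = 10_000, 99, 7      # corpus size, document frequency, term count
idf_val = math.log(N / (df + 1))  # log(100) ≈ 4.605, clears the 3.0 cutoff
tf_val = math.log(1 + count)      # log(8) ≈ 2.079
print(round(idf_val, 3), round(tf_val * idf_val, 3))  # 4.605 9.576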
Example #5
File: main.py Project: seongpil0948/tf-idf
# Imports this example relies on; MagazineDB, TextCleaner and the
# WORD/DOC_ID/TF/INVERSE_DICT aliases are project-local (see the note
# after Example #1).
import csv
import math
import pickle
from collections import defaultdict
from typing import DefaultDict, Dict, Set

from konlpy.tag import Okt
from tqdm import tqdm


class TF_IDF_GENERATOR():
    def __init__(self,
                 n_results=100,
                 document_from='2020-01-01',
                 document_end='2021-03-31',
                 target_from='2021-04-01',
                 target_end='2021-04-15'):
        self.magazine_db = MagazineDB()
        self.okt = Okt()
        self.text_cleaner = TextCleaner()

        self.total_news = self.magazine_db.get_news(from_date=document_from,
                                                    end_date=document_end)
        self.target_news = self.magazine_db.get_news(from_date=target_from,
                                                     end_date=target_end)

        self.inverse_dict: INVERSE_DICT = defaultdict(set)
        self.idf_dict = {}

        self.tf_dict_per_id: DefaultDict[DOC_ID, DefaultDict[
            WORD, TF]] = defaultdict(lambda: defaultdict(int))
        self.inverse_dict_target: INVERSE_DICT = defaultdict(set)

        self.tf_dict: DefaultDict[WORD, TF] = defaultdict(int)
        self.tf_idf_dict = {}
        self.n_results = n_results

    def load_data(self):
        with open('data/inverse_dict.pkl', 'rb') as f:
            self.inverse_dict = self.text_cleaner.get_clean_words(
                pickle.load(f))
        # with open('data/idf_dict.pkl', 'rb') as f:
        #     self.idf_dict = pickle.load(f)
        # with open('data/tf_dict_per_id.pkl', 'rb') as f:
        #     self.tf_dict_per_id = pickle.load(f)
        # with open('data/inverse_dict_target.pkl', 'rb') as f:
        #     self.inverse_dict_target = pickle.load(f)
        # with open('data/tf_dict.pkl', 'rb') as f:
        #     self.tf_dict = pickle.load(f)
        # with open('data/tf_idf_dict.pkl', 'rb') as f:
        #     self.tf_idf_dict = pickle.load(f)

    def build_inverse_dict(self,
                           save_result=True,
                           save_path='data/inverse_dict.pkl'):
        for id, news_date, news_title, news_content, news_url in tqdm(
                self.total_news, desc="Build Total News"):
            # nouns = self.okt.nouns(news_content)
            nouns = self.okt.morphs(news_content, norm=True, stem=True)
            for noun in self.text_cleaner.get_clean_words(nouns):
                self.inverse_dict[noun].add(id)

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.inverse_dict, f)

    def calculate_idf(self, save_result=True, save_path='data/idf_dict.pkl'):
        for k, v in self.inverse_dict.items():
            self.idf_dict[k] = math.log(len(self.total_news) / (len(v) + 1))
        # fallback IDF for words not seen in the corpus; set once, after the loop
        self.idf_dict['__default_value__'] = math.log(len(self.total_news))

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.idf_dict, f)

    def build_target_dict(self,
                          save_result=True,
                          save_path_tf='data/tf_dict_per_id.pkl',
                          save_path_inv='data/inverse_dict_target.pkl'):
        for id, news_date, news_title, news_content, news_url in tqdm(
                self.target_news, desc="Build Target News"):
            # nouns = self.okt.nouns(news_content)
            nouns = self.okt.morphs(news_content, norm=True, stem=True)
            for noun in self.text_cleaner.get_clean_words(nouns):
                self.tf_dict_per_id[id][noun] += 1
                self.inverse_dict_target[noun].add(id)

        if save_result:
            with open(save_path_tf, 'wb') as f:
                pickle.dump(dict(self.tf_dict_per_id), f)
            with open(save_path_inv, 'wb') as f:
                pickle.dump(self.inverse_dict_target, f)

    def aggregate_tf_dict(self,
                          save_result=True,
                          save_path='data/tf_dict.pkl'):
        for v in self.tf_dict_per_id.values():
            for k in v.keys():
                self.tf_dict[k] += v[k]

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.tf_dict, f)

    def build_tf_idf_dict(self,
                          save_result=True,
                          save_path='data/tf_idf_dict.pkl'):
        for k, v in self.tf_dict.items():
            idf_val = self.idf_dict.get(k, self.idf_dict['__default_value__'])
            tf_val = math.log(1 + v)
            self.tf_idf_dict[k] = tf_val * idf_val
            # self.tf_idf_dict[k] = tf_val if idf_val > 3.0 else 0

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(self.tf_idf_dict, f)

    def get_keywords_from_news(self, news_id):
        tf_dict = self.tf_dict_per_id[news_id]
        tf_idf_dict = {}
        for k, v in tf_dict.items():
            idf_val = self.idf_dict.get(k, self.idf_dict['__default_value__'])
            tf_val = math.log(1 + v)
            tf_idf_dict[k] = tf_val * idf_val

        sorted_tf_idf = sorted(tf_idf_dict.items(),
                               key=lambda x: x[1],
                               reverse=True)
        keywords = [x[0] for x in sorted_tf_idf[:5]]

        return keywords

    def build_final_result(self,
                           save_result=True,
                           save_path='data/final_results_default.pkl',
                           n_results=10):
        tf_idf_sorted = sorted(self.tf_idf_dict.items(),
                               key=lambda x: x[1],
                               reverse=True)
        tf_idf_sorted = tf_idf_sorted[:n_results]
        keyword_id_dict: Dict[WORD, Set] = {}
        final_results = defaultdict(list)
        news_dict = {}
        id_list = []
        for keyword, tf_idf_val in tf_idf_sorted:
            # map each keyword to the set of target-document ids containing it
            keyword_id_dict[keyword] = self.inverse_dict_target[keyword]
            id_list += list(keyword_id_dict[keyword])
        # fetch the full news rows for every collected document id
        news_list = self.magazine_db.get_news_by_id(id_list)
        for id, news_date, news_title, news_content, news_url in news_list:
            # the most influential keywords within this article
            news_keywords = self.get_keywords_from_news(id)
            news_dict[id] = (news_date, news_title, news_content, news_url,
                             news_keywords)

        for keyword, ids in keyword_id_dict.items():
            for each_id in ids:
                final_results[keyword].append(news_dict[each_id])

        if save_result:
            with open(save_path, 'wb') as f:
                pickle.dump(final_results, f)

    def pipeline(self):
        self.load_data()
        # self.build_inverse_dict(save_result=True, save_path='data/inverse_dict.pkl')
        self.calculate_idf(save_result=True, save_path='data/idf_dict.pkl')
        self.build_target_dict(save_result=True,
                               save_path_tf='data/tf_dict_per_id.pkl',
                               save_path_inv='data/inverse_dict_target.pkl')
        self.aggregate_tf_dict(save_result=True, save_path='data/tf_dict.pkl')
        self.build_tf_idf_dict(save_result=True,
                               save_path='data/tf_idf_dict.pkl')
        self.build_final_result(save_result=True,
                                save_path='data/final_results_default.pkl',
                                n_results=self.n_results)

    def test_result(self):
        with open('data/final_results_default.pkl', 'rb') as f:
            dat = pickle.load(f)

        for k, v in dat.items():
            print('\n\n---------------------------------------')
            print(k)
            print('\n')
            for i in v:
                print('\t', i[0])
                print('\t', i[1])
                print('\t', i[3])
                print('\t', i[4])
            print('---------------------------------------')

            input()

    def write_csv(self):
        with open('data/final_results_default.pkl', 'rb') as f:
            dat = pickle.load(f)

        with open('data/final_results_default.csv', 'w', newline='',
                  encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            for k, v in dat.items():
                for i in v:
                    writer.writerow([k, i[0], i[1], i[3], i[4]])
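
A minimal driver sketch, assuming main.py is executed directly (the repository's actual entry point may differ):

if __name__ == '__main__':
    gen = TF_IDF_GENERATOR(n_results=100)
    gen.pipeline()   # load the cached inverse_dict, then rebuild the rest
    gen.write_csv()  # dump data/final_results_default.csv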