-
Notifications
You must be signed in to change notification settings - Fork 0
/
korean_text_normalizer.py
54 lines (36 loc) · 1.33 KB
/
korean_text_normalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pdb
import re
import datetime as dt
from konlpy.tag import Hannanum
from sklearn.base import BaseEstimator, TransformerMixin
from es_corpus_reader import EsCorpusReader
class KoreanTextNormalizer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that turns Korean text into token lists.

    Every document is cleaned with a character allow-list, tagged with the
    Hannanum analyzer, and reduced to the surface forms of the morphemes
    whose tag is neither ``'E'`` nor ``'J'``.
    """

    def __init__(self) -> None:
        super().__init__()
        # One analyzer instance is created here and reused by transform().
        self.hannanum = Hannanum()

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, docs):
        """Tokenize each document in ``docs``.

        Args:
            docs: Iterable of strings to normalize.

        Returns:
            A list with one entry per document, each entry being the list of
            morpheme strings kept for that document.
        """
        # Anything that is not a word character, a Hangul jamo consonant
        # (ㄱ-ㅎ), a Hangul syllable (가-힣), or one of 美 中 & % becomes a space.
        disallowed = re.compile(r'[^\wㄱ-ㅎ가-힣美中&%]')
        # Per the original author's note: 'J' marks particles and 'E' marks
        # endings, both of which are discarded.
        dropped_tags = ('E', 'J')

        normalized = []
        for text in docs:
            cleaned = disallowed.sub(' ', text)
            # pos() is used rather than morphs(): the original author noted
            # that with morphs() particles later got extracted as keywords.
            tagged = self.hannanum.pos(cleaned)
            normalized.append(
                [surface for surface, tag in tagged if tag not in dropped_tags]
            )
        return normalized
if __name__ == "__main__":
    # Manual smoke run: read ten titles from the corpus reader, normalize
    # them, and print the resulting token lists.
    # NOTE(review): the date arguments presumably bound which documents the
    # reader returns -- confirm against EsCorpusReader.
    window_start = dt.datetime(2021, 5, 10)
    window_end = dt.datetime(2021, 5, 11)
    reader = EsCorpusReader(date_from=window_start, date_to=window_end)

    print('Loop #1')
    corpus = list(reader.titles(n=10))

    print('Loop #2')
    normalizer = KoreanTextNormalizer()
    normalized = normalizer.fit_transform(corpus)
    for idx, tokens in enumerate(normalized):
        print(f'{idx} : {tokens}')

    print('hello world')