from koalanlp.proc import SentenceSplitter, Tagger
from koalanlp import API


def split_sentences():
    # Available backends: API.KMR, API.EUNJEON, API.ARIRANG, API.RHINO, API.DAON, API.OKT, API.KKMA, API.HNN, API.ETRI.
    splitter = SentenceSplitter(splitter_type=API.HNN)
    paragraph = splitter('분리할 문장을 이렇게 넣으면 문장이 분리됩니다. 간단하죠?')
    # Equivalent calls:
    # splitter.sentences('분리할 문장을 이렇게 넣으면 문장이 분리됩니다. 간단하죠?')
    # splitter.invoke('분리할 문장을 이렇게 넣으면 문장이 분리됩니다. 간단하죠?')
    print(paragraph[0])
    print(paragraph[1])

    # --------------------
    tagger = Tagger(API.EUNJEON)  # POS tagger.
    tagged_sentence = tagger.tagSentence('무엇인가 품사분석을 수행할 문단')
    # tagged_sentence is a List[Sentence]: each argument passed to tagSentence is treated as one sentence.
    paragraph = SentenceSplitter.sentencesTagged(tagged_sentence[0])
    print(paragraph)
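# A minimal driver sketch for split_sentences() above: KoalaNLP starts a JVM and
# downloads the backend packages in initialize(), which must run before any
# analyzer is constructed, and finalize() shuts the JVM down afterwards. The
# version strings here are assumptions for illustration, not pinned requirements.
from koalanlp.Util import initialize, finalize

if __name__ == '__main__':
    initialize(HNN='LATEST', EUNJEON='LATEST')  # Hannanum splitter + Eunjeon tagger used above
    split_sentences()
    finalize()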
def process(self, article):
    # Requires: import re, subprocess, hanja
    #           from koalanlp.proc import SentenceSplitter, Tagger
    #           from koalanlp import API

    # remove bylines (reporter name + e-mail address)
    article = re.sub(r'\. *\S+ +\S+ +\w+@(\w+\.)+\w+', '.', article)
    article = re.sub(r'\S+ +\S+ +\w+@(\w+\.)+\w+', '.', article)

    # remove parenthesized and bracketed spans
    article = re.sub(r'\([^)]+\)', ' ', article)
    article = re.sub(r'\[[^\]]+\]', ' ', article)
    article = re.sub(r'<[^>]+>', ' ', article)
    article = re.sub(r'【[^】]+】', ' ', article)

    # replace hanja with hangul (translate returns a new string, so reassign it)
    article = hanja.translate(article, 'substitution')

    # remove special characters except necessary punctuation
    article = re.sub(
        r'[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣0-9\%\-\_\.\,\?\!\/\"\'ㆍ·。、“”‘’『』《》〈〉「」\~○×□…\ ]',
        ' ', article)

    # initialize Korean language analyzers
    splitter = SentenceSplitter(API.HNN)
    tagger = Tagger(API.HNN)

    # split text into sentences
    sentences = splitter(article)

    # regularize sentences and split each word into morphemes
    article_regularized = ''
    for sent in sentences:
        sent = tagger.tagSentence(sent)
        sent_regularized = []
        for word in sent[0].words:
            sent_regularized.append(' '.join([m.surface for m in word.morphemes]))
        article_regularized += '\n' + ' '.join(sent_regularized)

    # regularize whitespace
    article_regularized = re.sub(r' +', ' ', article_regularized)

    # tokenize with Stanford PTBTokenizer (edu.stanford.nlp must be on the Java classpath)
    command = ["java", "edu.stanford.nlp.process.PTBTokenizer", "-preserveLines", "-lowerCase"]
    echo = subprocess.Popen(["echo", article_regularized], stdout=subprocess.PIPE)
    result = subprocess.check_output(command, stdin=echo.stdout)
    echo.wait()
    return result.decode("utf-8")
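# A hedged usage sketch for the process() method above. The enclosing class is not
# shown in this snippet, so ArticlePreprocessor is a hypothetical name; the sketch
# only illustrates that initialize() must run once before process() builds its
# SentenceSplitter/Tagger, and that Stanford's PTBTokenizer must be on the Java
# classpath for the final subprocess call.
from koalanlp.Util import initialize

initialize(HNN='LATEST')          # start the Hannanum backend once per process
prep = ArticlePreprocessor()      # hypothetical class that defines process()
raw_article = '김철수 기자 reporter@example.com 본문 내용 (서울) [사진] 1990年 기사 ...'
print(prep.process(raw_article))  # tokenized, lower-cased text, one sentence per line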
def __init__(self, embed_model_path=default_embed_model_path, cls_model_path=default_cls_model_path):
    # Requires: import os, pickle
    #           from gensim.models.doc2vec import Doc2Vec
    #           from koalanlp.Util import initialize
    #           from koalanlp.proc import Tagger
    #           from koalanlp import API
    if not os.path.isfile(embed_model_path):
        print('embed_model doesn\'t exist.\ncheck the model path')
    if not os.path.isfile(cls_model_path):
        print('classifier_model doesn\'t exist.\ncheck the model path')

    # Initialize the preprocessing package
    print('Init preprocessing')
    initialize(KMR='2.1.4')  # changed from LATEST to 2.1.4
    self.tagger = Tagger(API.KMR)

    # Load the embedding model
    print('Loading Embedding model')
    self.embedding_model = Doc2Vec.load(embed_model_path)

    # Load the classifier model
    print('Loading Classifier model')
    with open(cls_model_path, 'rb') as fp:
        self.clf_model = pickle.load(fp)
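# A hedged sketch of how the three members loaded in __init__ would typically be
# combined at inference time. classify() is a hypothetical method name (the rest of
# the class is not shown); it assumes the Doc2Vec model was trained on morpheme
# surface forms produced by the same KMR tagger and that clf_model follows the
# scikit-learn predict() interface.
def classify(self, text):
    tokens = []
    for sent in self.tagger(text):                        # POS-tag the input with the KMR backend
        for word in sent.words:
            tokens.extend(m.surface for m in word.morphemes)
    vector = self.embedding_model.infer_vector(tokens)    # embed the morpheme sequence
    return self.clf_model.predict([vector])[0]            # predict the document label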
from koalanlp.Util import initialize, finalize
from koalanlp.proc import SentenceSplitter, Tagger
from koalanlp import API

from sentence_split import run_evaluate

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('datasets', nargs='+')
    parser.add_argument('--write_result')
    parser.add_argument('--write_err')
    parser.add_argument(
        '--backend', default='OKT',
        choices=['OKT', 'HNN', 'KMR', 'RHINO', 'EUNJEON', 'ARIRANG', 'KKMA'])
    args = parser.parse_args()

    initialize(**{args.backend: 'LATEST'})

    # OKT and HNN ship a dedicated sentence splitter; the other backends
    # split sentences through their POS tagger instead.
    if args.backend in ('OKT', 'HNN'):
        splitter = SentenceSplitter(getattr(API, args.backend))
    else:
        tagger = Tagger(getattr(API, args.backend))
        splitter = lambda text: [sent.surfaceString() for sent in tagger(text)]

    splitter("foo-bar")  # warm-up

    for dataset in args.datasets:
        run_evaluate(dataset, splitter, args.write_result, args.write_err)

    finalize()
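# Example invocation (hedged: the script name and dataset paths are placeholders,
# not taken from the original repository):
#   python evaluate_split.py sejong.txt wiki.txt --backend HNN --write_result result.tsv --write_err err.tsv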
from koalanlp.Util import initialize
from koalanlp.proc import Tagger
from koalanlp import API
from nltk.tokenize import word_tokenize
import re

initialize(EUNJEON='LATEST')
tagger = Tagger(API.EUNJEON)

with open('/Users/rjsgm/PycharmProjects/Covefefe_kor/input_folder/a.txt', 'r', encoding='UTF-8') as file_object:
    contents = file_object.read()

filename1 = 'stopwords.txt'
with open(filename1, 'r', encoding='UTF-8') as file_object:
    contents1 = file_object.read()

f_pos = open('/Users/rjsgm/PycharmProjects/Covefefe_kor/output_folder/pos.txt', 'w', encoding='UTF-8')
f_token = open(
    '/Users/rjsgm/PycharmProjects/Covefefe_kor/output_folder/token.txt', 'w', encoding='UTF-8')

stopwords = contents1.split('\n')
word_tokens = word_tokenize(contents)
result = ""
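# The snippet above stops right after its setup; a hedged sketch of a plausible
# continuation using the variables it defines (stopword filtering of the NLTK word
# tokens, and a POS dump from the Eunjeon tagger). Illustrative only, not the
# original author's code; the "surface/tag" output format is an assumption.
for w in word_tokens:
    if w not in stopwords:
        result += w + ' '
f_token.write(result.strip())

for sent in tagger(contents):
    for word in sent.words:
        f_pos.write(' '.join('{}/{}'.format(m.surface, m.tag) for m in word.morphemes) + '\n')

f_pos.close()
f_token.close()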