Example #1
def split_sentences():
	# Available backends: API.KMR, API.EUNJEON, API.ARIRANG, API.RHINO, API.DAON, API.OKT, API.KKMA, API.HNN, API.ETRI.

	splitter = SentenceSplitter(API.HNN)
	paragraph = splitter('분리할 문장을 이렇게 넣으면 문장이 분리됩니다. 간단하죠?')
	#splitter.sentences('분리할 문장을 이렇게 넣으면 문장이 분리됩니다. 간단하죠?')
	#splitter.invoke('분리할 문장을 이렇게 넣으면 문장이 분리됩니다. 간단하죠?')

	print(paragraph[0])
	print(paragraph[1])

	#--------------------
	tagger = Tagger(API.EUNJEON)  # POS tagger.
	tagged_sentence = tagger.tagSentence('무엇인가 품사분석을 수행할 문단')
	paragraph = SentenceSplitter.sentencesTagged(tagged_sentence[0])  # tagged_sentence is a List[Sentence]; each argument is treated as one sentence.
	print(paragraph)
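
Example #1 assumes the KoalaNLP backend has already been initialized and the relevant classes imported. A minimal, illustrative setup, mirroring the imports and initialize() pattern used in Example #4 below, could look like this:

from koalanlp.Util import initialize, finalize
from koalanlp.proc import SentenceSplitter, Tagger
from koalanlp import API

initialize(HNN='LATEST', EUNJEON='LATEST')  # download and attach the Hannanum and Eunjeon backends
split_sentences()                           # run the example above
finalize()                                  # shut the backend down when done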
Example #2
    def process(self, article):
        # remove bylines
        article = re.sub(r'\. *\S+ +\S+ +\w+@(\w+\.)+\w+', '.', article)
        article = re.sub(r'\S+ +\S+ +\w+@(\w+\.)+\w+', '.', article)

        # remove parentheses
        article = re.sub(r'\([^)]+\)', ' ', article)
        article = re.sub(r'\[[^\]]+\]', ' ', article)
        article = re.sub(r'<[^>]+>', ' ', article)
        article = re.sub(r'【[^】]+】', ' ', article)

        # replace hanja with hangul
        article = hanja.translate(article, 'substitution')

        # remove special characters except necessary punctuations
        article = re.sub(r'[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣0-9\%\-\_\.\,\?\!\/\"\'ㆍ·。、“”‘’『』《》〈〉「」\~○×□…\ ]', ' ', article)

        # initialize korean language analyzers
        splitter = SentenceSplitter(API.HNN)
        tagger = Tagger(API.HNN)

        # split text into sentences
        sentences = splitter(article)

        # regularize sentences and split into POS
        article_regularized = ''
        for sent in sentences:
            sent = tagger.tagSentence(sent)
            sent_regularized = []
            for word in sent[0].words:
                sent_regularized.append(' '.join([m.surface for m in word.morphemes]))
            article_regularized += '\n' + ' '.join(sent_regularized)

        # regularize whitespaces
        article_regularized = re.sub(r' +', ' ', article_regularized)
        command = ["java", "edu.stanford.nlp.process.PTBTokenizer", "-preserveLines", "-lowerCase"]

        result = subprocess.check_output(command, input=article_regularized.encode("utf-8"))

        return result.decode("utf-8")
    def __init__(self,
                 embed_model_path=default_embed_model_path,
                 cls_model_path=default_cls_model_path):
        if not os.path.isfile(embed_model_path):
            print('embed_model doesn\'t exist.\ncheck the model path')
        if not os.path.isfile(cls_model_path):
            print('classifier_model doesn\'t exist.\ncheck the model path')

        # Initialize the preprocessing package
        print('Init preprocessing')
        initialize(KMR='2.1.4')  # pinned to 2.1.4 instead of LATEST
        self.tagger = Tagger(API.KMR)

        # Load the embedding model
        print('Loading Embedding model')
        self.embedding_model = Doc2Vec.load(embed_model_path)

        # Load the classifier model
        print('Loading Classifier model')
        with open(cls_model_path, 'rb') as fp:
            self.clf_model = pickle.load(fp)
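
The constructor above loads a gensim Doc2Vec embedding model and a pickled classifier, but the snippet stops before showing how they are used. Purely as an illustration, assuming the classifier follows the scikit-learn predict() convention, a prediction step might look like the following (classify and its token-list argument are hypothetical, not part of the original code):

    def classify(self, tokens):
        # Hypothetical sketch -- embeds a list of morpheme tokens and predicts a label.
        vector = self.embedding_model.infer_vector(tokens)  # gensim Doc2Vec inference
        return self.clf_model.predict([vector])[0]          # scikit-learn style prediction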
Example #4
from koalanlp.Util import initialize, finalize
from koalanlp.proc import SentenceSplitter, Tagger
from koalanlp import API
from sentence_split import run_evaluate

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('datasets', nargs='+')
    parser.add_argument('--write_result')
    parser.add_argument('--write_err')
    parser.add_argument(
        '--backend',
        default='OKT',
        choices=['OKT', 'HNN', 'KMR', 'RHINO', 'EUNJEON', 'ARIRANG', 'KKMA'])
    args = parser.parse_args()

    initialize(**{args.backend: 'LATEST'})
    if args.backend in ('OKT', 'HNN'):
        splitter = SentenceSplitter(getattr(API, args.backend))
    else:
        tagger = Tagger(getattr(API, args.backend))
        splitter = lambda text: [sent.surfaceString() for sent in tagger(text)]

    splitter("foo-bar")  # warm-up

    for dataset in args.datasets:
        run_evaluate(dataset, splitter, args.write_result, args.write_err)

    finalize()
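
run_evaluate is imported from the author's sentence_split module, which is not shown here. The sketch below is only a guess at the interface Example #4 appears to expect (a dataset path, a splitter callable, and optional output paths); the real implementation may differ:

def run_evaluate(dataset, splitter, write_result=None, write_err=None):
    # Hypothetical sketch -- the real function lives in the sentence_split module.
    with open(dataset, encoding='UTF-8') as f:
        gold = [line.strip() for line in f if line.strip()]    # assume one gold sentence per line
    predicted = [s.strip() for s in splitter(' '.join(gold))]  # re-split the concatenated text
    correct = sum(1 for sent in predicted if sent in gold)
    print('{}: {}/{} sentences recovered'.format(dataset, correct, len(gold)))
    if write_result:
        with open(write_result, 'w', encoding='UTF-8') as f:
            f.write('\n'.join(predicted))
    if write_err:
        with open(write_err, 'w', encoding='UTF-8') as f:
            f.write('\n'.join(s for s in predicted if s not in gold))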
from koalanlp.Util import initialize
from koalanlp.proc import Tagger
from koalanlp import API
from nltk.tokenize import word_tokenize
import re

initialize(EUNJEON='LATEST')

tagger = Tagger(API.EUNJEON)

with open('/Users/rjsgm/PycharmProjects/Covefefe_kor/input_folder/a.txt',
          'r',
          encoding='UTF-8') as file_object:
    contents = file_object.read()

filename1 = 'stopwords.txt'
with open(filename1, 'r', encoding='UTF-8') as file_object:
    contents1 = file_object.read()

f_pos = open('/Users/rjsgm/PycharmProjects/Covefefe_kor/output_folder/pos.txt',
             'w',
             encoding='UTF-8')
f_token = open(
    '/Users/rjsgm/PycharmProjects/Covefefe_kor/output_folder/token.txt',
    'w',
    encoding='UTF-8')

stopwords = contents1.split('\n')      # one stopword per line
word_tokens = word_tokenize(contents)  # tokenize the raw text with NLTK

result = ""