Example #1
    def preprocess(self, corpus, language='ko'):
        pipeline = None

        if language == 'ko':
            mecab_path = 'C:\\mecab\\mecab-ko-dic'
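            # Korean pipeline: sentence splitting (NLTK) -> MeCab morphological analysis
            # -> keep NN* nouns -> drop POS tags -> unigrams and bigrams -> stopword removal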
            pipeline = ptm.Pipeline(
                ptm.splitter.NLTK(), ptm.tokenizer.MeCab(mecab_path),
                ptm.helper.POSFilter('NN*'), ptm.helper.SelectWordOnly(),
                ptm.ngram.NGramTokenizer(1, 2),
                ptm.helper.StopwordFilter(
                    file='../stopwords/stopwordsKor.txt'))
        elif language == 'en':
            pipeline = ptm.Pipeline(
                ptm.splitter.NLTK(), ptm.tokenizer.WordPos(),
                ptm.helper.POSFilter('NN*'), ptm.helper.SelectWordOnly(),
                ptm.ngram.NGramTokenizer(1, 2),
                ptm.helper.StopwordFilter(
                    file='../stopwords/stopwordsEng.txt'))
        result = pipeline.processCorpus(corpus)
        print('==  ==')

        documents = []
        for doc in result:
            document = ''
            for sent in doc:
                document += " ".join(sent)
            documents.append(document)

        return documents
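
# A minimal, self-contained sketch (not part of the original example) of how the
# space-joined documents returned by preprocess() can be vectorized, mirroring what
# Example #2 does explicitly. The two short Korean strings stand in for real output.
from sklearn.feature_extraction.text import CountVectorizer

sample_documents = ['대통령 연설 경제 정책', '경제 성장 정책 일자리']
X = CountVectorizer().fit_transform(sample_documents)
print(X.shape)  # (2, 6): 2 documents, 6 distinct terms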
Example #2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import treform as ptm


def vectorizeCaseTwo():
    corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt', 2)

    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(), ptm.tokenizer.Komoran(),
        ptm.helper.POSFilter('NN*'), ptm.helper.SelectWordOnly(),
        ptm.ngram.NGramTokenizer(2, 2),
        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
    result = pipeline.processCorpus(corpus)
    print('== Morphological analysis + noun extraction + words only + frequency analysis ==')
    print(result)
    print()

    print('==  ==')

    documents = []
    for doc in result:
        document = ''
        for sent in doc:
            document += " ".join(sent)
        documents.append(document)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(documents)
    print(vectorizer.get_feature_names_out())  # get_feature_names() in scikit-learn < 1.0
    print(X.shape)

    print(X.toarray())

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(documents)
    print(vectorizer.get_feature_names_out())  # get_feature_names() in scikit-learn < 1.0
    print(len(vectorizer.get_feature_names_out()))
    print(X.toarray())
Example #3
    '정 전 비서관을 "정 과장님"으로 부르며 반말을 하는 남자인데요. 최순실 씨처럼 정 전 비서관을 하대하고 있습니다. 또 청와대 내부 정보를 알고 있는 듯하고 또 인사에까지 개입하려고 하고 있습니다. 그렇기 때문에 정윤회 씨로 추정은 됩니다만 확인은 되지 않습니다.'
]

keyword = keyword_extractor(sents)
for word, r in sorted(keyword.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2)

# import nltk
# nltk.download()
# Now that the text has been split into words, we can remove stopwords. Let's use ptm.helper.StopwordFilter to drop the unnecessary words.
# Then let's add ptm.stemmer.Porter() at the end of the pipeline to extract word stems.
# Try changing the code to use ptm.stemmer.Lancaster() too; comparing how the Lancaster stemmer differs from the Porter stemmer is instructive (a quick side-by-side sketch follows the output below).
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), ptm.helper.POSFilter('NN*'),
    ptm.helper.SelectWordOnly(),
    ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
result = pipeline.processCorpus(corpus)
print(result)
print()
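
# Quick side-by-side of the Porter and Lancaster stemmers mentioned in the comments
# above (plain NLTK, independent of the Korean pipeline in this example):
from nltk.stem import PorterStemmer, LancasterStemmer

for w in ['running', 'maximum', 'happiness']:
    print(w, PorterStemmer().stem(w), LancasterStemmer().stem(w))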

documents = []
for doc in result:
    document = ''
    for sent in doc:
        document = " ".join(sent)
    documents.append(document)

keyword_extractor1 = ptm.keyword.KeywordExtractionKorean(
    min_count, max_length, beta, max_iter, verbose, num_words)
keyword1 = keyword_extractor1(documents)
Example #4
# Morphological analysis can be done with ptm.tokenizer.Komoran or ptm.tokenizer.TwitterKorean.
# After the analysis, let's keep only the nouns whose POS tag starts with NN and print just the words.

#import nltk
#nltk.download('punkt')

#pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(),
#                        ptm.helper.POSFilter('NN*'),
#                        ptm.helper.SelectWordOnly(),
#                        ptm.ngram.NGramTokenizer(3),
#                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')
#                        )

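# The active pipeline swaps POS tagging for a CRF-based Korean word segmentation model
# (crfsuite), followed by n-gram tokenization and stopword removal.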
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(),
    ptm.segmentation.SegmentationKorean(
        '../model/korean_segmentation_model.crfsuite'),
    ptm.ngram.NGramTokenizer(3),
    ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))

result = pipeline.processCorpus(corpus)

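# Write the processed corpus out as one line per sentence, tokens separated by tabs.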
with io.open("../demofile.csv", 'w', encoding='utf8') as f:
    for doc in result:
        for sent in doc:
            f.write('\t'.join(sent) + "\n")

print('== Sentence splitting + morphological analysis + noun extraction + words only + phrase extraction ==')
print(result)
print()

Example #5

import nltk

class EnglishDictionarySentimentAnalyzer:
    def __init__(self):
        self.name = 'EnglishDictionarySentimentAnalyzer'

    def createDictionary(self):
        nltk.download('sentiwordnet')
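        # Assumption beyond the original excerpt: SentiWordNet lookups in NLTK also
        # need the base WordNet corpus, so fetch it as well.
        nltk.download('wordnet')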


if __name__ == '__main__':

    corpus = ptm.CorpusFromFile('../data/sampleEng.txt')
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            #ptm.tokenizer.Word(), #ptm.tokenizer.WordPos()
                            #ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'),
                            #ptm.tagger.NLTK(),
                            ptm.tokenizer.WordPos(),
                            ptm.lemmatizer.WordNet())

    result = pipeline.processCorpus(corpus)

    EnglishDictionarySentimentAnalyzer().createDictionary()

    for doc in result:
        grand_score = 0.0
        count = 0
        for sent in doc:
            for _str in sent:
                word = _str[0]  # surface form
                tag = _str[1]   # POS tag
                pos = ''
Example #6
    corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file,
                                                               delimiter=',',
                                                               doc_index=4,
                                                               class_index=1,
                                                               title_index=3)
    tups = corpus.pair_map
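    # pair_map maps each document id to its class label (cf. id_to_category further below).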
    class_list = []
    for id in tups:
        # print(tups[id])
        class_list.append(tups[id])

    mecab_path = 'C:\\mecab\\mecab-ko-dic'
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.MeCab(mecab_path),
                            ptm.helper.POSFilter('NN*'),
                            ptm.helper.SelectWordOnly(),
                            ptm.ngram.NGramTokenizer(1, 2),
                            ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt')
                            )
    result = pipeline.processCorpus(corpus)
    print('==  ==')

    with open("./model/id_to_category.json") as handle:
        id_to_category = json.loads(handle.read())

    # 0 - economy 1 - IT 2 - politics
    category = []
    documents = []
    idx = 0
    for doc in result:
        document = ''
Example #7
        dump(dataset, open(filename, 'wb'))
        print('Saved: %s' % filename)

    def load_dataset(self, filename):
        # load the model from disk
        loaded_model = pickle.load(open(filename, 'rb'))
        return loaded_model


if __name__ == '__main__':

    _negative_docs = ptm.CorpusFromDirectory('txt_sentoken/neg', True)
    _positive_docs = ptm.CorpusFromDirectory('txt_sentoken/pos', True)

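    # English pipeline: sentence splitting -> word tokenization -> stopword removal -> Porter stemming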
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(), ptm.tokenizer.Word(),
        ptm.helper.StopwordFilter(file='../stopwordsEng.txt'),
        ptm.stemmer.Porter())
    _neg_result = pipeline.processCorpus(_negative_docs)
    _pos_result = pipeline.processCorpus(_positive_docs)
    print(
        '== Splitting Sentence + Tokenizing + Stopwords Removal + Stemming : Porter =='
    )
    print(_neg_result)
    print()

    negative_docs = list()
    for doc in _neg_result:
        new_doc = []
        for sent in doc:
            for _str in sent:
                if len(_str) > 0:
Example #8
import nltk
import treform as ptm
from nltk.draw.tree import draw_trees
from nltk import tree, treetransforms
from copy import deepcopy

pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.Komoran(),
                        ptm.syntactic_parser.BeneparSyntacticParser()
                        )
corpus = ptm.CorpusByDataFrame('../sample_data/parser_sample.txt', '\t', 0, header=False)
#corpus = ptm.CorpusFromFieldDelimitedFile('../sample_data/parser_sample.txt', 0)
print(corpus.docs)

trees = pipeline.processCorpus(corpus)

for parsed in trees:  # avoid shadowing the nltk 'tree' module imported above
    print(parsed[0])
    t = nltk.Tree.fromstring(parsed[0])
    draw_trees(t)
Example #9
import treform as ptm

pipeline = None
corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt', 2)
mecab_path = 'C:\\mecab\\mecab-ko-dic'
mode = 'korean_lemmatizer'
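# Two pipeline variants: by default the Sejong POS lemmatizer is applied before selecting
# words; the non-lemmatizer variant keeps raw MeCab tokens and adds unigrams/bigrams.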
if mode != 'korean_lemmatizer':
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.MeCab(mecab_path),
        #ptm.tokenizer.Komoran(),
        ptm.helper.SelectWordOnly(),
        ptm.ngram.NGramTokenizer(1, 2, concat=' '),
        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
else:
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.MeCab(mecab_path),
        #ptm.tokenizer.Komoran(),
        ptm.lemmatizer.SejongPOSLemmatizer(),
        ptm.helper.SelectWordOnly(),
        # ptm.ngram.NGramTokenizer(1, 2, concat=' '))
        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

documents = [
    '오늘은 비가와서 그런지 매우 우울하다', '시험이 끝나야 놀지 스트레스 받아ㅠㅠ', '행복한 하루의 끝이라 아름답고 좋네!',
    '더운날에는 아이스 커피가 최고지~~!'
]

#result = pipeline.processCorpus(corpus)
result = pipeline.processCorpus(documents)
Example #10
    ]
    return ({key: fdist[key] for key in word_only_keys})


print(str(platform.system()).lower())
if str(platform.system()).lower().startswith('win'):
    # Font path on Windows
    font_path = 'C:/Windows/Fonts/malgun.ttf'
elif str(platform.system()).lower().startswith('mac'):
    #for Mac
    font_path = '/Library/Fonts/AppleGothic.ttf'

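# Korean counting pipeline: sentence splitting -> Komoran analysis -> keep NN* nouns
# -> drop POS tags -> stopword removal -> word frequency counting.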
pipeline = ptm.Pipeline(
    ptm.splitter.KoSentSplitter(),
    ptm.tokenizer.Komoran(),
    #ptm.tokenizer.WordPos(),
    ptm.helper.POSFilter('NN*'),
    ptm.helper.SelectWordOnly(),
    ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'),
    ptm.counter.WordCounter())

corpus = ptm.CorpusFromFile('../data/sampleKor.txt')

result = pipeline.processCorpus(corpus)

print(result)
print()

doc_collection = ''
term_counts = {}
for doc in result:
    for sent in doc:
Example #11
import treform as ptm

corpus = ptm.CorpusFromFile('../sample_data/sampleEng.txt')

#pipeline example 1
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(), ptm.tokenizer.Word(),
    ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'),
    ptm.stemmer.Porter())
result = pipeline.processCorpus(corpus)
print(
    '== Splitting Sentence + Tokenizing + Stopwords Removal + Stemming : Porter =='
)
print(result)
print()

#pipeline example 2
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(), ptm.tokenizer.Word(),
    ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'),
    ptm.stemmer.Lancaster())
result = pipeline.processCorpus(corpus)
print(
    '== Splitting Sentence + Tokenizing + Stopwords Removal + Stemming : Lancaster =='
)
print(result)
print()

#pipeline example 3
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(), ptm.tokenizer.Word(),
Example #12
    def preprocessing(self, mode, path, stopword_file, files, is_directory=False, doc_index=-1, max=-1):
        util = ptm.Utility()
        # mode is one of: 'unfiltered', 'filtered', 'jamo_split_unfiltered', 'jamo_split_filtered', or 'simple'
        corpus = []
        if mode == 'unfiltered':
            # path = '/usr/local/lib/mecab/dic/mecab-ko-dic'
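            # 'unfiltered': full Korean pipeline - sentence splitting, MeCab analysis,
            # Sejong POS lemmatization, word selection, stopword removal.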
            pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                                    ptm.tokenizer.MeCab(path),
                                    ptm.lemmatizer.SejongPOSLemmatizer(),
                                    ptm.helper.SelectWordOnly(),
                                    ptm.helper.StopwordFilter(file=stopword_file))

            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                    corpus = ptm.Corpus(docs)

            if type(corpus) != list and len(corpus.docs) > 0 or type(corpus) == list and len(corpus) > 0:
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    document = []
                    for sent in doc:
                        for word in sent:
                            document.append(word)
                    self.documents.append(document)

        elif mode == 'filtered':
            pipeline = ptm.Pipeline(ptm.tokenizer.Word())
            # corpus = ptm.CorpusFromFile('/Data/ko_sns_comments/naver_comments15_16_filtered.txt')
            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break
                    corpus = ptm.Corpus(docs)

            self.documents = pipeline.processCorpus(corpus)

        elif mode == 'jamo_split_unfiltered':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                                    ptm.tokenizer.MeCab(path),
                                    ptm.lemmatizer.SejongPOSLemmatizer(),
                                    ptm.helper.SelectWordOnly(),
                                    ptm.helper.StopwordFilter(file=stopword_file))

            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                    corpus = ptm.Corpus(docs)

            if type(corpus) != list and len(corpus.docs) > 0 or type(corpus) == list and len(corpus) > 0:
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    for sent in doc:
                        _sent = ''
                        for word in sent:
                            _sent += word + ' '
                        _sent = _sent.strip()
                        _sent = util.jamo_sentence(_sent)
                        toks = _sent.split()
                        if len(toks) > 10:
                            self.documents.append(toks)

        elif mode == 'jamo_split_filtered':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            pipeline = ptm.Pipeline(ptm.tokenizer.Word())
            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                    corpus = ptm.Corpus(docs)

            if type(corpus) != list and len(corpus.docs) > 0 or type(corpus) == list and len(corpus) > 0:
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    _sent = ''
                    for word in doc:
                        _sent += word + ' '
                    _sent = _sent.strip()
                    _sent = util.jamo_sentence(_sent)
                    toks = _sent.split()
                    if len(toks) > 10:
                        self.documents.append(toks)

        elif mode == 'simple':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            count = 0
            for line in open(files[0], encoding='utf-8'):
                if doc_index != -1:
                    line = line.split()[doc_index]
                toks = line.split()
                if len(toks) > 10:
                    self.documents.append(toks)
                    count += 1

                if count % 10000 == 0:
                    print('processing... ' + str(count))

        print('Document size for the total dataset: ' + str(len(self.documents)))
Example #13
    return _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs, labels


if language == 'en':
    _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs = \
        read_english_corpus()
elif language == 'ko':
    _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs, labels = \
        read_korean_corpus()
if language == 'ko':
    mecab_path = 'C:\\mecab\\mecab-ko-dic'
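    # Korean pipeline: keep nouns (NN*), verb/adjective stems (V*) and interjections (IC*),
    # then build unigrams/bigrams and remove stopwords.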
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.Komoran(),
        #ptm.tokenizer.MeCab(mecab_path),
        ptm.helper.POSFilter('NN*|V*|IC*'),
        ptm.helper.SelectWordOnly(),
        ptm.ngram.NGramTokenizer(1, 2),
        ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))
elif language == 'en':
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.WordPos(),
        ptm.helper.POSFilter('NN*|A*|V*|J*'),
        ptm.helper.SelectWordOnly(),
        #ptm.ngram.NGramTokenizer(1, 2),
        ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'))


def make_documents(result):
    docs = []