def preprocess(self, corpus, language='ko'):
    # Build a language-specific pipeline: sentence splitting -> tokenization ->
    # noun filtering (NN*) -> word-form extraction -> uni/bi-grams -> stopword removal.
    pipeline = None
    if language == 'ko':
        mecab_path = 'C:\\mecab\\mecab-ko-dic'
        pipeline = ptm.Pipeline(
            ptm.splitter.NLTK(),
            ptm.tokenizer.MeCab(mecab_path),
            ptm.helper.POSFilter('NN*'),
            ptm.helper.SelectWordOnly(),
            ptm.ngram.NGramTokenizer(1, 2),
            ptm.helper.StopwordFilter(
                file='../stopwords/stopwordsKor.txt'))
    elif language == 'en':
        pipeline = ptm.Pipeline(
            ptm.splitter.NLTK(),
            ptm.tokenizer.WordPos(),
            ptm.helper.POSFilter('NN*'),
            ptm.helper.SelectWordOnly(),
            ptm.ngram.NGramTokenizer(1, 2),
            ptm.helper.StopwordFilter(
                file='../stopwords/stopwordsEng.txt'))

    result = pipeline.processCorpus(corpus)
    print('== ==')

    # Flatten each document (a list of token lists per sentence) into a single string.
    documents = []
    for doc in result:
        document = ''
        for sent in doc:
            document += " ".join(sent) + ' '  # keep a space between sentences
        documents.append(document.strip())

    return documents
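# --- Usage sketch (added for illustration, not part of the original script) ---
# Assumes the method above lives on a wrapper class, hypothetically named
# DocumentPreprocessor here, and that treform is importable as ptm.
import treform as ptm

preprocessor = DocumentPreprocessor()  # hypothetical class holding preprocess()
corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt', 2)
documents = preprocessor.preprocess(corpus, language='ko')
# Each entry is a whitespace-joined string of filtered tokens and n-grams.
print(documents[:3])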
def vectorizeCaseTwo():
    corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt', 2)
    # Komoran morphological analysis -> nouns only (NN*) -> word forms only ->
    # bigrams -> stopword removal.
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.Komoran(),
        ptm.helper.POSFilter('NN*'),
        ptm.helper.SelectWordOnly(),
        ptm.ngram.NGramTokenizer(2, 2),
        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
    result = pipeline.processCorpus(corpus)
    print('== Morphological analysis + extract nouns only + show words only + frequency analysis ==')
    print(result)
    print()
    print('== ==')

    documents = []
    for doc in result:
        document = ''
        for sent in doc:
            document += " ".join(sent) + ' '  # keep a space between sentences
        documents.append(document.strip())

    # Raw term counts.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(documents)
    print(vectorizer.get_feature_names())  # use get_feature_names_out() on scikit-learn >= 1.2
    print(X.shape)
    print(X.toarray())

    # TF-IDF weights.
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(documents)
    print(vectorizer.get_feature_names())
    print(len(vectorizer.get_feature_names()))
    print(X.toarray())
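# --- Illustration (added, not in the original): list the highest-weighted TF-IDF
# terms per document. A hedged helper that could be called with the fitted
# TfidfVectorizer and matrix from vectorizeCaseTwo(); assumes scikit-learn >= 1.0
# for get_feature_names_out().
import numpy as np

def top_tfidf_terms(vectorizer, X, k=5):
    feature_names = np.array(vectorizer.get_feature_names_out())
    for i, row in enumerate(X.toarray()):
        top = row.argsort()[::-1][:k]
        terms = [(feature_names[j], round(float(row[j]), 4)) for j in top if row[j] > 0]
        print('doc %d: %s' % (i, terms))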
    '정 전 비서관을 "정 과장님"으로 부르며 반말을 하는 남자인데요. 최순실 씨처럼 정 전 비서관을 하대하고 있습니다. 또 청와대 내부 정보를 알고 있는 듯하고 또 인사에까지 개입하려고 하고 있습니다. 그렇기 때문에 정윤회 씨로 추정은 됩니다만 확인은 되지 않습니다.'
]

keyword = keyword_extractor(sents)
for word, r in sorted(keyword.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2)

# import nltk
# nltk.download()

# Now that the text has been split into words, stopwords can be removed;
# ptm.helper.StopwordFilter drops the unneeded words.
# Appending ptm.stemmer.Porter() to the end of the pipeline would add stemming.
# Try switching to ptm.stemmer.Lancaster() as well and compare how the
# Lancaster stemmer differs from the Porter stemmer.
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(),
    ptm.tokenizer.Komoran(),
    ptm.helper.POSFilter('NN*'),
    ptm.helper.SelectWordOnly(),
    ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
result = pipeline.processCorpus(corpus)
print(result)
print()

documents = []
for doc in result:
    document = ''
    for sent in doc:
        document += " ".join(sent) + ' '  # accumulate sentences (the original overwrote `document`)
    documents.append(document.strip())

keyword_extractor1 = ptm.keyword.KeywordExtractionKorean(
    min_count, max_length, beta, max_iter, verbose, num_words)
keyword1 = keyword_extractor1(documents)
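# --- Illustration (added): print the top-scoring keywords from keyword1,
# assuming it is a word-to-score mapping like `keyword` above.
for word, score in sorted(keyword1.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, score))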
# Morphological analysis can be done with ptm.tokenizer.Komoran or ptm.tokenizer.TwitterKorean.
# After the analysis, extract only the nouns whose POS tags start with NN and print just the word forms.

#import nltk
#nltk.download('punkt')

#pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(),
#                        ptm.helper.POSFilter('NN*'),
#                        ptm.helper.SelectWordOnly(),
#                        ptm.ngram.NGramTokenizer(3),
#                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')
#                        )

# CRF-based Korean word segmentation followed by trigram extraction and stopword removal.
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(),
    ptm.segmentation.SegmentationKorean(
        '../model/korean_segmentation_model.crfsuite'),
    ptm.ngram.NGramTokenizer(3),
    ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))
result = pipeline.processCorpus(corpus)

# Write one tab-separated line per sentence.
with io.open("../demofile.csv", 'w', encoding='utf8') as f:
    for doc in result:
        for sent in doc:
            f.write('\t'.join(sent) + "\n")

print('== Sentence splitting + morphological analysis + extract nouns only + show words only + phrase extraction ==')
print(result)
print()
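# --- Illustration (added): a quick frequency count over the pipeline output,
# using only the standard library. `result` is a list of documents, each a list
# of sentences, each a list of tokens/n-grams.
from collections import Counter

token_counts = Counter(token for doc in result for sent in doc for token in sent)
for token, count in token_counts.most_common(20):
    print('%s\t%d' % (token, count))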
import nltk
import treform as ptm


class EnglishDictionarySentimentAnalyzer:
    def __init__(self):
        self.name = 'EnglishDictionarySentimentAnalyzer'

    def createDictionary(self):
        # Fetch the SentiWordNet lexicon used for dictionary-based scoring.
        nltk.download('sentiwordnet')


if __name__ == '__main__':
    corpus = ptm.CorpusFromFile('../data/sampleEng.txt')
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            #ptm.tokenizer.Word(),
                            #ptm.tokenizer.WordPos()
                            #ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'),
                            #ptm.tagger.NLTK(),
                            ptm.tokenizer.WordPos(),
                            ptm.lemmatizer.WordNet())

    result = pipeline.processCorpus(corpus)
    EnglishDictionarySentimentAnalyzer().createDictionary()

    for doc in result:
        grand_score = 0.0
        count = 0
        for sent in doc:
            for _str in sent:
                word = _str[0]  # token text
                tag = _str[1]   # POS tag
                pos = ''
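# --- Illustration (added): the loop above is cut off here, but the core idea of
# dictionary-based scoring with SentiWordNet can be sketched with NLTK directly.
# This is a hedged stand-in, not the original author's scoring code; it also
# needs nltk.download('wordnet') in addition to 'sentiwordnet'.
from nltk.corpus import sentiwordnet as swn

def sentiword_score(word, wn_pos):
    # wn_pos is a WordNet POS letter: 'n', 'v', 'a', or 'r'.
    synsets = list(swn.senti_synsets(word, wn_pos))
    if not synsets:
        return 0.0
    # Average of (positivity - negativity) over all matching synsets.
    return sum(s.pos_score() - s.neg_score() for s in synsets) / len(synsets)

print(sentiword_score('good', 'a'))      # clearly positive
print(sentiword_score('terrible', 'a'))  # clearly negative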
corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file,
                                                           delimiter=',',
                                                           doc_index=4,
                                                           class_index=1,
                                                           title_index=3)
tups = corpus.pair_map
class_list = []
for id in tups:
    # print(tups[id])
    class_list.append(tups[id])

mecab_path = 'C:\\mecab\\mecab-ko-dic'
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.MeCab(mecab_path),
                        ptm.helper.POSFilter('NN*'),
                        ptm.helper.SelectWordOnly(),
                        ptm.ngram.NGramTokenizer(1, 2),
                        ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt')
                        )

result = pipeline.processCorpus(corpus)
print('== ==')

with open("./model/id_to_category.json") as handle:
    id_to_category = json.loads(handle.read())

# 0 - economy, 1 - IT, 2 - politics
category = []
documents = []
idx = 0
for doc in result:
    document = ''
        dump(dataset, open(filename, 'wb'))
        print('Saved: %s' % filename)

    def load_dataset(self, filename):
        # Load the pickled dataset from disk.
        loaded_model = pickle.load(open(filename, 'rb'))
        return loaded_model


if __name__ == '__main__':
    _negative_docs = ptm.CorpusFromDirectory('txt_sentoken/neg', True)
    _positive_docs = ptm.CorpusFromDirectory('txt_sentoken/pos', True)

    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.Word(),
        ptm.helper.StopwordFilter(file='../stopwordsEng.txt'),
        ptm.stemmer.Porter())

    _neg_result = pipeline.processCorpus(_negative_docs)
    _pos_result = pipeline.processCorpus(_positive_docs)
    print('== Splitting Sentence + Tokenizing + Stopwords Removal + Stemming : Porter ==')
    print(_neg_result)
    print()

    negative_docs = list()
    for doc in _neg_result:
        new_doc = []
        for sent in doc:
            for _str in sent:
                if len(_str) > 0:
import nltk
import treform as ptm
from nltk.draw.tree import draw_trees
from nltk import tree, treetransforms
from copy import deepcopy

pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.Komoran(),
                        ptm.syntactic_parser.BeneparSyntacticParser()
                        )
corpus = ptm.CorpusByDataFrame('../sample_data/parser_sample.txt', '\t', 0, header=False)
#corpus = ptm.CorpusFromFieldDelimitedFile('../sample_data/parser_sample.txt', 0)
print(corpus.docs)

trees = pipeline.processCorpus(corpus)
# Use `parsed` as the loop variable so the imported nltk `tree` module is not shadowed.
for parsed in trees:
    print(parsed[0])
    t = nltk.Tree.fromstring(parsed[0])
    draw_trees(t)
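# --- Illustration (added): on a headless machine the Tk window opened by
# draw_trees() is unavailable; NLTK can render the same trees as ASCII art instead.
for parsed in trees:
    t = nltk.Tree.fromstring(parsed[0])
    t.pretty_print()  # text-only rendering on stdout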
import treform as ptm

pipeline = None
corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt', 2)
mecab_path = 'C:\\mecab\\mecab-ko-dic'
mode = 'korean_lemmatizer'

# Use != for string comparison; `is not` checks object identity, not equality.
if mode != 'korean_lemmatizer':
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.MeCab(mecab_path),
        #ptm.tokenizer.Komoran(),
        ptm.helper.SelectWordOnly(),
        ptm.ngram.NGramTokenizer(1, 2, concat=' '),
        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
else:
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.MeCab(mecab_path),
        #ptm.tokenizer.Komoran(),
        ptm.lemmatizer.SejongPOSLemmatizer(),
        ptm.helper.SelectWordOnly(),
        # ptm.ngram.NGramTokenizer(1, 2, concat=' '))
        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

documents = [
    '오늘은 비가와서 그런지 매우 우울하다',
    '시험이 끝나야 놀지 스트레스 받아ㅠㅠ',
    '행복한 하루의 끝이라 아름답고 좋네!',
    '더운날에는 아이스 커피가 최고지~~!'
]

#result = pipeline.processCorpus(corpus)
result = pipeline.processCorpus(documents)
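# --- Illustration (added): pair each raw sentence with its processed tokens,
# assuming processCorpus() keeps the input order and returns one token list per
# sentence of each document.
for raw, processed in zip(documents, result):
    tokens = [token for sent in processed for token in sent]
    print(raw, '->', tokens)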
    ]
    return ({key: fdist[key] for key in word_only_keys})


print(str(platform.system()).lower())
if str(platform.system()).lower().startswith('win'):
    # Font path on Windows.
    font_path = 'C:/Windows/Fonts/malgun.ttf'
elif str(platform.system()).lower().startswith('mac'):
    # For Mac.
    font_path = '/Library/Fonts/AppleGothic.ttf'

pipeline = ptm.Pipeline(
    ptm.splitter.KoSentSplitter(),
    ptm.tokenizer.Komoran(),
    #ptm.tokenizer.WordPos(),
    ptm.helper.POSFilter('NN*'),
    ptm.helper.SelectWordOnly(),
    ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'),
    ptm.counter.WordCounter())

corpus = ptm.CorpusFromFile('../data/sampleKor.txt')
result = pipeline.processCorpus(corpus)

print(result)
print()

doc_collection = ''
term_counts = {}
for doc in result:
    for sent in doc:
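# --- Illustration (added): the counting loop above is cut off, but once
# term_counts holds {word: frequency} pairs, a word cloud can be drawn with the
# separately installed `wordcloud` package, reusing the platform-specific font_path.
from wordcloud import WordCloud

wc = WordCloud(font_path=font_path, background_color='white',
               width=800, height=600)
wc.generate_from_frequencies(term_counts)
wc.to_file('wordcloud.png')  # or wc.to_image().show()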
import treform as ptm

corpus = ptm.CorpusFromFile('../sample_data/sampleEng.txt')

#pipeline example 1
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(),
    ptm.tokenizer.Word(),
    ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'),
    ptm.stemmer.Porter())
result = pipeline.processCorpus(corpus)
print('== Splitting Sentence + Tokenizing + Stopwords Removal + Stemming : Porter ==')
print(result)
print()

#pipeline example 2
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(),
    ptm.tokenizer.Word(),
    ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'),
    ptm.stemmer.Lancaster())
result = pipeline.processCorpus(corpus)
print('== Splitting Sentence + Tokenizing + Stopwords Removal + Stemming : Lancaster ==')
print(result)
print()

#pipeline example 3
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(),
    ptm.tokenizer.Word(),
def preprocessing(self, mode, path, stopword_file, files, is_directory=False, doc_index=-1, max=-1):
    util = ptm.Utility()
    # mode is one of: unfiltered, filtered, jamo_split_unfiltered, jamo_split_filtered, simple
    corpus = []
    if mode == 'unfiltered':
        # path = '/usr/local/lib/mecab/dic/mecab-ko-dic'
        pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                                ptm.tokenizer.MeCab(path),
                                ptm.lemmatizer.SejongPOSLemmatizer(),
                                ptm.helper.SelectWordOnly(),
                                ptm.helper.StopwordFilter(file=stopword_file))

        for a_file in files:
            if is_directory == True and max == -1:
                corpus += ptm.CorpusFromDirectory(a_file).docs
            elif is_directory == False and doc_index != -1 and max == -1:
                corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
            elif is_directory == False and doc_index == -1 and max == -1:
                corpus += ptm.CorpusFromFile(a_file).docs
            elif is_directory == False and max > 0:
                count = 0
                docs = []
                for line in open(a_file):
                    if doc_index != -1:
                        line = line.split()[doc_index]
                    if len(line) < 1:
                        continue
                    toks = line.split()
                    if len(toks) > 10:
                        docs.append(line)
                        count += 1
                    if count % 10000 == 0:
                        print('processing... ' + str(count))
                    if max < count:
                        break
                corpus = ptm.Corpus(docs)

        if (type(corpus) != list and len(corpus.docs) > 0) or (type(corpus) == list and len(corpus) > 0):
            result = pipeline.processCorpus(corpus)
            for doc in result:
                document = []
                for sent in doc:
                    for word in sent:
                        document.append(word)
                self.documents.append(document)

    elif mode == 'filtered':
        pipeline = ptm.Pipeline(ptm.tokenizer.Word())
        # corpus = ptm.CorpusFromFile('/Data/ko_sns_comments/naver_comments15_16_filtered.txt')
        for a_file in files:
            if is_directory == True and max == -1:
                corpus += ptm.CorpusFromDirectory(a_file).docs
            elif is_directory == False and doc_index != -1 and max == -1:
                corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
            elif is_directory == False and doc_index == -1 and max == -1:
                corpus += ptm.CorpusFromFile(a_file).docs
            elif is_directory == False and max > 0:
                count = 0
                docs = []
                for line in open(a_file):
                    if doc_index != -1:
                        line = line.split()[doc_index]
                    if len(line) < 1:
                        continue
                    toks = line.split()
                    if len(toks) > 10:
                        docs.append(line)
                        count += 1
                    if count % 10000 == 0:
                        print('processing... ' + str(count))
                    if max < count:
                        break
                corpus = ptm.Corpus(docs)

        self.documents = pipeline.processCorpus(corpus)

    elif mode == 'jamo_split_unfiltered':
        # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
        pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                                ptm.tokenizer.MeCab(path),
                                ptm.lemmatizer.SejongPOSLemmatizer(),
                                ptm.helper.SelectWordOnly(),
                                ptm.helper.StopwordFilter(file=stopword_file))
        for a_file in files:
            if is_directory == True and max == -1:
                corpus += ptm.CorpusFromDirectory(a_file).docs
            elif is_directory == False and doc_index != -1 and max == -1:
                corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
            elif is_directory == False and doc_index == -1 and max == -1:
                corpus += ptm.CorpusFromFile(a_file).docs
            elif is_directory == False and max > 0:
                count = 0
                docs = []
                for line in open(a_file):
                    if doc_index != -1:
                        line = line.split()[doc_index]
                    if len(line) < 1:
                        continue
                    toks = line.split()
                    if len(toks) > 10:
                        docs.append(line)
                        count += 1
                    if count % 10000 == 0:
                        print('processing... ' + str(count))
                    if max < count:
                        break
                corpus = ptm.Corpus(docs)

        if (type(corpus) != list and len(corpus.docs) > 0) or (type(corpus) == list and len(corpus) > 0):
            result = pipeline.processCorpus(corpus)
            for doc in result:
                for sent in doc:
                    _sent = ''
                    for word in sent:
                        _sent += word + ' '
                    _sent = _sent.strip()
                    _sent = util.jamo_sentence(_sent)
                    toks = _sent.split()
                    if len(toks) > 10:
                        self.documents.append(toks)

    elif mode == 'jamo_split_filtered':
        # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
        pipeline = ptm.Pipeline(ptm.tokenizer.Word())
        for a_file in files:
            if is_directory == True and max == -1:
                corpus += ptm.CorpusFromDirectory(a_file).docs
            elif is_directory == False and doc_index != -1 and max == -1:
                # .docs added for consistency with the other branches
                corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
            elif is_directory == False and doc_index == -1 and max == -1:
                corpus += ptm.CorpusFromFile(a_file).docs
            elif is_directory == False and max > 0:
                count = 0
                docs = []
                for line in open(a_file):
                    if doc_index != -1:
                        line = line.split()[doc_index]
                    if len(line) < 1:
                        continue
                    toks = line.split()
                    if len(toks) > 10:
                        docs.append(line)
                        count += 1
                    if count % 10000 == 0:
                        print('processing... ' + str(count))
                    if max < count:
                        break
                corpus = ptm.Corpus(docs)

        if (type(corpus) != list and len(corpus.docs) > 0) or (type(corpus) == list and len(corpus) > 0):
            result = pipeline.processCorpus(corpus)
            for doc in result:
                _sent = ''
                for word in doc:
                    _sent += word + ' '
                _sent = _sent.strip()
                _sent = util.jamo_sentence(_sent)
                toks = _sent.split()
                if len(toks) > 10:
                    self.documents.append(toks)

    elif mode == 'simple':
        # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
        count = 0
        for line in open(files[0], encoding='utf-8'):
            if doc_index != -1:
                line = line.split()[doc_index]
            toks = line.split()
            if len(toks) > 10:
                self.documents.append(toks)
                count += 1
            if count % 10000 == 0:
                print('processing... ' + str(count))

    print('Document size for the total dataset: ' + str(len(self.documents)))
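# --- Usage sketch (added for illustration): assumes preprocessing() belongs to a
# wrapper class that initializes self.documents = []; the class name
# CorpusPreprocessor below is hypothetical, and the MeCab dictionary / stopword /
# data paths are placeholders.
prep = CorpusPreprocessor()
prep.preprocessing(mode='unfiltered',
                   path='/usr/local/lib/mecab/dic/mecab-ko-dic',
                   stopword_file='../stopwords/stopwordsKor.txt',
                   files=['../data/sampleKor.txt'],
                   is_directory=False,
                   doc_index=-1,
                   max=-1)
print(len(prep.documents))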
    return _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs, labels


if language == 'en':
    _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs \
        = read_english_corpus()
elif language == 'ko':
    _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs, labels \
        = read_korean_corpus()

if language == 'ko':
    mecab_path = 'C:\\mecab\\mecab-ko-dic'
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.Komoran(),
        #ptm.tokenizer.MeCab(mecab_path),
        ptm.helper.POSFilter('NN*|V*|IC*'),
        ptm.helper.SelectWordOnly(),
        ptm.ngram.NGramTokenizer(1, 2),
        ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))
elif language == 'en':
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.WordPos(),
        ptm.helper.POSFilter('NN*|A*|V*|J*'),
        ptm.helper.SelectWordOnly(),
        #ptm.ngram.NGramTokenizer(1, 2),
        ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'))


def make_documents(result):
    docs = []