Example #1
import pyTextMiner as ptm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def vectorizeCaseTwo():
    corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt',2)

    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.Komoran(),
                            ptm.helper.POSFilter('NN*'),
                            ptm.helper.SelectWordOnly(),
                            ptm.ngram.NGramTokenizer(2, 2),
                            ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')
                            )
    result = pipeline.processCorpus(corpus)
    print('== Morphological analysis + nouns only + words only + frequency analysis ==')
    print(result)
    print()

    print('== Vectorization ==')

    documents = []
    for doc in result:
        document = ''
        for sent in doc:
            document += " ".join(sent) + " "
        documents.append(document)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(documents)
    print(vectorizer.get_feature_names())
    print(X.shape)

    print(X.toarray())

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(documents)
    print(vectorizer.get_feature_names())
    print(len(vectorizer.get_feature_names()))
    print(X.toarray())
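
# --- Added sketch (not part of the original example) ---
# A small, hedged follow-up to the TfidfVectorizer step above: it prints each
# document's top-weighted terms. It relies only on scikit-learn/numpy calls
# already used in this example; printTopTfidfTerms is a helper name introduced
# here, not part of pyTextMiner.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def printTopTfidfTerms(documents, top_n=5):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(documents)
    terms = np.array(vectorizer.get_feature_names())
    for row in X.toarray():
        # indices of the top_n highest TF-IDF weights in this document
        order = row.argsort()[::-1][:top_n]
        print([(terms[i], round(float(row[i]), 3)) for i in order])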
Example #2
import pyTextMiner as ptm

_stopwords = []
with open("./stopwords/stopwordsKor.txt", encoding='utf-8') as file:
    for line in file:
        line = line.strip()  #or some other preprocessing
        _stopwords.append(line)  #storing everything in memory!

path = 'C:\\mecab\\mecab-ko-dic'
#pos_tagger_name - either komoran, okt, nltk (an English configuration is sketched after this snippet)
#lang = ko or en
pipeline = ptm.Pipeline(
    ptm.keyword.TextRankExtractor(pos_tagger_name='mecab',
                                  mecab_path=path,
                                  max=5,
                                  lang='ko',
                                  stopwords=_stopwords,
                                  combined_keywords=True))

corpus = ptm.CorpusFromFile('./data/sampleKor.txt')
result = pipeline.processCorpus(corpus)
print('== Keyword Extraction by TextRank ==')
print(result)
print()
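
# --- Added sketch (assumption-labelled, not part of the original) ---
# The comments above say pos_tagger_name may also be 'nltk' and lang may be 'en'.
# A minimal English configuration might therefore look like the following; the
# keyword arguments mirror the Korean example above, mecab_path is presumably
# only needed for the mecab tagger, and './data/sampleEng.txt' is the English
# sample file used elsewhere in these examples.
pipeline_en = ptm.Pipeline(
    ptm.keyword.TextRankExtractor(pos_tagger_name='nltk',
                                  max=5,
                                  lang='en',
                                  stopwords=[],
                                  combined_keywords=True))
corpus_en = ptm.CorpusFromFile('./data/sampleEng.txt')
print(pipeline_en.processCorpus(corpus_en))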

from sklearn.datasets import fetch_20newsgroups
ng20 = fetch_20newsgroups(subset='all',
                          remove=('headers', 'footers', 'quotes'))

print("XXXX " + str(ng20.data[0]))
import pyTextMiner as ptm

#corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2)
corpus = ptm.CorpusFromFile('./data/134963_norm.txt')
# import nltk
# nltk.download()
# Now that the text is split into words, we can remove stopwords. Let's use ptm.helper.StopwordFilter to drop the unnecessary words.
# Then let's add ptm.stemmer.Porter() at the end of the pipeline to perform stemming.
# Try changing the code to use ptm.stemmer.Lancaster() as well; comparing how the Lancaster stemmer differs from the Porter stemmer is instructive (a short comparison sketch follows after this snippet).
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), ptm.helper.POSFilter('NN*'),
    ptm.helper.SelectWordOnly(),
    ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
result = pipeline.processCorpus(corpus)
print(result)
print()

documents = []
for doc in result:
    document = ''
    for sent in doc:
        document += " ".join(sent) + " "
    documents.append(document)
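
# --- Added sketch (not part of the original example) ---
# The comments above suggest comparing ptm.stemmer.Porter() with
# ptm.stemmer.Lancaster(). A minimal English pipeline for that comparison could
# look like this; the tiny inline corpus is only for illustration, and note that
# these stemmers operate on English tokens, not on the Korean corpus above.
sample_docs = ['The runners were running happily through the cities.']
for stemmer in (ptm.stemmer.Porter(), ptm.stemmer.Lancaster()):
    stem_pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                                 ptm.tokenizer.Word(),
                                 stemmer)
    print(stem_pipeline.processCorpus(sample_docs))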

#2016-10-20.txt
corpus1 = ptm.CorpusFromFile('./data/2016-10-20.txt')
noun_extractor = ptm.noun_extractor.NounExtractionKorean(corpus1)
sent = '두바이월드센터시카고옵션거래소'
result = noun_extractor(sent)
print(result)
from document_classification.ml_textclassification import documentClassifier
import pyTextMiner as ptm

if __name__ == '__main__':
    document_classifier = documentClassifier()
    mecab_path = 'C:\\mecab\\mecab-ko-dic'
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(),
        ptm.tokenizer.MeCab(mecab_path),
        ptm.helper.POSFilter('NN*'),
        ptm.helper.SelectWordOnly(),
        ptm.ngram.NGramTokenizer(2, 2),
        #ptm.tokenizer.LTokenizerKorean(),
        ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))

    #mode is either train or predict
    mode = 'train'
    if mode == 'train':
        input_file = './data/3_class_naver_news.csv'
        # 1. text processing and representation
        corpus = ptm.CorpusFromFieldDelimitedFileForClassification(
            input_file,
            delimiter=',',
            doc_index=4,
            class_index=1,
            title_index=3)
        corpus.docs
        tups = corpus.pair_map
        class_list = []
        for id in tups:
            #print(tups[id])
Example #5
    test_sample = '한국 경제가 위기에 처하다'
    # Convert the sample document into a list and use the infer_vector method to get a vector representation for it
    new_doc_words = test_sample.split()
    similars = doc2vec.most_similar(test_sample)
    for sim in similars:
        print(str(sim))

    mecab_path = 'C:\\mecab\\mecab-ko-dic'
    # stopwords file path
    stopwords = '../stopwords/stopwordsKor.txt'

    test_sample1 = '중국 시장은 위축되었다'

    pipeline = ptm.Pipeline(ptm.tokenizer.MeCab(mecab_path),
                            ptm.lemmatizer.SejongPOSLemmatizer(),
                            ptm.helper.SelectWordOnly(),
                            ptm.helper.StopwordFilter(file=stopwords))

    doc_vec1 = pipeline.processCorpus([test_sample])
    doc_vec2 = pipeline.processCorpus([test_sample1])

    print(doc_vec1[0])
    print(doc_vec2[0])

    # use the compute_similarity_vec utility to compare the two document vectors.
    similarity = doc2vec.compute_similarity_vec(first_vec=doc_vec1[0],
                                                second_vec=doc_vec2[0])
    print('similarity between the two documents: ')
    print(str(similarity))
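
    # --- Added sketch (not part of the original example) ---
    # The comment above mentions infer_vector; the `doc2vec` object used above
    # appears to be pyTextMiner's own wrapper. A minimal, self-contained
    # illustration of the same idea with plain gensim (trained here on just the
    # two tiny samples, so the numbers are meaningless) could look like this:
    import numpy as np
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    tiny_corpus = [TaggedDocument(text.split(), [i])
                   for i, text in enumerate([test_sample, test_sample1])]
    d2v = Doc2Vec(tiny_corpus, vector_size=50, min_count=1, epochs=40)
    v1 = d2v.infer_vector(test_sample.split())
    v2 = d2v.infer_vector(test_sample1.split())
    print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))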
Example #6
    def preprocessing(self,
                      mode,
                      path,
                      stopword_file,
                      files,
                      is_directory=False,
                      doc_index=-1,
                      max=-1):
        util = ptm.Utility()
        # mode is either filtered or unfiltered or simple
        corpus = []
        if mode == 'unfiltered':
            # path = '/usr/local/lib/mecab/dic/mecab-ko-dic'
            pipeline = ptm.Pipeline(
                ptm.splitter.KoSentSplitter(), ptm.tokenizer.MeCab(path),
                ptm.lemmatizer.SejongPOSLemmatizer(),
                ptm.helper.SelectWordOnly(),
                ptm.helper.StopwordFilter(file=stopword_file))

            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(
                        a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                        corpus = ptm.Corpus(docs)

            if ((type(corpus) != list and len(corpus.docs) > 0)
                    or (type(corpus) == list and len(corpus) > 0)):
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    document = []
                    for sent in doc:
                        for word in sent:
                            document.append(word)
                    self.documents.append(document)

        elif mode == 'filtered':
            pipeline = ptm.Pipeline(ptm.tokenizer.Word())
            # corpus = ptm.CorpusFromFile('/Data/ko_sns_comments/naver_comments15_16_filtered.txt')
            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(
                        a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break
                        corpus = ptm.Corpus(docs)

            self.documents = pipeline.processCorpus(corpus)

        elif mode == 'jamo_split_unfiltered':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            pipeline = ptm.Pipeline(
                ptm.splitter.KoSentSplitter(), ptm.tokenizer.MeCab(path),
                ptm.lemmatizer.SejongPOSLemmatizer(),
                ptm.helper.SelectWordOnly(),
                ptm.helper.StopwordFilter(file=stopword_file))

            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(
                        a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                        corpus = ptm.Corpus(docs)

            if ((type(corpus) != list and len(corpus.docs) > 0)
                    or (type(corpus) == list and len(corpus) > 0)):
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    for sent in doc:
                        _sent = ''
                        for word in sent:
                            _sent += word + ' '
                        _sent = _sent.strip()
                        _sent = util.jamo_sentence(_sent)
                        toks = _sent.split()
                        if len(toks) > 10:
                            self.documents.append(toks)

        elif mode == 'jamo_split_filtered':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            pipeline = ptm.Pipeline(ptm.tokenizer.Word())
            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(
                        a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                        corpus = ptm.Corpus(docs)

            if ((type(corpus) != list and len(corpus.docs) > 0)
                    or (type(corpus) == list and len(corpus) > 0)):
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    _sent = ''
                    for word in doc:
                        _sent += word + ' '
                    _sent = _sent.strip()
                    _sent = util.jamo_sentence(_sent)
                    toks = _sent.split()
                    if len(toks) > 10:
                        self.documents.append(toks)

        elif mode == 'simple':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            count = 0
            for line in open(files[0], encoding='utf-8'):
                if doc_index != -1:
                    line = line.split()[doc_index]
                toks = line.split()
                if len(toks) > 10:
                    self.documents.append(toks)
                    count += 1

                if count % 10000 == 0:
                    print('processing... ' + str(count))

        print('Document size for the total dataset: ' +
              str(len(self.documents)))
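
    # --- Added usage sketch (hypothetical, not in the original) ---
    # The enclosing class is not shown in this snippet. Assuming it is exposed
    # as, say, `CorpusPreprocessor` (hypothetical name) with a `documents` list,
    # a call in 'simple' mode might look like the following (file path assumed):
    #
    #   prep = CorpusPreprocessor()
    #   prep.preprocessing(mode='simple',
    #                      path='/usr/local/lib/mecab/dic/mecab-ko-dic',
    #                      stopword_file='./stopwords/stopwordsKor.txt',
    #                      files=['./data/naver_comments15_16_filtered.txt'],
    #                      doc_index=-1,
    #                      max=-1)
    #   print(len(prep.documents))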
Example #7
import pyTextMiner as ptm

dictionary_path='./dict/user_dic.txt'
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.Komoran(userdic=dictionary_path),
                        ptm.helper.POSFilter('NN*'),
                        ptm.helper.SelectWordOnly(),
                        #ptm.tokenizer.MaxScoreTokenizerKorean(),
                        #ptm.tokenizer.Word(),
                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
                        #ptm.ngram.NGramTokenizer(2,3),
                        #ptm.counter.WordCounter())
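
# --- Added note (assumption, not part of the original) ---
# If ptm.tokenizer.Komoran wraps KoNLPy's Komoran (which the userdic argument
# suggests), ./dict/user_dic.txt is typically a plain-text file with one entry
# per line: a surface form and a POS tag separated by a tab, e.g.
#
#   토끼정	NNP
#   쉑쉑버거	NNP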

corpus = ptm.CorpusFromEojiFile('./data/filtered_content.txt')
#result = pipeline.processCorpus(corpus)

#print(result)
print()

import numpy as np
print(np.__version__)

s = "회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습."


pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter())
corpus = [s]
result = pipeline.processCorpus(corpus)
print(result)
Example #8
import gensim
import pyTextMiner as ptm
from sklearn.datasets import fetch_20newsgroups

#model Google News, run once to download pre-trained vectors
#!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)

# Fetch ng20 dataset
ng20 = fetch_20newsgroups(subset='all',
                          remove=('headers', 'footers', 'quotes'))
# text and ground truth labels
texts, y = ng20.data, ng20.target

#corpus = [preprocess(text) for text in texts]
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(), ptm.tokenizer.Word(),
    ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'),
    ptm.stemmer.Porter())
result = pipeline.processCorpus(texts)
corpus = []
for doc in result:
    document = []
    for sent in doc:
        for word in sent:
            document.append(word)
    corpus.append(document)
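
# --- Added sketch (not part of the original example) ---
# A common next step with the pre-trained vectors loaded above is to average
# the word vectors of each tokenized document, skipping out-of-vocabulary
# words; document_vector is a helper name introduced here, not from the script.
import numpy as np


def document_vector(word_vectors, doc):
    # keep only tokens that the pre-trained KeyedVectors actually contain
    words = [w for w in doc if w in word_vectors]
    if not words:
        return None  # empty documents get filtered out below
    return np.mean(word_vectors[words], axis=0)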


# ### Remove empty docs
def filter_docs(corpus, texts, labels, condition_on_doc):
    """
    Filter corpus, texts and labels given the function condition_on_doc which takes
import multiprocessing
from time import time

import gensim
import pyTextMiner as ptm
from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()  # Count the number of cores in a computer

print('Start reading the dataset 1....')
path = '/usr/local/lib/mecab/dic/mecab-ko-dic'

pipeline = ptm.Pipeline(
    ptm.splitter.KoSentSplitter(), ptm.tokenizer.MeCab(path),
    ptm.lemmatizer.SejongPOSLemmatizer(), ptm.helper.SelectWordOnly(),
    ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

corpus = ptm.CorpusFromFieldDelimitedEmojiFile('/Data/ko_sns_comments/xab', 1)
result1 = pipeline.processCorpus(corpus)

print('Finish processing... ')

i = 0
file = open("naver_comments15_16_filtered.txt", "a+")
for doc in result1:
    if i % 10000 == 0:
        print('processing ' + str(i))
    i += 1
    document = ''
    for sent in doc:
        for word in sent:
Example #10
import nltk
import pyTextMiner as ptm


class EnglishDictionarySentimentAnalyzer:
    def __init__(self):
        self.name = 'EnglishDictionarySentimentAnalyzer'

    def createDictionary(self):
        nltk.download('sentiwordnet')
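
    # --- Added sketch (not part of the original example) ---
    # createDictionary above only downloads SentiWordNet. Looking up polarity
    # scores with NLTK's sentiwordnet corpus (the likely reason the main block
    # below maps POS tags to 'n', 'a', ...) works roughly like this;
    # sentiwordnetScores is a helper name introduced here:
    def sentiwordnetScores(self, word, pos):
        from nltk.corpus import sentiwordnet as swn
        synsets = list(swn.senti_synsets(word, pos))
        if not synsets:
            return None
        first = synsets[0]
        return first.pos_score(), first.neg_score(), first.obj_score()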


if __name__ == '__main__':

    corpus = ptm.CorpusFromFile('./data/sampleEng.txt')
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(), ptm.tokenizer.Word(),
        ptm.helper.StopwordFilter(file='./stopwords/stopwordsEng.txt'),
        ptm.tagger.NLTK(), ptm.lemmatizer.WordNet())

    result = pipeline.processCorpus(corpus)

    EnglishDictionarySentimentAnalyzer().createDictionary()

    for doc in result:
        for sent in doc:
            for _str in sent:
                # _str is a (word, POS tag) pair produced by ptm.tagger.NLTK()
                pos = ''
                if (str(_str[1]).startswith("N")):
                    pos = 'n'
                elif (str(_str[1]).startswith("A")):
    file_name = './data/emo_positive.txt'
    sentiAnalyzer.readPositiveEmotiDictionary(file_name)
    file_name = './data/polarity.csv'
    sentiAnalyzer.readPolarityDictionary(file_name)

    dict_list = sentiAnalyzer.getSentiDictionary()

    pipeline = None
    corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt', 2)
    mecab_path = 'C:\\mecab\\mecab-ko-dic'
    mode = 'korean_lemmatizer'
    if mode != 'korean_lemmatizer':
        pipeline = ptm.Pipeline(
            ptm.splitter.NLTK(),
            ptm.tokenizer.MeCab(mecab_path),
            #ptm.tokenizer.Komoran(),
            ptm.helper.SelectWordOnly(),
            #ptm.ngram.NGramTokenizer(1,2,concat=' '),
            ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))
    else:
        pipeline = ptm.Pipeline(
            ptm.splitter.NLTK(),
            ptm.tokenizer.MeCab(mecab_path),
            #ptm.tokenizer.Komoran(),
            ptm.lemmatizer.SejongPOSLemmatizer(),
            ptm.helper.SelectWordOnly(),
            #ptm.ngram.NGramTokenizer(1, 2, concat=' '),
            ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))

    documents = [
        '오늘은 비가와서 그런지 매우 우울하다',
Example #12
def preprocessing_english():
    tweet_raw = pd.read_csv('./SuperbowlData/클린하고유저거름/user정리_{}_2019_09_23_to_2019_10_02.csv'.format(brand), encoding = 'utf-8', header = 0)
    #cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text'] #this is for sentiment140 preprocess
    #colds = ['date','time','user_name','text','link','retweet_counts','favorite_counts'] #this is for collected tweets data via GOT3
    #tweet_raw = pd.read_csv("./data/sentiment140_training.1600000.processed.noemoticon.csv", encoding = 'latin-1', header = None, names = cols) #this is for sentiment140 data preprocess
    #tweet_raw = pd.read_csv("./SuperbowlData/클린하고유저거름/필터_clear_Kia_twitter_data_2019-01-31_to_2019-02-07.csv", encoding = 'utf-8', header = 0)
    print(tweet_raw['text'][:5])
    #tweet_raw = tweet_raw[0:5] #for test
    corpus = []
    for i in range(len(tweet_raw)):
        doc = str(tweet_raw['text'][i])
        doc = doc.replace("[", "").replace("]", "").replace("%", "").replace("? 셳", "'t").replace("?셳", "'t").replace("? 셲", "'s").replace("?셲", "'s")\
            .replace("? 쁥", "'h").replace("?쁥", "'h").replace("? 쁲", "'s").replace("?쁲", "'s").replace("? 셱", "'r").replace("?셱", "'r").replace("? 쁳", "'t").replace("?쁳", "'t")\
            .replace("? 셫", "'m").replace("?셫", "'m").replace("? 쁶", "'w").replace("?쁶", "'w").replace("? 쐏", "'p").replace("?쐏", "'p").replace("? 쐌", "'M").replace("?쐌", "'M")\
            .replace("? 셙", "'a").replace("?셙", "'a").replace("? 쏧", "'I").replace("?쏧", "'I").replace("훮", "ā").replace("? 셶", "'v").replace("?셶", "'v")\
            .replace("? 쏷", "'T").replace("?쏷", "'T").replace("? 쏝", "'B").replace("?쏝", "'B").replace("? 셪", "'l").replace("?셪", "'l").replace("? 쐙", "'y").replace("?쐙", "'y")\
            .replace("짙", "£").replace("?쏪", "'J").replace("챕", "é").replace("? 쏻", "'W").replace("?쏻", "'W").replace("? 쐓", "'S").replace("?쐓", "'S")\
            .replace("훮", "ā").replace("? 쐁", "'C").replace("?쐁", "'C").replace("竊쉎", ": h").replace("竊", "(").replace("? 쐌", "'m").replace("?쐌", "'m").\
            replace("? 쒴", "'K").replace("?쒴", "'K").replace("? 쐆", "'h").replace("?쐆", "'h").replace("? 셎", "'S").replace("?셎", "'S").replace("? 쁅", "'F").replace("?쁅", "'F")\
            .replace("? 쐔", "'T").replace("?쐕", "'T").replace("?죛", " s").replace("?쒋?", "'' ").replace("? 쏞", "'C").replace("?쏞", "'C").replace("? 쏱", "'P").replace("?쏱", "'P")\
            .replace("? 셝", "'d").replace("?셝", "'d").replace("? 쏽", "'Y").replace("?쏽", "'Y").replace("? 쏫", "'K").replace("?쏫","'K").replace("? 쏤", "'F").replace("?쏤", "'F")\
            .replace("? 쏦", "'H").replace("?쏦", "'H").replace("&amp;", "and").replace("? 쏺", "'V").replace("?쏺", "'V")\
            .replace("? 쏛", "'A").replace("?쏛", "'A").replace("? 쏡", "'S").replace("?쏡", "'S").replace("? 쐍", "'n").replace("?쐍", "'n").replace("?㏇뇰?▧?", "").replace("? 쐊", "'k").replace("?쐊", "'k")
        doc = re.sub("@[\d|A-Z|a-z|_.]+", "", doc) #사용자태그 삭제
        doc = re.sub("(http|https|ftp|telnet|news|mms)://[^\"'\s()]+", "", doc) #url 삭제
        doc = doc.replace("'ve", " have").replace("'s", " is").replace("n't", " not").replace("'m", " am").replace("'ll", " will").replace("'d", "would")
        doc = doc.lower()
        #doc = re.sub("[^a-zA-Z]", " ", doc) #특수문자 삭제
        doc = re.sub("[^A-Za-z.?!\s]", " ", doc)
        corpus.append("{}".format(doc))  # .split("."))

    pipeline1 = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Word(),
                           ptm.helper.StopwordFilter(file='./stopwordsEng.txt'),
                           ptm.tagger.NLTK(), ptm.lemmatizer.WordNet(),
                           ptm.helper.SelectWordOnly())  ##, kp.ngram.NGramTokenizer())
    #Below: an alternative pipeline for LDA...
    pipeline2 = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Word(),
                           ptm.helper.StopwordFilter('./data/english_stopwords.txt'),
                           ptm.tagger.NLTK(), ptm.lemmatizer.WordNet(), ptm.helper.POSFilter('N*', 'J*'),
                           ptm.helper.SelectWordOnly())

    result1 = pipeline1.processCorpus(corpus)
    #result2 = pipeline2.processCorpus(corpus)

    f_output = open('./전처리/preprocessed/전처리최종_앱티브_2019{}.csv'.format(brand), 'w', encoding='utf-8', newline='')
    csv_writer = csv.writer(f_output) # quoting = csv.QUOTE_ALL)
    #csv_writer.writerow(['sentiment', 'id', 'date', 'query_string', 'user', 'text'])


    for i, doc in enumerate(result1):
        #doc =  re.sub('[\W]', '', doc) # remove special characters
        # Remove punctuations and numbers
        #doc = re.sub('[^a-zA-Z]', ' ', doc)
        # Single character removal
        #doc = re.sub(r"\s+[a-zA-Z]\s+", ' ', doc)
        # remove words whose length is 2 or less (drop short words)
        '''
        for w in doc:
            w = re.sub('[^a-zA-Z]', ' ', w)
            w = re.sub(r"\s+[a-zA-Z]\s+", ' ', w)
            if len(w)>2 :
                doc = doc.append(' '.join(w))
        '''

        #print(i, doc)
        sent = list(map(" ".join, doc))
        #csv_writer.writerow([tweet_raw['sentiment'][i], tweet_raw['id'][i], tweet_raw['date'][i], tweet_raw['query_string'][i], tweet_raw['user'][i],"{}".format(" ".join(sent))]) #not this. don't use
        csv_writer.writerow([tweet_raw['date'][i], tweet_raw['time'][i], tweet_raw['user_name'][i], " ".join(sent)])
Example #13
# You can use ptm.tokenizer.Komoran or ptm.tokenizer.TwitterKorean for morphological analysis.
# After the morphological analysis, let's extract only the nouns whose POS tags start with NN and print just the words.

import io

import pyTextMiner as ptm

#import nltk
#nltk.download('punkt')

#pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(),
#                        ptm.helper.POSFilter('NN*'),
#                        ptm.helper.SelectWordOnly(),
#                        ptm.ngram.NGramTokenizer(3),
#                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')
#                        )

pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(),
    ptm.segmentation.SegmentationKorean(
        './model/korean_segmentation_model.crfsuite'),
    ptm.ngram.NGramTokenizer(3),
    ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

result = pipeline.processCorpus(corpus)

with io.open("demofile.csv", 'w', encoding='utf8') as f:
    for doc in result:
        for sent in doc:
            f.write('\t'.join(sent) + "\n")

print('== Sentence splitting + morphological analysis + nouns only + words only + phrase extraction ==')
print(result)
print()