Example #1
def classify(text):
    language = detect_language(text)
    if language != 'ko':
        text = translater(text)
    text = spellchecker(text)
    word = [text]
    morph_tokenizer = Twitter()
    word = [morph_tokenizer.morphs(row) for row in word]
    with open('./model/tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    sequences_test = tokenizer.texts_to_sequences(word)
    data_int_t = pad_sequences(sequences_test,
                               padding='pre',
                               maxlen=(MAX_SEQUENCE_LENGTH - 5))
    data_test = pad_sequences(data_int_t,
                              padding='post',
                              maxlen=(MAX_SEQUENCE_LENGTH))
    model = load_model('./model/train_model.h5')
    y_prob = model.predict(data_test)
    for n, prediction in enumerate(y_prob):
        pred = y_prob.argmax(axis=-1)[n]
        if pred < 2.0:
            return ("질문을 이해하지 못했어요. 다시 입력해주세요.")
        else:
            if language == 'ko':
                return (classes[pred])
            else:
                return (translater(classes[pred], language))
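# A minimal usage sketch; classify() depends on globals defined elsewhere in the project
# (detect_language, translater, spellchecker, MAX_SEQUENCE_LENGTH, classes), so the call
# below only illustrates the intended interface, with a made-up question.
if __name__ == "__main__":
    print(classify("배송은 언제쯤 도착하나요?"))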
Example #2
def naver_card_info():
    url = "https://card.search.naver.com/card.naver?singleCardId=20"
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    div_tags = soup.find("div",
                         {"class": "_detail_1 sum_one sum_one_v1 _tab_detail"})
    tr_tags = div_tags.find_all("tr")
    del tr_tags[0]
    t = list(tr_tags[1].strings)
    lists = []
    for temp in t:
        if temp == '\n':
            continue
        lists.append(temp)

    twitter = Twitter()
    nouns = twitter.nouns(' '.join(lists))
    pos = twitter.pos(' '.join(lists))
    morph = twitter.morphs(' '.join(lists))
    phrases = twitter.phrases(' '.join(lists))

    count = Counter(nouns)
    print(lists)
    print(pos)
    print(morph)
    print(phrases)
    print(nouns)
    print(count)
class InputProcessor:
    def __init__(self):
        self.twitter = Twitter()

    def morphs(self, input_line):
        morphs = self.twitter.morphs(input_line)
        return morphs
Example #4
class CharBaseDataset(Dataset):
    def __init__(self, args, tokenizer, mode):
        super(CharBaseDataset, self).__init__()
        self.tokenizer = tokenizer
        self.word_tokenizer = Twitter()
        self.maxlen = 128
        if "train" in mode:
            data_path = os.path.join(args.data_dir, args.task, args.train_file)
        elif "dev" in mode:
            data_path = os.path.join(args.data_dir, args.task, args.dev_file)
        elif "test" in mode:
            data_path = os.path.join(args.data_dir, args.task, args.test_file)
        self.dataset = pd.read_csv(data_path, encoding="utf8", sep="\t")
        if "small" in mode:
            self.dataset = self.dataset[:10000]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        txt = str(self.dataset.at[idx, "review"])
        data = self.tokenizer(txt,
                              padding="max_length",
                              max_length=self.maxlen,
                              truncation=True)
        char_token = self.tokenizer._tokenize(txt)
        word_token = self.word_tokenizer.morphs(txt)
        input_ids = torch.LongTensor(data["input_ids"])
        token_type_ids = torch.LongTensor(data["token_type_ids"])
        attention_mask = torch.LongTensor(data["attention_mask"])
        label = self.dataset.at[idx, "rating"]

        return (input_ids, attention_mask, token_type_ids,
                label), [txt, char_token, word_token]
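# A minimal batching sketch (an assumption, not part of the original code): __getitem__
# returns fixed-size tensors plus variable-length raw token lists, so the default collate
# can fail on the raw part; a custom collate_fn keeps the two halves separate.
from torch.utils.data import DataLoader

def char_base_collate(batch):
    tensors, raws = zip(*batch)
    input_ids, attention_mask, token_type_ids, labels = zip(*tensors)
    return (torch.stack(input_ids),
            torch.stack(attention_mask),
            torch.stack(token_type_ids),
            torch.LongTensor(labels)), list(raws)

# loader = DataLoader(dataset, batch_size=32, collate_fn=char_base_collate)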
def korean_morph(text):
    twitter = Twitter()

    # str() is enough here; the original wrapped text in unicode(), a Python 2 built-in
    s = twitter.morphs(str(text))

    s = ' '.join(s)

    return s
Example #6
def tokenizer(self, sentence, vocab_flag):
    tw = Twitter()
    morphs = tw.morphs(sentence.strip())
    analysis = []
    for m in morphs:
        analysis.append(m)
    self.word_embed_dic.append(analysis)
    return [w for w in analysis if w]
Example #8
def Calculate(input_string,S_DB):

	f = open(input_string,'r')
	line = f.readline()

	number_of_bad = S_DB['# of bad,good case'][0]
	number_of_good = S_DB['# of bad,good case'][1]

	sum_of_bad = 0
	sum_of_good = 0
	string_buf = []
	R_DB = []
	twitter = Twitter()
	prob_bad = math.log((number_of_bad/(number_of_good+number_of_bad)))  # share of negative reviews among all reviews (log scale)
	prob_good = math.log((number_of_good/(number_of_good+number_of_bad)))  # share of positive reviews among all reviews (log scale)

	while True:
		line = f.readline()
		line_original = line.rstrip()
		if not line:
			break
		line = twitter.morphs(line)
		line.pop()  # drop the trailing '\n' token

		string_buf = []
		sum_of_bad = 1
		sum_of_good = 1

		for j in range(1,len(line)):
			if line[j] in string_buf:  # words repeated within a single review are only counted once
				continue
			string_buf.append(line[j])
			buf = S_DB.get(line[j])

			if buf is None:  # ignore words that are not in the dictionary
				continue

			p_bad = math.log(buf[0]+1) - math.log(number_of_bad)
			# add-one smoothing: a zero frequency would otherwise break the log-scale value
			p_good = math.log(buf[1]+1) - math.log(number_of_good)
			sum_of_bad += p_bad
			sum_of_good += p_good

		sum_of_bad += prob_bad
		sum_of_good += prob_good
		if sum_of_bad > sum_of_good:
			R_DB.append([line_original,0])
		elif sum_of_good > sum_of_bad:
			R_DB.append([line_original,1])
		else: 
			R_DB.append([line_original,-1])

	f.close()

	return R_DB
def word_preprocessor(sent):
    twt = Twitter()
    sent = re.sub('[\,\<\>\(\)\+\-\=\&\@\#\$]', '', sent)
    sent = re.sub('\.{2,}', ' .. ', sent)
    sent = re.sub('\~+', ' ~ ', sent)
    sent = re.sub('\!+', ' ! ', sent)
    sent = re.sub('\?+', ' ? ', sent)
    sent = re.sub('(ac)', ' 99', sent)
    sent = re.sub('(mv)', ' 88', sent)
    sent = re.sub('ㅋ{1,}|ㅎ{1,}', 'ㅋ', sent)
    sent = re.sub('ㅜ{1,}|ㅠ{1,}|ㅠㅜ|ㅜㅠ|ㅡㅜ|ㅜㅡ|ㅡㅠ|ㅠㅡ', 'ㅠㅠ', sent)
    sent = " ".join(twt.morphs(sent))
    return sent
Example #10
def prepro_like_morphlized(data):
    # create the morphological analyzer object
    morph_analyzer = Twitter()

    # list that will collect the morpheme-tokenized sentences
    result_data = list()
    # loop so that every sentence in the data gets tokenized
    for seq in tqdm(data):
        # Twitter.morphs returns a list of tokens; rejoin them into one space-separated string
        # morphlized_seq = " ".join(morph_analyzer.morphs(seq.replace(' ', '')))
        morphlized_seq = " ".join(morph_analyzer.morphs(seq))  # this variant looks much better
        result_data.append(morphlized_seq)

    return result_data
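# Quick usage sketch; the sentence is an arbitrary illustration and the exact token
# boundaries depend on the analyzer version.
sample = ["가끔 주말에 공원에서 산책을 합니다"]
print(prepro_like_morphlized(sample))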
Example #11
class AnalysisDiction:
    """
    This class analyzes Korean text using the kkma and twitter dictionaries.
    """
    def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
        """
        Allocate a Kkma and/or Twitter analyzer instance.
        :param on_kkma: if True, create a Kkma instance
        :param on_twitter: if True, create a Twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        Analyze string_data with kkma; the behavior depends on the mode.
        :param string_data: string to analyze
        :param mode: analysis mode ('morphs', 'nouns' or 'pos')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        Analyze string_data with twitter; the behavior depends on the mode.
        :param string_data: string to analyze
        :param mode: analysis mode ('morphs', 'nouns', 'pos' or 'posmore')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
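# A minimal usage sketch (assumes `from konlpy.tag import Kkma, Twitter` at module level).
analyzer = AnalysisDiction(on_twitter=True)
print(analyzer.analyzer_twitter('한글 형태소 분석기 테스트입니다', 'morphs'))
print(analyzer.analyzer_twitter('한글 형태소 분석기 테스트입니다', 'pos'))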
def regexp(texts):
    twt = Twitter()
    container = []
    for i, sent in enumerate(texts):
        if i % 200000 == 0:
            print(i)
        sent = re.sub('[\,\<\>\(\)\+\-\=\&\@\#\$]', '', sent)
        sent = re.sub('\.{2,}', ' .. ', sent)
        sent = re.sub('\~+', ' ~ ', sent)
        sent = re.sub('\!+', ' ! ', sent)
        sent = re.sub('\?+', ' ? ', sent)
        sent = re.sub('(ac)', ' 99', sent)
        sent = re.sub('(mv)', ' 88', sent)
        sent = re.sub('ㅋ{1,}|ㅎ{1,}', 'ㅋ', sent)
        sent = re.sub('ㅜ{1,}|ㅠ{1,}|ㅠㅜ|ㅜㅠ|ㅡㅜ|ㅜㅡ|ㅡㅠ|ㅠㅡ', 'ㅠㅠ', sent)
        container.append(" ".join(twt.morphs(sent)))
    return container
Example #13
def load_skt_nugu_samples(data_dir):
    answers = []
    questions = []
    vocab = set()
    twitter = Twitter()

    with open(os.path.join(data_dir, 'question_samples.txt')) as lines:
        for line in tqdm(lines, desc=lines.name, mininterval=0.5):
            key, value = line.strip().split('\t')

            tokens = twitter.morphs(value)
            questions.append(tokens)

            answers.append(key)

            vocab.update(tokens)
    return questions, answers, vocab
Example #14
def update_tokenize(petition_id, petition_content, tokenizing_status):
    tokenizing_status = get_crawled_status(petition_id)

    twitter = Twitter()  # Twitter morphological analyzer used for tokenizing
    token_content = twitter.morphs(petition_content)
    token_content_str = ' '.join(token_content).strip()

    sql = "UPDATE simanalysis SET token = %s, tokenizing_status = %s where id = \"%s\""
    #UPDATE simanalysis SET token = "ddjdjdjdkak", tokenizing_status = 0 where id = 21

    try:
        curs.execute(sql, (token_content_str, tokenizing_status, petition_id))
        conn.commit()

    except Exception as err:
        print("# update except!!", err)
        sys.exit()
Example #15
def insert_tokenize(petition_id, petition_content, tokenizing_status):
    tokenizing_status = get_crawled_status(petition_id)

    twitter = Twitter()  # Twitter morphological analyzer used for tokenizing
    token_content = twitter.morphs(petition_content)
    token_content_str = ' '.join(token_content).strip()

    sql = "INSERT INTO simanalysis (id, token, tokenizing_status) VALUES (%s, %s, %s)"

    #print(sql, (petition_id, token_content_str, tokenizing_status))
    try:
        curs.execute(sql, (petition_id, token_content_str, tokenizing_status))
        conn.commit()

    except Exception as err:
        print("# insert except!!", err)
        sys.exit()
Example #16
def parse_file(lines, vocab):
    twitter = Twitter()
    data = dict()
    for i, line in tqdm(enumerate(lines), desc=lines.name,
                        mininterval=0.5):
        line = line.strip().split('\t')
        if len(line) != 2:
            continue
        key, value = line
        if data.get(key) is None:
            data[key] = []
        if len(data[key]) == maximum:  # `maximum` is assumed to be defined in the enclosing scope
            continue
        tokens = twitter.morphs(value)
        data[key].append(tokens)
        vocab.update(tokens)
    return data
Example #17
def preproLikeMorphlized(data):
    # create the morphological analyzer object
    morphAnalyzer = Twitter()
    # list that will collect the morpheme-tokenized sentences
    result_data = list()
    # loop so that every sentence in the data gets tokenized
    for seq in data:
        # Twitter.morphs returns a list of tokens; the original spaces are stripped first,
        # then the tokens are rejoined into one space-separated string
        morphlizedSeq = " ".join(morphAnalyzer.morphs(seq.replace(' ', '')))
        result_data.append(morphlizedSeq)

    return result_data
def main():
    with open(settings.VERSION_JSON, "r") as jsonFile:
        data = json.load(jsonFile)

    VERSION = data['version']

    with open("new_data.json", "r") as jf:
        dt = json.load(jf)

    text = dt['text']

    x_arr = []

    t = Twitter()
    vocab_fn = settings.VOCAB_FILENAME.format(VERSION)
    vocab_file = os.path.join(settings.DATA_DIR, vocab_fn)
    with open(vocab_file) as vocab_handle:
        jobj = json.load(vocab_handle)

    arr = list()
    tokens_ko = t.morphs(text)

    for word in tokens_ko:
        try:
            tmp = jobj[word]
            arr.append(tmp)
        except KeyError:
            pass

    temp_arr = np.asarray(arr)
    x_arr.append(temp_arr)

    x_test = np.asarray(x_arr, dtype=object)

    print('Pad sequences (samples x time)')
    x_test = sequence.pad_sequences(x_test, maxlen=settings.MAX_LENGTH)
    print('x_test shape:', x_test.shape)

    mod_load_fn = settings.MODEL_FILENAME.format(VERSION)
    mod_load_path = os.path.join(settings.OUTPUT_DIR, mod_load_fn)
    model = load_model(mod_load_path)

    classes = model.predict(x_test, batch_size=settings.BATCH_SIZE)
    print(classes)
Example #19
def get_tags(open_text_file):
    nlp = Twitter()
    nouns_list = []
    token_list = []
    i = 0
    for line in open_text_file:
        # for line in tqdm(open_text_file):
        print(line)
        text = line
        text = regex.sub(u"[\n]", " ", text)
        n = nlp.nouns(text)
        token = nlp.morphs(text)
        for value in n:
            nouns_list.append(value)
        for j in token:
            token_list.append(j)
        # if i == 400:
        #     break
        # else:
        #     i+=1
    return nouns_list, token_list
Example #20
def w2v(size):
    path = 'C:\\Users\\kwk51\\Desktop\\wikiextractor\\text\\'
    text_dir = ['AA', 'AB', 'AC']
    # text_dir = ['AA','AB','AC','AD','AE','AF']
    corpus = []
    for directory in text_dir:
        for file_num in range(size):
            wiki_object = open(path + directory + '\\wiki_' + str(file_num),
                               encoding="utf-8")

            for line in wiki_object:
                if line == '\n':
                    continue
                if "doc" in line:
                    continue
                corpus.append(line)

    twitter = Twitter()
    tokenized_list = []

    for token in corpus:
        tokenized_list.append(twitter.morphs(token))

    model = Word2Vec(tokenized_list,
                     size=100,
                     window=4,
                     min_count=10,
                     workers=1,
                     iter=100,
                     sg=1)
    # The model is a shallow neural network.
    # sg=1: skip-gram - the center word is trained against the words within `window` on each side (up to 8 here).
    # sg=0: CBOW - the surrounding context predicts the single center word.

    model.save('w2v.model')

    # print(model.most_similar(positive=['여자','왕'], negative = ['남자'], topn=30))
    # print(model.similarity('남자','여자'))

    return model.most_similar(positive=['여자', '왕'], negative=['남자'], topn=30)
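# A minimal follow-up sketch, assuming the same pre-4.0 gensim API that accepts size=/iter=:
# the saved model can be reloaded later and queried without retraining.
from gensim.models import Word2Vec

w2v_model = Word2Vec.load('w2v.model')
print(w2v_model.wv.most_similar('여자', topn=10))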
Example #21
def dobby():
    query = request.values.get('query', 'default')
    conn = pymysql.connect(host=info.host,
                           user=info.user,
                           password=info.password,
                           db=info.db,
                           charset=info.charset)
    curs = conn.cursor()

    tagger = Twitter()
    query = tagger.morphs(query)
    query = [' '.join(query)]

    sql = "SELECT * FROM dobby;"
    curs.execute(sql)
    rows = curs.fetchall()
    rows = list(rows)

    # TF-IDF
    data = pd.DataFrame(rows,
                        columns=['id', 'question', 'prep_question', 'answer'])
    prep_question = list(data["prep_question"])
    prep_question = query + prep_question

    tfidfv = TfidfVectorizer().fit(prep_question)
    tf_idf_mat = tfidfv.transform(prep_question).toarray()

    doc0 = np.array(tf_idf_mat[0])
    tf_idf_mat = tf_idf_mat[1:]
    max_similarity = -1
    max_similarity_index = -1
    for idx, doc in enumerate(tf_idf_mat):
        doc = np.array(doc)
        if max_similarity < cos_sim(doc0, doc):
            max_similarity = cos_sim(doc0, doc)
            max_similarity_index = idx

    result = rows[max_similarity_index][3]
    return result
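# `cos_sim` and `info` above are external helpers from the project; a minimal
# cosine-similarity sketch (an assumption, not the original implementation) could be:
import numpy as np

def cos_sim(a, b):
    # cosine similarity between two 1-D vectors, 0.0 when either vector is all zeros
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0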
Example #22
class Tokenizer:

    def __init__(self):
        self.t = Twitter()
        
    def tokenize(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        token_list = []

        for num, input in enumerate(token):
            if token[num] in scores:
                token_list.append(token[num])
            else:
                twit_token = self.t.morphs(token[num])
                token_list = token_list + twit_token

        return token_list
    
    def noun_extract(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        noun_list = []
        compared_noun_list = self.t.nouns(sentence)

        for num, input in enumerate(token):
            if token[num] in scores:
                noun_list.append(token[num])
            else:
                twit_token = self.t.nouns(token[num])
                noun_list = noun_list + twit_token

        diff_noun_list = list(set(noun_list) - set(compared_noun_list))
        diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))

        # filter with a comprehension; popping while enumerating (as before) skips elements
        noun_list = [noun for noun in noun_list if noun not in diff_noun_list]

        return noun_list

    def noun_extract_dup(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        noun_list = []
        compared_noun_list = self.t.nouns(sentence)
        
        for num, input in enumerate(token):
            if token[num] in scores:
                noun_list.append(token[num])
            else:
                twit_token = self.t.nouns(token[num])
                noun_list = noun_list + twit_token
        
        diff_noun_list = list(set(noun_list) - set(compared_noun_list))
        diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))

        noun_list = list(set(noun_list) - set(diff_noun_list))
        return noun_list
    
    def noun_counter(self, sentence, score_dic, word):
        noun_list = self.noun_extract(sentence,score_dic)
        number = 0
        for num, input in enumerate(noun_list):
            if input == word:
                number = number + 1
        
        return number
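# A minimal usage sketch. `score_dic` feeds what appears to be soynlp's MaxScoreTokenizer,
# mapping candidate words to scores; the entries below are made-up illustrations, not
# values from the original code.
score_dic = {'자연어처리': 0.9, '형태소분석': 0.8}
tk = Tokenizer()
print(tk.tokenize('자연어처리와 형태소분석을 공부한다', score_dic))
print(tk.noun_extract_dup('자연어처리와 형태소분석을 공부한다', score_dic))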
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
from konlpy.corpus import kobill
from konlpy.tag import Twitter

t = Twitter()
fields_ko = kobill.fileids()
docs_ko = kobill.open('1809890.txt').read()
tokens_ko = t.morphs(docs_ko)
print(isinstance(tokens_ko, list))
print(tokens_ko)

embedding = word2vec.Word2Vec(tokens_ko,
                              size=5,
                              window=1,
                              negative=3,
                              min_count=1)

# Word2Vec expects an iterable of token lists (one list per sentence); passing the flat
# token list above makes gensim treat each token string as a "sentence" of characters,
# which is why the saved vocabulary ends up as single characters.
embedding.wv.save_word2vec_format('my.sample', binary=False)

model = KeyedVectors.load_word2vec_format('my.sample',
                                          binary=False,
                                          encoding='utf-8')

print(model.most_similar('육'))
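# A minimal sketch of the likely intended call (an assumption): wrap the token list so the
# whole document is treated as one sentence of word-level tokens instead of characters.
embedding_fixed = word2vec.Word2Vec([tokens_ko],
                                    size=5,
                                    window=1,
                                    negative=3,
                                    min_count=1)
print(embedding_fixed.wv.most_similar(tokens_ko[0]))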
Example #24
from konlpy.tag import Twitter # Twitter starts with capital T, so it is a class
twitter = Twitter()
maillist = twitter.pos("고구려의 영역은 어디까지일까", norm=True, stem=True)
print(maillist)

print(twitter.morphs(u' '))
# the u'' prefix marks a unicode literal; it is redundant in Python 3, where strings are unicode by default

print(twitter.morphs(' 정부와 기업이 함께 근로자의 휴가비를 지원 '))  # splits the sentence into morphemes
print(twitter.nouns('직장 내 자유로운 휴가 분위기를 조성하고 일과 휴식이 균형을 이루는 근무 여건을 만들기 위해 지난해부터 시행한 사업이다. '))  # extracts nouns only
print(twitter.pos('이것도 되나욬ㅋㅋㅋ'))  # each morpheme with its part-of-speech tag
print(twitter.pos('이것도 되나욬ㅋㅋㅋ', norm=True))  # the norm option normalizes the text to standard words
print(twitter.pos('이것도 되나욬ㅋㅋㅋ', norm=True, stem=True))  # the stem option reduces words to their root form
# The last statement is the most frequently used form
Example #25
word_set = set()
word_dict = dict()

count = 0

requester = Requester(url, limit=1000)
res = requester.next()

while res is not None:
    for data in res:
        song_info = data['song_info']

        lyric = song_info['lyric']

        split_lyric = twitter.morphs(lyric)

        word_set.update(split_lyric)

        for _str in split_lyric:
            if _str not in word_dict:
                word_dict[_str] = 1
            else:
                word_dict[_str] += 1
        count += 1

    print('[{}] word_set : {}'.format(count, len(word_set)))
    res = requester.next()

print(word_set)
Example #26
from konlpy.tag import Twitter
twitter = Twitter()
a = '책가방을메고 학교에가서 공부를 하자'
b = '우리나라 금수강산 이라서 그런가 금수저가 많네요'
mo = twitter.morphs(a)
mo2 = twitter.morphs(b)
po = twitter.pos(a)

ta = []
ta.append(mo)
ta.append(mo2)

print(ta)
print(len(ta))

# `po` holds (word, tag) pairs from twitter.pos(); iterating over `ta` (two token lists)
# cannot be unpacked into word/tag, so iterate over the tagged pairs instead
for word, tag in po:
    print(word)
    print(tag)
Example #27
'''
Created on 2018. 1. 28.

@author: hillk
'''
import sys
from konlpy.tag import Twitter

twitter = Twitter()

print('한글 문장을 입력하세요.')

try:
    while True:
        sys.stdout.write('>> ')
        sys.stdout.flush()
        text = sys.stdin.readline().strip()
        if text:
            answer = twitter.morphs(text)
            print(answer)
except KeyboardInterrupt:
    pass
Example #28
from konlpy.tag import Twitter
from konlpy.utils import pprint
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

twitter = Twitter()
# NOTE: NLTK's stopwords corpus does not include Korean, so stopwords.words('korean')
# only works if a custom 'korean' stopword file has been added to the corpus.
stop_words = set(stopwords.words('korean'))
stop_words.update(
    ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{',
     '}'])  # remove it if you need punctuation

#TEXT = u'S8에서 Bixby가 왜 광고에 나오는 것처럼 추천기능이 작동하지 않는거니'
TEXT = u'갤럭시에서 Bixby 기능이 왜 광고에 나오는 것처럼 추천기능이 작동하지 않는거니'

print("=== Parse phrase(tokenize) to morphemes. ===")
print(twitter.morphs(TEXT))
#print ("=== Nouns extract ===")
#print(twitter.nouns(TEXT))
#print ("=== Phrase extract ===")
#print(twitter.phrases(TEXT))

print("=== Post tagger ===")
print(twitter.pos(TEXT))

print("=" * 10 + ' Remove stopwords(& lower) Test: ' + "=" * 10)
print([(i.lower(), j) for i, j in twitter.pos(TEXT)
       if i.lower() not in stop_words])

print("=== Post tagger with normalization & stemming(lemmatization) ===")
print([(i.lower(), j) for i, j in twitter.pos(TEXT, norm=True, stem=True)
       if i.lower() not in stop_words])
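# NLTK has no built-in Korean stopword list, so a small hand-made set is a common substitute;
# the words below are illustrative assumptions, not values from the original code.
korean_stop_words = {'은', '는', '이', '가', '을', '를', '의', '에', '도', '와', '과'}
punct = {'.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
print([(w, t) for w, t in twitter.pos(TEXT, norm=True, stem=True)
       if w not in korean_stop_words and w not in punct])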
Example #29
#-*- coding: utf-8 -*-
from konlpy.tag import Twitter
import nltk

import sys

# Python 2 only: reset the default string encoding to UTF-8
reload(sys)
sys.setdefaultencoding('utf-8')

twitter = Twitter()

print(twitter.morphs(u'한글형태소분석기 테스트 중 입니다'))  # morphemes
print(twitter.nouns(u'한글형태소분석기 테스트 중 입니다!'))  # nouns
print(twitter.pos(u'한글형태소분석기 테스트 중 입니다.'))  # morphemes with POS tags


def read_data(filename):
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data


def tokenize(doc):
    # norm and stem are optional
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]


def term_exists(doc):
    return {
        'exists({})'.format(word): (word in set(doc))
        for word in selected_words
    }


# --- unrelated fragment of a separate preprocessing loop (truncated at both ends) ---
            if diffchar(l[idx - 1], l[idx]):
                l.insert(idx, ' ')
            idx += 1

        l = (''.join(l)).split()

        # 소문자로 바꾸기
        l = [c.lower() for c in l]
        # 사전에 있는 영단어만 남기기
        l = [c for c in l if (iskor(c[0]) or (c in eng))]

        # konlpy
        idx = 0
        while idx < len(l):
            if iskor(l[idx][0]):
                sp = twitter.morphs(l[idx])
                if idx == 0: l = sp + l[idx + 1:]
                else: l = l[:idx] + sp + l[idx + 1:]
                idx += len(sp)
            else:
                idx += 1
        # konlpy

        result = ' '.join(l) + '\n'
        # preprocess failed
        if len(result) == 1:
            result = line.split('\x01')[2]
        f.write(result)

        cnt += 1
        if cnt % 100000 == 0:
Example #31
def Learn(input_string):

	f = open(input_string,'r')
	line = f.readline()
	T_DB = []
	S_DB = {}  # dictionary that will be returned
	number_of_bad = 0  # number of negative reviews
	number_of_good = 0  # number of positive reviews
	string_buf = []
	twitter = Twitter()

	while True:
		line = f.readline()
		if not line:
			break
		line = twitter.morphs(line)
		line.pop()  # drop the trailing '\n' token
		line[0] = line.pop()  # move the positive/negative label to the front of the list
		string_buf = []
		if line[0] == '0':
			number_of_bad += 1
		else:
			number_of_good += 1
		'''
		Temporarily register each morpheme in T_DB as [morpheme, negative_flag, positive_flag].
		e.g. if '아름다움' appears in a positive review, it is stored as ['아름다움', 0, 1].
		'''

		for i in range(1,len(line)):
			if line[i] in string_buf:  # a word that appears two or more times in one review is not registered again
				continue
			string_buf.append(line[i])
			if line[0] == '0':
				T_DB.append([line[i],1,0])
			else:
				T_DB.append([line[i],0,1])

	T_DB.sort(reverse=True)
	''' Register the counts of negative and positive reviews in S_DB first '''
	S_DB['# of bad,good case'] = [number_of_bad,number_of_good]
	len_db = len(T_DB)
	bad = 0
	good = 0
	string = T_DB[1][0]

	''' Sum the counts of morphemes that appear multiple times in T_DB and register each one in S_DB '''
	for i in range(1,len_db-1):
		buf = T_DB[i][0]
		if string == buf:
			bad += T_DB[i][1]
			good += T_DB[i][2]
			if i != len_db-1:
				continue
		else:
			S_DB[string] = [bad,good]
			string = buf
			bad = T_DB[i][1]
			good = T_DB[i][2]

		if i == len_db-1:
			if bad != 0 and good != 0:
				S_DB[string] = [bad,good]

	f.close()
	return S_DB
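# A minimal end-to-end sketch combining Learn() and Calculate(); the file names are
# hypothetical placeholders, and the training file is expected to end each line with a
# 0/1 label, as Learn() assumes.
S_DB = Learn('train_reviews.txt')
R_DB = Calculate('test_reviews.txt', S_DB)
for review, label in R_DB[:5]:
    print(label, review)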
#!/usr/bin/env python
# File name....: text-analysis-2.py
# Module name..: 4 - Text Analysis & Visualization
# Author.......: Buomsoo Kim, Jinsoo Park
'''
This program demonstrates how to analyze text. It uses Twitter from the konlpy module
and handles the full movie-review data collected during Module 3.

This file is provided for educational purposes and can be distributed for class use.
Copyright (c) by Jinsoo Park, Seoul National University. All rights reserved.
'''

# Import the module needed for Korean text analysis (konlpy)
from konlpy.tag import Twitter  # import the Twitter morphological analyzer

# Before the analysis, load the movie-review data that was crawled earlier
file = open('data-full.txt', 'r', encoding='utf-8')  # open the movie-review file
lines = file.readlines()  # read every line of the review file with readlines()
file.close()  # close the file

#print(lines)

# Create the Twitter analyzer instance
twitter = Twitter()

# Morphologically analyze one sentence
tokens = twitter.morphs(lines[1])  # analyze the morphemes of one review line
print(tokens)  # print the analysis result

# ///// END OF text-analysis-2 ////////////////////////////////////////////////