def __init__(self, userdic=None):
    from konlpy.tag import Komoran
    import os
    if userdic is not None:
        print("user dict " + str(os.path.abspath(userdic)))
        self.inst = Komoran(userdic=os.path.abspath(userdic))
    else:
        self.inst = Komoran()
    self.OUT_TYPE = [list, tuple]
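# A hedged usage sketch for the user-dictionary path handled above. konlpy's
# Komoran accepts a plain-text user dictionary; the format shown in the konlpy
# documentation is one tab-separated "<surface form>\t<POS tag>" entry per
# line. The file name, entry, and sample sentence below are illustrative
# assumptions, not taken from the original project.
from konlpy.tag import Komoran

with open('user_dic.txt', 'w', encoding='utf-8') as f:
    f.write('미세먼지\tNNG\n')  # register the compound noun as a single token

komoran = Komoran(userdic='user_dic.txt')
print(komoran.pos('미세먼지 농도가 높다'))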
def __init__(self, filepath, tagger=None):
    if tagger:
        self.tagger = tagger
    else:
        self.tagger = Komoran()
    self.filepath = filepath
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
def main(unused_argv):
    if len(sys.argv) == 1:
        flags._global_parser.print_help()
        sys.exit(0)
    # Loading model
    m = model.load_model(FLAGS.dragnn_spec,
                         FLAGS.resource_path,
                         FLAGS.checkpoint_filename,
                         enable_tracing=FLAGS.enable_tracing,
                         tf_master=FLAGS.tf_master)
    sess = m['session']
    graph = m['graph']
    builder = m['builder']
    annotator = m['annotator']
    # Analyze
    # Prepare korean morphological analyzer for segmentation
    from konlpy.tag import Komoran
    komoran = Komoran()
    startTime = time.time()
    while 1:
        try:
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break
        if not line:
            break
        line = line.strip()
        if not line:
            continue
        segmented, tagged = model.segment_by_konlpy(line, komoran)
        # ex) line = '제주 로 가다 는 비행기 가 심하다 는 비바람 에 회항 하 었 다 .'
        line = ' '.join(segmented)
        parsed_sentence = model.inference(sess, graph, builder, annotator,
                                          line, FLAGS.enable_tracing)
        out = model.parse_to_conll(parsed_sentence, tagged)
        f = sys.stdout
        f.write('# text = ' + line.encode('utf-8') + '\n')
        for entry in out['conll']:
            id = entry['id']
            form = entry['form']
            lemma = entry['lemma']
            upostag = entry['upostag']
            xpostag = entry['xpostag']
            feats = entry['feats']
            head = entry['head']
            deprel = entry['deprel']
            deps = entry['deps']
            misc = entry['misc']
            li = [id, form, lemma, upostag, xpostag, feats, head, deprel, deps, misc]
            f.write('\t'.join([str(e) for e in li]) + '\n')
        f.write('\n\n')
    durationTime = time.time() - startTime
    sys.stderr.write("duration time = %f\n" % durationTime)
    # Unloading model
    model.unload_model(m)
def __init__(self):
    self.komoran = Komoran()
    self.kkma = Kkma()
    self.hann = Hannanum()
    self.mecab = Mecab()
    self.twitter = Twitter()
    self.okt = Okt()
def make_news_contents(search_word, news_links):
    komoran = Komoran()
    news_contents = []
    stopwords = ['하', '있', '없', '되', '보']
    for news_link in news_links:
        article = Article(news_link, language='ko')
        try:
            article.download()
            article.parse()
            content = article.text
        except ArticleException:
            continue
        news_content = ""
        word_tag = komoran.pos(content)
        for word, morph in word_tag:
            if word not in stopwords and word not in search_word:
                if morph in ['VA', 'VV']:
                    news_content += (word + '다' + ' ')
                elif morph in ['NNP', 'NNG', 'NP'] and len(word) > 1:
                    news_content += (word + ' ')
        news_contents.append(news_content)
    return news_contents
def sentences_komoran(filelist):
    komoran = Komoran()
    sentences = []
    for i, file in enumerate(filelist):
        with open(file, 'r', encoding='utf-8') as fp:
            while True:
                try:
                    line = fp.readline()
                    if not line:
                        break
                    line = re.sub("\xa0", " ", line).strip()
                    if line == "":
                        continue
                    tokens = komoran.nouns(line)
                    if len(tokens) == 0:
                        continue
                    sentences.append(tokens)
                except Exception as e:
                    print(e)
                    continue
    return sentences
def __init__(self):
    settings = dict(
        static_path=os.path.join(os.path.dirname(__file__), 'static'),
        template_path=os.path.join(os.path.dirname(__file__), 'templates'),
        autoescape=None,
        debug=options.debug,
        gzip=True
    )
    handlers = [
        (r'/', IndexHandler),
        (r'/_hcheck.hdn', HCheckHandler),
        (r'/dragnn', DragnnHandler),
        (r'/dragnntest', DragnnTestHandler),
    ]
    tornado.web.Application.__init__(self, handlers, **settings)
    autoreload.add_reload_hook(self.finalize)
    self.log = setupAppLogger()
    ppid = os.getpid()
    self.log.info('initialize parent process[%s] ...' % (ppid))
    self.ppid = ppid
    self.enable_tracing = options.enable_tracing
    # import konlpy if enabled
    self.enable_konlpy = options.enable_konlpy
    self.komoran = None
    if options.enable_konlpy:
        from konlpy.tag import Komoran
        komoran = Komoran()
        self.komoran = komoran
    self.log.info('initialize parent process[%s] ... done' % (ppid))
    log.info('start http start...')
def __init__(self):
    self.mongodb = MongoDBHandler()
    self.komoran = Komoran()
    self.rdate = datetime.today().strftime("%Y%m%d")
    self.articles = []
    self.font_path = '../DownloadLib/NanumFont_TTF_ALL/NanumGothic.ttf'
    self.code_list = []
def accuracy(name):
    result = []
    if name == 'kma':
        mode = Kkma()
    elif name == 'okt':
        mode = Okt()
    elif name == 'komoran':
        mode = Komoran()
    else:
        return 0
    mylin = input("문장을 입력해 주세요: ")
    print("형태소분석기", name, "정확도 분석을 시작합니다. ")
    print('\n')
    acc = mode.morphs(mylin)  # morpheme analysis of the input sentence
    for sentence in texts:
        arr.append(sentence)
        sp_text = mode.morphs(sentence)  # analyze each sentence, one line at a time
        Jaccard_similarty(acc, sp_text)  # score the pair with Jaccard similarity
    n = 5
    # sort the results and keep the top n indices
    Sortsimilarty = sorted(range(len(similarty)), key=lambda i: similarty[i], reverse=True)[:n]
    k = 0
    for i in Sortsimilarty:
        k = k + 1
        print(k, "번째로 유사도가 높은 문장입니다. : ", arr[i], "유사도는 다음과 같습니다. : ", similarty[i])
        print('\n')
    Sortsimilarty = []
    similarty = []
def __init__(self, args, konlpy_type='mecab', verbose=True):
    # Token setting
    self.pad_idx = args.pad_idx
    self.bos_idx = args.bos_idx
    self.eos_idx = args.eos_idx
    self.unk_idx = args.unk_idx
    self.sep_idx = args.sep_idx
    # Path setting
    self.save_path = args.save_path
    # Training setting
    self.vocab_size = args.vocab_size
    self.verbose = verbose
    # API setting
    if konlpy_type.lower() == 'mecab':
        self.api = Mecab()
    elif konlpy_type.lower() == 'okt':
        self.api = Okt()
    elif konlpy_type.lower() == 'hannanum':
        self.api = Hannanum()
    elif konlpy_type.lower() == 'kkma':
        self.api = Kkma()
    elif konlpy_type.lower() == 'komoran':
        self.api = Komoran()
    else:
        raise Exception('Not supported konlpy parser')
def main():
    twitter = Twitter()
    komoran = Komoran()
    mecab = Mecab()
    argv_dict = {'twitter': twitter, 'komoran': komoran, 'mecab': mecab}
    if len(sys.argv) < 2:
        print('please insert sys_argv \n',
              '1) tokenizer selection: twitter, komoran, mecab \n',
              '2) twitter_tokenizer_option: norm [no argv means True] \n',
              '3) twitter_tokenizer_option: stemming [no argv means True]')
    else:
        output_file_path = input('output_text_file_name: ')
        output_file_path = './data/' + output_file_path
        input_file_path = input('input_text_file_name: ')
        input_file_path = './data/' + input_file_path
        twitter_option = [bool(sys.argv[2]), bool(sys.argv[3])] if len(sys.argv) == 4 else [True, True]
        app_id_list, app_name_list, cate_list, rating_list, review_list = read_jsonl(
            input_file_path, key_ma=False)
        ma_list = []
        for review in tqdm(review_list, desc='tokenizing', total=len(review_list)):
            ma_tokens = get_pos(tokenizer=argv_dict[sys.argv[1]],
                                doc=review,
                                twi_norm=twitter_option[0],
                                twi_stem=twitter_option[1])
            ma_list.append(ma_tokens)
        save_jsonl(output_file_path, app_id_list, app_name_list, cate_list,
                   rating_list, ma_list)
def get_tokenizer(name: str, pos: bool = False):
    if name == "komoran":
        komoran = Komoran()
        if pos:
            tokenizer = komoran.pos
        else:
            tokenizer = komoran.morphs
    elif name == "okt":
        okt = Okt()
        if pos:
            tokenizer = okt.pos
        else:
            tokenizer = okt.morphs
    elif name == "mecab":
        mecab = Mecab()
        if pos:
            tokenizer = mecab.pos
        else:
            tokenizer = mecab.morphs
    elif name == "hannanum":
        hannanum = Hannanum()
        if pos:
            tokenizer = hannanum.pos
        else:
            tokenizer = hannanum.morphs
    elif name == "kkma":
        kkma = Kkma()
        if pos:
            tokenizer = kkma.pos
        else:
            tokenizer = kkma.morphs
    else:
        tokenizer = lambda x: x.split()
    return tokenizer
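# A minimal usage sketch for get_tokenizer above. It assumes konlpy (and its
# Java dependency) is installed; the "mecab" branch additionally needs a native
# MeCab installation. The sample sentence is illustrative only.
tokenize = get_tokenizer("komoran")           # morpheme tokenizer
print(tokenize("아버지가 방에 들어가신다"))

tag = get_tokenizer("komoran", pos=True)      # (morpheme, POS) pairs
print(tag("아버지가 방에 들어가신다"))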
def __init__(self, tagger=None):
    if tagger:
        self.tagger = tagger
    else:
        from konlpy.tag import Komoran
        self.tagger = Komoran()
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
def __init__(self, pos_tagger_name='mecab', mecab_path='', exceptional_stop_pos=[], lang='ko', stopwords=[]):
    self.stop_pos = STOP_POS[lang]
    # print(self.stop_pos)
    if pos_tagger_name == 'mecab':
        from konlpy.tag import Mecab
        self.pos_tagger = Mecab(mecab_path)
    elif pos_tagger_name == 'komoran':
        from konlpy.tag import Komoran
        self.pos_tagger = Komoran()
    elif pos_tagger_name == 'nltk':
        self.pos_tagger = None
    else:
        from konlpy.tag import Okt
        self.pos_tagger = Okt()
    if exceptional_stop_pos:
        self.stop_pos = [x for x in self.stop_pos if x not in exceptional_stop_pos]
    self.stopwords = []
    if stopwords:
        self.stopwords = stopwords
    self.graph = nx.diamond_graph()
    self.graph.clear()  # the graph holds leftover nodes right after creation, so clear them
    self.tokens = []
def create_dict(post_list):
    dict_list = []
    for query in post_list:
        text = ""
        post_num = 0
        for post in query:
            # if the post we are looking at has any text content associated with it
            if (len(post) > 2):
                post_num += 1
                text += post[2]
        # extract nouns
        nouns = Komoran().nouns(text)
        count_dict = defaultdict(int)
        # count the occurrence of each noun
        for noun in nouns:
            count_dict[noun] += 1
        dict_list.append(count_dict)
    # for each noun, if that noun appeared in fewer than 10% of the posts, delete it
    for i in range(len(dict_list)):
        for item in list(dict_list[i]):
            if dict_list[i][item] < post_num / 10:
                del dict_list[i][item]
    return dict_list
def tokenize(txt):
    tokens = Komoran().morphs(txt)
    hangul = re.compile('[^\uac00-\ud7a3]+')
    stpwrds = load_ko_stopwords("ko_stopwords.txt")
    tokens = [hangul.sub('', i) for i in tokens]
    tokens = [i for i in tokens if len(i) > 0 and i not in stpwrds]
    return tokens
def load_w2v(section, target, max_n, season='all'):
    model = word2vec.Word2Vec.load(f'{save_dir}/{section}_{season}.model')
    li = model.wv.most_similar(positive=[target], topn=max_n)
    komoran = Komoran()
    word_list = []
    dist_list = []
    for word, dist in li:
        temp = [tt[1] for tt in komoran.pos(word)]
        if len(set(temp).intersection(["NNG", "NNP"])) != 0:
            word_list.append(word)
            dist_list.append(dist)
    # save the nearest words and their distances as csv
    df = pd.DataFrame({'word': word_list, 'dist': dist_list})
    df.to_csv(f'{save_dir}/{section}_{target}_{season}.csv', encoding='ms949')
    # save as a figure
    word_list.append('미세먼지')
    x = model[word_list]
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2000)
    x_tsne = tsne.fit_transform(x)
    df2 = pd.DataFrame(x_tsne, index=word_list, columns=['x', 'y'])
    plt.figure(figsize=(16, 9))
    plt.scatter(df2['x'], df2['y'])
    for word, pos in df2.iterrows():
        if word == '미세먼지':
            plt.annotate(word, pos, color='red')
        else:
            plt.annotate(word, pos, va='bottom')
    plt.savefig(f'{save_dir}/{section}_{target}_{season}.png')
    plt.close()
def evaluate_cli(model, context_embeddings_op, elmo_context, elmo_ids):
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    if FLAGS.task == "NER":
        with open(FLAGS.necessary, "rb") as f:
            word_to_id, id_to_word, char_to_id, id_to_char, pumsa_to_id, id_to_pumsa, tag_to_id, id_to_tag, ner_morph_tag = pickle.load(f)
        komoran = Komoran()
        results = []
        while True:
            # line = input("문장을 입력하세요.:")
            line = [
                "찬민이의 멘탈이 산산조각났습니다.",
                "진짜 진짜 진짜 맛있는 진짜 라면",
                "집에 가고 싶읍니다.",
                "집",
                "가 가 가 가 가 가, 가, 가 ,가, 가 가 가 가 가 가, 가, 가 ,가 !!!!! ."
            ]
            for idx in range(0, len(line), 5):
                l = line[idx:idx + 2]
                results.extend(
                    model.evaluate_lines(
                        sess, context_embeddings_op, elmo_context, elmo_ids,
                        ner_morph_tag,
                        inputs_from_sentences(komoran, l, word_to_id, pumsa_to_id,
                                              char_to_id, elmo_dict,
                                              FLAGS.max_char_length, ner_morph_tag),
                        id_to_tag))
            print(results)
def __init__(self, keyword, channel, startDate, endDate, nlpEngine, nUrl):
    self.keyword = keyword
    self.channel = channel
    self.startDate = startDate
    self.endDate = endDate
    self.nlpEngine = nlpEngine
    self.nUrl = nUrl
    if nlpEngine == "Okt":
        self.konlpy = Okt()
    elif nlpEngine == "Komoran":
        self.konlpy = Komoran()
    elif nlpEngine == "Kkma":
        self.konlpy = Kkma()
    elif nlpEngine == "Hannanum":
        self.konlpy = Hannanum()
    elif nlpEngine == "Mecab":
        self.konlpy = Mecab()
    elif nlpEngine == "Twitter":
        self.konlpy = Twitter()
    else:
        self.konlpy = Okt()
def noun_tokenizer(df_col_name, preprocessed_df):
    from konlpy.tag import Komoran
    komoran = Komoran()
    globals()["noun_" + str(df_col_name)] = []
    for words in preprocessed_df:
        globals()["noun_" + str(df_col_name)].append(komoran.nouns(words))
    print(globals()["noun_" + str(df_col_name)])
def konlpykomo(inputSentence: str, sentenceList: list) -> dict:
    komo = Komoran()
    sentenceDict = dict()
    inputPos = komo.pos(inputSentence)
    inputPosCount = Counter(inputPos)
    inputLen = len(inputPosCount)
    for line in sentenceList:
        if line == '':
            continue
        sentencePos = komo.pos(line)
        sentencePosCount = Counter(sentencePos)
        sentenceLen = len(sentencePosCount)
        if sentenceLen > inputLen:
            common = 0
            for morpheme in inputPosCount:
                if morpheme in sentencePosCount:
                    common += min(inputPosCount[morpheme], sentencePosCount[morpheme])
            similarity = 100 * common / inputLen
            sentenceDict[line] = similarity
        else:
            common = 0
            for morpheme in inputPosCount:
                if morpheme in sentencePosCount:
                    common += min(inputPosCount[morpheme], sentencePosCount[morpheme])
            similarity = 100 * common / sentenceLen
            sentenceDict[line] = similarity
    return sentenceDict
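# A short example call for konlpykomo above (a sketch with made-up sentences).
# The score is roughly the percentage of shared (morpheme, POS) pairs,
# normalised by whichever of the two sentences has fewer distinct pairs.
candidates = ["오늘 날씨가 참 좋다", "내일은 비가 온다", ""]
scores = konlpykomo("오늘 날씨가 정말 좋다", candidates)
for sent, sim in sorted(scores.items(), key=lambda x: x[1], reverse=True):
    print(round(sim, 1), sent)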
def parse(text):
    """
    Returns a list of nouns parsed from [text] using the Komoran parser
    """
    komoran = Komoran()
    nouns = komoran.nouns(text)
    return nouns
def preparation(self, flag):
    self.flag = flag
    data = pd.read_csv('./something.csv', encoding='utf-8')  # csv data load
    data_h = pd.read_csv('./something_h.csv', encoding='utf-8')  # csv data load
    print(data.head())  # show data top 5
    # print(data.shape)
    # print(data_h.shape)
    data_list = data.values.tolist()  # pandas to list
    data_list_h = data_h.values.tolist()  # pandas to list
    # choose the tokenizer
    # flag = 1
    if flag == 0:
        tagger = Komoran()
    elif flag == 1:
        tagger = Mecab()
    x_data = data['content']  # assign data with column
    y_data = data['intent']  # assign data with column
    x_data_h = data_h['content']  # assign data with column
    y_data_h = data_h['intent']  # assign data with column
    return x_data, x_data_h, y_data, y_data_h, tagger
def control(input_msg):
    tagger = Komoran()
    dataset = Dataset('nsmc/ratings.txt', tagger, max_length=MAX_LENGTH, batch_size=BATCH_SIZE)
    Z_DIM = 40
    H_DIM = 300
    C_DIM = 2
    model = RNN_VAE(dataset.num_words, H_DIM, Z_DIM, C_DIM,
                    freeze_embeddings=False, gpu=USE_CUDA, gpu_id=GPU_ID)
    test_data = torch.LongTensor(
        dataset.sentence2idxs(tagger.morphs(input_msg))).unsqueeze(1)
    model.load_state_dict(torch.load('models/vae_epoch_300_400.bin'))
    results = model.controlSentence(test_data, t=0.5)
    return (dataset.idxs2sentence(results[0], no_pad=True),
            dataset.idxs2sentence(results[1], no_pad=True))
def words_check(request):
    # initialize the required libraries and variables
    data = request.POST.get('data')
    komoran = Komoran()
    words = Counter(komoran.nouns(data))
    # filter out single-character words
    nouns = dict()
    for data in words.keys():
        if len(data) != 1:
            nouns[data] = words.get(data)
    nouns = sorted(nouns.items(), key=lambda x: x[1], reverse=True)
    hashing = random.choice(range(100))
    context = {
        'nouns': nouns,
        'hashing': hashing,
    }
    # word cloud
    taglist = pytagcloud.make_tags(nouns, minsize=10, maxsize=60)
    link = 'static/wordcloud/wordcloud' + str(hashing) + '.jpg'
    # link = 'static/wordcloud/wordcloud.jpg'
    pytagcloud.create_tag_image(taglist, link, size=(600, 600), layout=3,
                                fontname='CookieRun', rectangular=True)
    return HttpResponse(json.dumps(context), content_type='application/json')
def main(base_path, pkl_lst):
    DataFrame = preprocess(base_path, pkl_lst)
    print('Spacing the document...')
    DataFrame = multicore_cpu(DataFrame, spacing_doc, n_cores=args.cpu_core, spell=False)
    print('Spell checking...')
    checked_data = multicore_cpu(DataFrame, spell_check, n_cores=args.cpu_core, spell=True)
    checked_data.reset_index(drop=True, inplace=True)
    # tokenizing
    print('Tokenizing the document...')
    komoran = Komoran(userdic=args.token_dict)
    checked_data['tokenized_contents'] = checked_data['contents'].apply(
        lambda x: komoran.morphs(x))
    # filter documents
    checked_data['doc_length'] = checked_data['tokenized_contents'].apply(
        lambda x: len(x))
    final_data = checked_data.loc[checked_data['doc_length'] > args.token_cnt]
    final_data.reset_index(drop=True, inplace=True)
    # save the output data
    os.makedirs(args.save_path, exist_ok=True)
    with open(os.path.join(args.save_path, 'preprocessed_data.pickle'), 'wb') as f:
        pickle.dump(final_data, f)
def main(corpora, output):
    filelist = os.listdir(corpora)
    tagger_stan = Komoran()
    tagger_jeju = Komoran(userdic='userdic.txt')  # TODO: If not userdic
    for file in filelist:
        book = openpyxl.load_workbook(os.path.join(corpora, file))
        sheet = book.get_sheet_by_name("Sheet")
        tagged = (bool(sheet.cell(row=1, column=3).value)
                  and bool(sheet.cell(row=1, column=4).value))
        if not tagged:
            for sample in sheet.rows:
                index = sample[0].row
                try:
                    stan = sample[0].value
                    pos_stan = ' '.join(tagger_stan.morphs(stan))
                    jeju = sample[1].value
                    pos_jeju = ' '.join(tagger_jeju.morphs(jeju))
                except:
                    continue
                else:
                    sheet.cell(row=index, column=3).value = pos_stan
                    sheet.cell(row=index, column=4).value = pos_jeju
            book.save(os.path.join(corpora, file))
        filename = file[:file.find('.')]
        if not os.path.exists(output):
            os.makedirs(output)
        output_dir = os.path.join(output, filename + '.txt')  # Exception: output dir may not exist
        output_file = open(output_dir, 'w')
        for sample in sheet.rows:
            try:
                line = '\t'.join([s.value for s in sample[:5]]) + '\n'  # Exception: s.value may not be a string
            except TypeError:
                continue
            else:
                output_file.write(line)
        output_file.close()
        book.close()
def parse_sentence_pos(line):
    komoran = Komoran()
    idx, raw, label = line.split('\t')
    pos = ""
    for elem in komoran.pos(line):
        pos += elem[0] + '/' + elem[1] + '|'
    pos = pos[:-1]  # strip the trailing '|' separator
    return idx, pos, raw, label
def __init__(self, file_path, tagger=None):
    if tagger:
        self.tagger = tagger
    else:
        from konlpy.tag import Komoran
        self.tagger = Komoran(userdic='./text_rank/dic.txt')
    self.file_path = file_path
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
def __init__(self, textIter, tagger=None):
    if tagger:
        self.tagger = tagger
    else:
        self.tagger = Komoran()
    if type(textIter) == str:
        self.textIter = textIter.split('\n')
    else:
        self.textIter = textIter
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')