Example #1
 def __init__(self, userdic=None):
     from konlpy.tag import Komoran
     import os
     if userdic is not None:
         print("user dict " + str(os.path.abspath(userdic)))
         self.inst = Komoran(userdic=os.path.abspath(userdic))
     else:
         self.inst = Komoran()
     self.OUT_TYPE = [list, tuple]
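A minimal usage sketch for the wrapper above. The enclosing class name is not shown in the snippet, so KomoranWrapper below is a placeholder, and the user dictionary is assumed to follow KoNLPy's tab-separated "term<TAB>POS" format:

# user_dic.txt (hypothetical, tab-separated):
# 미세먼지	NNP
wrapper = KomoranWrapper(userdic='user_dic.txt')    # class name is a placeholder
print(wrapper.inst.pos('미세먼지 농도가 높다'))     # e.g. [('미세먼지', 'NNP'), ('농도', 'NNG'), ...]
print(wrapper.inst.nouns('미세먼지 농도가 높다'))   # e.g. ['미세먼지', '농도']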
Example #2
File: views.py Project: exid0429/apactory
 def __init__(self, filepath, tagger=None):
     if tagger:
         self.tagger = tagger
     else:
         self.tagger = Komoran()
     self.filepath = filepath
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
Example #3
def main(unused_argv):

    if len(sys.argv) == 1:
        flags._global_parser.print_help()
        sys.exit(0)

    # Loading model
    m = model.load_model(FLAGS.dragnn_spec,
                         FLAGS.resource_path,
                         FLAGS.checkpoint_filename,
                         enable_tracing=FLAGS.enable_tracing,
                         tf_master=FLAGS.tf_master)
    sess = m['session']
    graph = m['graph']
    builder = m['builder']
    annotator = m['annotator']

    # Analyze
    # Prepare korean morphological analyzer for segmentation
    from konlpy.tag import Komoran
    komoran = Komoran()
    startTime = time.time()
    while 1:
        try:
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break
        if not line: break
        line = line.strip()
        if not line: continue
        segmented, tagged = model.segment_by_konlpy(line, komoran)
        # ex) line = '제주 로 가다 는 비행기 가 심하다 는 비바람 에 회항 하 었 다 .'
        line = ' '.join(segmented)
        parsed_sentence = model.inference(sess, graph, builder, annotator,
                                          line, FLAGS.enable_tracing)
        out = model.parse_to_conll(parsed_sentence, tagged)
        f = sys.stdout
        f.write('# text = ' + line.encode('utf-8') + '\n')
        for entry in out['conll']:
            id = entry['id']
            form = entry['form']
            lemma = entry['lemma']
            upostag = entry['upostag']
            xpostag = entry['xpostag']
            feats = entry['feats']
            head = entry['head']
            deprel = entry['deprel']
            deps = entry['deps']
            misc = entry['misc']
            li = [
                id, form, lemma, upostag, xpostag, feats, head, deprel, deps,
                misc
            ]
            f.write('\t'.join([str(e) for e in li]) + '\n')
        f.write('\n\n')
    durationTime = time.time() - startTime
    sys.stderr.write("duration time = %f\n" % durationTime)

    # Unloading model
    model.unload_model(m)
Example #4
 def __init__(self):
     self.komoran = Komoran()
     self.kkma = Kkma()
     self.hann = Hannanum()
     self.mecab = Mecab()
     self.twitter = Twitter()
     self.okt = Okt()
Example #5
def make_news_contents(search_word, news_links):
    komoran = Komoran()
    news_contents = []
    stopwords = ['하', '있', '없', '되', '보']

    for news_link in news_links:
        article = Article(news_link, language='ko')
        try:
            article.download()
            article.parse()
            content = article.text
        except ArticleException:
            continue

        news_content = ""
        word_tag = komoran.pos(content)

        for word, morph in word_tag:
            if word not in stopwords and word not in search_word:
                if morph in ['VA', 'VV']:
                    news_content += (word + '다' + ' ')
                elif morph in ['NNP', 'NNG', 'NP'] and len(word) > 1:
                    news_content += (word + ' ')

        news_contents.append(news_content)

    return news_contents
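A short usage sketch for make_news_contents, assuming newspaper3k's Article and ArticleException are imported as in the snippet; the search word and URL are placeholders:

news_links = ['https://example.com/news/12345']        # placeholder URL
contents = make_news_contents('미세먼지', news_links)
print(contents)  # one space-joined string of filtered nouns and verbs per successfully parsed article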
Example #6
def sentences_komoran(filelist):
    komoran = Komoran()
    sentences = []
    for i, file in enumerate(filelist):
        with open(file, 'r', encoding='utf-8') as fp:
            while True:
                try:
                    line = fp.readline()
                    if not line: 
                        break

                    line = re.sub("\xa0", " ", line).strip()
                    if line == "" : 
                        continue

                    tokens = komoran.nouns(line)
                    if len(tokens) == 0: 
                        continue

                    sentences.append(tokens)

                except Exception as e:
                    print(e)
                    continue
    return sentences
Example #7
	def __init__(self):
		settings = dict(
			static_path = os.path.join(os.path.dirname(__file__), 'static'),
			template_path = os.path.join(os.path.dirname(__file__), 'templates'),
			autoescape = None,
			debug = options.debug,
			gzip = True
		)

		handlers = [
			(r'/', IndexHandler),
			(r'/_hcheck.hdn', HCheckHandler),
			(r'/dragnn', DragnnHandler),
			(r'/dragnntest', DragnnTestHandler),
		]

		tornado.web.Application.__init__(self, handlers, **settings)
		autoreload.add_reload_hook(self.finalize)

		self.log = setupAppLogger()
		ppid = os.getpid()
		self.log.info('initialize parent process[%s] ...' % (ppid))
		self.ppid = ppid
		self.enable_tracing = options.enable_tracing
		# import konlpy if enabled
		self.enable_konlpy = options.enable_konlpy
		self.komoran = None
		if options.enable_konlpy :
			from konlpy.tag import Komoran
			komoran = Komoran()
			self.komoran = komoran
		self.log.info('initialize parent process[%s] ... done' % (ppid))

		log.info('starting http server ...')
Example #8
 def __init__(self):
     self.mongodb = MongoDBHandler()
     self.komoran = Komoran()
     self.rdate = datetime.today().strftime("%Y%m%d")
     self.articles = []
     self.font_path = '../DownloadLib/NanumFont_TTF_ALL/NanumGothic.ttf'
     self.code_list = []
Example #9
def accuracy(name):
    global similarty  # 'similarty' is assumed to be a module-level list filled by Jaccard_similarty()
    result = []
    if name == 'kma':
        mode = Kkma()
    elif name == 'okt':
        mode = Okt()
    elif name == 'komoran':
        mode = Komoran()
    else:
        return 0

    mylin = input("문장을 입력해 주세요: ")

    print("형태소분석기", name, "정확도 분석을 시작합니다. ")
    print('\n')
    acc = mode.morphs(mylin)  # morphological analysis of the input sentence
    for sentence in texts:
        arr.append(sentence)
        sp_text = mode.morphs(sentence)  # analyze each reference sentence line by line
        Jaccard_similarty(acc, sp_text)  # compute the Jaccard similarity

    n = 5
    # indices of the n most similar sentences, sorted by similarity
    Sortsimilarty = sorted(range(len(similarty)), key=lambda i: similarty[i], reverse=True)[:n]

    k = 0
    for i in Sortsimilarty:
        k = k + 1
        print(k, "번째로 유사도가 높은 문장입니다. : ", arr[i], "유사도는 다음과 같습니다. : ", similarty[i])

    print('\n')
    Sortsimilarty = []
    similarty = []
Example #10
    def __init__(self, args, konlpy_type='mecab', verbose=True):
        # Token setting
        self.pad_idx = args.pad_idx
        self.bos_idx = args.bos_idx
        self.eos_idx = args.eos_idx
        self.unk_idx = args.unk_idx
        self.sep_idx = args.sep_idx

        # Path setting
        self.save_path = args.save_path

        # Training setting
        self.vocab_size = args.vocab_size
        self.verbose = verbose

        # API setting
        if konlpy_type.lower() == 'mecab':
            self.api = Mecab()
        elif konlpy_type.lower() == 'okt':
            self.api = Okt()
        elif konlpy_type.lower() == 'hannanum':
            self.api = Hannanum()
        elif konlpy_type.lower() == 'kkma':
            self.api = Kkma()
        elif konlpy_type.lower() == 'komoran':
            self.api = Komoran()
        else:
            raise Exception('Not supported konlpy parser')
Example #11
def main():
    twitter = Twitter()
    komoran = Komoran()
    mecab = Mecab()

    argv_dict = {'twitter': twitter, 'komoran': komoran, 'mecab': mecab}

    if len(sys.argv) < 2:
        print('please insert sys_argv \n',
              '1) tokenizer selection: twitter, komoran, mecab \n',
              '2) twitter_tokenizer_option: norm [no argv means True] \n',
              '3) twitter_tokenizer_option: stemming [no argv means True]')
    else:
        output_file_path = input('output_text_file_name: ')
        output_file_path = './data/' + output_file_path
        input_file_path = input('input_text_file_name: ')
        input_file_path = './data/' + input_file_path
        # bool('False') would be True, so parse the optional flags explicitly
        twitter_option = ([sys.argv[2].lower() == 'true',
                           sys.argv[3].lower() == 'true']
                          if len(sys.argv) == 4 else [True, True])
        app_id_list, app_name_list, cate_list, rating_list, review_list = read_jsonl(
            input_file_path, key_ma=False)

        ma_list = []
        for review in tqdm(review_list,
                           desc='tokenizing',
                           total=len(review_list)):
            ma_tokens = get_pos(tokenizer=argv_dict[sys.argv[1]],
                                doc=review,
                                twi_norm=twitter_option[0],
                                twi_stem=twitter_option[1])
            ma_list.append(ma_tokens)

        save_jsonl(output_file_path, app_id_list, app_name_list, cate_list,
                   rating_list, ma_list)
Example #12
def get_tokenizer(name: str, pos: bool=False):
    if name == "komoran":
        komoran = Komoran()
        if pos:
            tokenizer = komoran.pos
        else:
            tokenizer = komoran.morphs
    elif name == "okt":
        okt = Okt()
        if pos:
            tokenizer = okt.pos
        else:
            tokenizer = okt.morphs
    elif name == "mecab":
        mecab = Mecab()
        if pos:
            tokenizer = mecab.pos
        else:
            tokenizer = mecab.morphs
    elif name == "hannanum":
        hannanum = Hannanum()
        if pos:
            tokenizer = hannanum.pos
        else:
            tokenizer = hannanum.morphs
    elif name == "kkma":
        kkma = Kkma()
        if pos:
            tokenizer = kkma.pos
        else:
            tokenizer = kkma.morphs
    else:
        tokenizer = lambda x : x.split()
    return tokenizer
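A short usage example for get_tokenizer; any name other than the five listed falls back to whitespace splitting:

tokenize = get_tokenizer('komoran', pos=True)
print(tokenize('자연어 처리는 재미있다'))   # e.g. [('자연어', 'NNP'), ('처리', 'NNG'), ...]

fallback = get_tokenizer('whitespace')
print(fallback('just split on spaces'))     # ['just', 'split', 'on', 'spaces']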
Example #13
 def __init__(self, tagger=None):
     if tagger:
         self.tagger = tagger
     else:
         from konlpy.tag import Komoran
         self.tagger = Komoran()
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
Example #14
    def __init__(self,
                 pos_tagger_name='mecab',
                 mecab_path='',
                 exceptional_stop_pos=[],
                 lang='ko',
                 stopwords=[]):
        self.stop_pos = STOP_POS[lang]
        # print(self.stop_pos)

        if pos_tagger_name == 'mecab':
            from konlpy.tag import Mecab
            self.pos_tagger = Mecab(mecab_path)
        elif pos_tagger_name == 'komoran':
            from konlpy.tag import Komoran
            self.pos_tagger = Komoran()
        elif pos_tagger_name == 'nltk':
            self.pos_tagger = None
        else:
            from konlpy.tag import Okt
            self.pos_tagger = Okt()

        if exceptional_stop_pos:  # do not treat these POS tags as stop POS
            self.stop_pos = [
                x for x in self.stop_pos if x not in exceptional_stop_pos
            ]

        self.stopwords = []
        if stopwords:
            self.stopwords = stopwords

        self.graph = nx.diamond_graph()
        self.graph.clear()  # remove the garbage nodes left over from graph creation

        self.tokens = []
Example #15
def create_dict(post_list):
    dict_list = []
    post_nums = []  # number of text posts per query, used for the threshold below
    for query in post_list:
        text = ""
        post_num = 0
        for post in query:
            # if the post we are looking at has any text content associated with it
            if len(post) > 2:
                post_num += 1
                text += post[2]
        # extract nouns
        nouns = Komoran().nouns(text)
        count_dict = defaultdict(int)

        # count the occurrence of each noun
        for noun in nouns:
            count_dict[noun] += 1
        dict_list.append(count_dict)
        post_nums.append(post_num)

    # for each noun, delete it if it appeared in fewer than 10% of that query's posts
    for i in range(len(dict_list)):
        for item in list(dict_list[i]):
            if dict_list[i][item] < post_nums[i] / 10:
                del dict_list[i][item]
    return dict_list
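A usage sketch for create_dict; each post is assumed to be a sequence whose third element holds the post text (the tuples below are hypothetical):

posts_for_query = [
    ('id1', '2020-01-01', '미세먼지 농도가 높아 마스크를 썼다'),
    ('id2', '2020-01-02', '미세먼지 때문에 외출을 줄였다'),
]
result = create_dict([posts_for_query])
print(result[0])  # noun -> count, after dropping nouns below the 10% threshold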
Example #16
File: ko_nlpy.py Project: arkainoh/nlpu
def tokenize(txt):
  tokens = Komoran().morphs(txt)
  hangul = re.compile('[^\uac00-\ud7a3]+')
  stpwrds = load_ko_stopwords("ko_stopwords.txt")
  tokens = [hangul.sub('', i) for i in tokens]
  tokens = [i for i in tokens if len(i) > 0 and i not in stpwrds]
  return tokens
Example #17
def load_w2v(section, target, max_n, season='all'):
    model = word2vec.Word2Vec.load(f'{save_dir}/{section}_{season}.model')
    li = model.wv.most_similar(positive=[target], topn=max_n)
    komoran = Komoran()
    word_list = []
    dist_list = []
    for word, dist in li:
        temp = [tt[1] for tt in komoran.pos(word)]
        if len(set(temp).intersection(["NNG", "NNP"])) != 0:
            word_list.append(word)
            dist_list.append(dist)

    # save the nearest words and their distances to a CSV file
    df = pd.DataFrame({'word': word_list, 'dist': dist_list})
    df.to_csv(f'{save_dir}/{section}_{target}_{season}.csv', encoding='ms949')

    # save a t-SNE scatter plot as an image
    word_list.append('미세먼지')
    x = model[word_list]
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2000)
    x_tsne = tsne.fit_transform(x)
    df2 = pd.DataFrame(x_tsne, index=word_list, columns=['x', 'y'])

    plt.figure(figsize=(16, 9))
    plt.scatter(df2['x'], df2['y'])
    for word, pos in df2.iterrows():
        if word == '미세먼지':
            plt.annotate(word, pos, color='red')
        else:
            plt.annotate(word, pos, va='bottom')
    plt.savefig(f'{save_dir}/{section}_{target}_{season}.png')
    plt.close()
Example #18
def evaluate_cli(model, context_embeddings_op, elmo_context, elmo_ids):
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)

    if FLAGS.task == "NER":
        with open(FLAGS.necessary, "rb") as f:
            word_to_id, id_to_word, char_to_id, id_to_char, pumsa_to_id, id_to_pumsa, tag_to_id, id_to_tag, ner_morph_tag = pickle.load(
                f)

    komoran = Komoran()
    results = []
    while True:
        # line = input("문장을 입력하세요.:")
        line = [
            "찬민이의 멘탈이 산산조각났습니다.", "진짜 진짜 진짜 맛있는 진짜 라면", "집에 가고 싶읍니다.", "집",
            "가 가 가 가 가 가, 가, 가 ,가, 가 가 가 가 가 가, 가, 가 ,가 !!!!! ."
        ]
        for idx in range(0, len(line), 5):
            l = line[idx:idx + 2]
            results.extend(
                model.evaluate_lines(
                    sess, context_embeddings_op, elmo_context, elmo_ids,
                    ner_morph_tag,
                    inputs_from_sentences(komoran, l, word_to_id, pumsa_to_id,
                                          char_to_id, elmo_dict,
                                          FLAGS.max_char_length,
                                          ner_morph_tag), id_to_tag))
        print(results)
Example #19
    def __init__(self, keyword, channel, startDate, endDate, nlpEngine, nUrl):

        self.keyword = keyword

        self.channel = channel
        self.startDate = startDate
        self.endDate = endDate

        self.nlpEngine = nlpEngine

        self.nUrl = nUrl

        if nlpEngine == "Okt":
            self.konlpy = Okt()
        elif nlpEngine == "Komoran":
            self.konlpy = Komoran()
        elif nlpEngine == "Kkma":
            self.konlpy = Kkma()
        elif nlpEngine == "Hannanum":
            self.konlpy = Hannanum()
        elif nlpEngine == "Mecab":
            self.konlpy = Mecab()
        elif nlpEngine == "Twitter":
            self.konlpy = Twitter()
        else:
            self.konlpy = Okt()
Example #20
def noun_tokenizer(df_col_name, preprocessed_df):
    from konlpy.tag import Komoran
    komoran = Komoran()
    globals()["noun_" + str(df_col_name)] = []
    for words in preprocessed_df:
        globals()["noun_" + str(df_col_name)].append(komoran.nouns(words))
    print(globals()["noun_" + str(df_col_name)])
Example #21
def konlpykomo(inputSentence: str, sentenceList: list) -> dict:
    komo = Komoran()
    sentenceDict = dict()

    inputPos = komo.pos(inputSentence)
    inputPosCount = Counter(inputPos)
    inputLen = len(inputPosCount)

    for line in sentenceList:
        if line == '':
            continue
        sentencePos = komo.pos(line)
        sentencePosCount = Counter(sentencePos)
        sentenceLen = len(sentencePosCount)

        # normalize by the smaller of the two POS-type counts
        denominator = inputLen if sentenceLen > inputLen else sentenceLen

        # count how many POS-tagged morphemes the two sentences share
        common = 0
        for morpheme in inputPosCount:
            if morpheme in sentencePosCount:
                common += min(inputPosCount[morpheme], sentencePosCount[morpheme])

        if common:
            sentenceDict[line] = 100 * common / denominator

    return sentenceDict
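A usage sketch for konlpykomo; it scores each non-empty candidate by its overlap with the input's POS-tagged morphemes (the candidate sentences below are placeholders):

candidates = ['제주로 가는 비행기가 회항했다', '오늘 날씨가 맑다', '']
scores = konlpykomo('제주로 가는 비행기', candidates)
for sent, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print('%5.1f  %s' % (score, sent))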
Example #22
def parse(text):
    """
    Returns a list of parsed nouns from [text] using Komoran parser
    """
    komoran = Komoran()
    nouns = komoran.nouns(text)
    return nouns
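A one-line usage example for parse:

print(parse('미세먼지 농도가 높다'))  # e.g. ['미세먼지', '농도']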
Example #23
    def preparation(self, flag):
        self.flag = flag
        data = pd.read_csv('./something.csv', encoding='utf-8')  #csv data load
        data_h = pd.read_csv('./something_h.csv',
                             encoding='utf-8')  #csv data load
        print(data.head())  #show data top 5
        # print(data.shape)
        # print(data_h.shape)

        data_list = data.values.tolist()  #pandas to list
        data_list_h = data_h.values.tolist()  #pandas to list

        # choose the tokenizer
        # flag = 1
        if flag == 0:
            tagger = Komoran()
        elif flag == 1:
            tagger = Mecab()
        else:
            raise ValueError('flag must be 0 (Komoran) or 1 (Mecab)')

        x_data = data['content']  #assign data with column
        y_data = data['intent']  #assign data with column
        x_data_h = data_h['content']  #assign data with column
        y_data_h = data_h['intent']  #assign data with column

        return x_data, x_data_h, y_data, y_data_h, tagger
Example #24
File: evaluate.py Project: KNU-NLPlab/VTT
def control(input_msg):
    tagger = Komoran()

    dataset = Dataset('nsmc/ratings.txt',
                      tagger,
                      max_length=MAX_LENGTH,
                      batch_size=BATCH_SIZE)

    Z_DIM = 40
    H_DIM = 300
    C_DIM = 2

    model = RNN_VAE(dataset.num_words,
                    H_DIM,
                    Z_DIM,
                    C_DIM,
                    freeze_embeddings=False,
                    gpu=USE_CUDA,
                    gpu_id=GPU_ID)

    test_data = torch.LongTensor(
        dataset.sentence2idxs(tagger.morphs(input_msg))).unsqueeze(1)

    model.load_state_dict(torch.load('models/vae_epoch_300_400.bin'))
    results = model.controlSentence(test_data, t=0.5)

    return (dataset.idxs2sentence(results[0], no_pad=True),
            dataset.idxs2sentence(results[1], no_pad=True))
Example #25
def words_check(request):

    # initialize required libraries and variables
    data = request.POST.get('data')
    komoran = Komoran()
    words = Counter(komoran.nouns(data))
    # filter out single-character words
    nouns = dict()
    for data in words.keys():
        if len(data) != 1:
            nouns[data] = words.get(data)
    nouns = sorted(nouns.items(), key=lambda x: x[1], reverse=True)
    hashing = random.choice(range(100))
    context = {
        'nouns': nouns,
        'hashing': hashing,
    }
    # word cloud
    taglist = pytagcloud.make_tags(nouns, minsize=10, maxsize=60)
    link = 'static/wordcloud/wordcloud' + str(hashing) + '.jpg'
    #link = 'static/wordcloud/wordcloud.jpg'
    pytagcloud.create_tag_image(taglist,
                                link,
                                size=(600, 600),
                                layout=3,
                                fontname='CookieRun',
                                rectangular=True)

    return HttpResponse(json.dumps(context), content_type='application/json')
Example #26
def main(base_path, pkl_lst):
    DataFrame = preprocess(base_path, pkl_lst)
    print('Spacing the document...')
    DataFrame = multicore_cpu(DataFrame,
                              spacing_doc,
                              n_cores=args.cpu_core,
                              spell=False)
    print('Spell checking...')
    checked_data = multicore_cpu(DataFrame,
                                 spell_check,
                                 n_cores=args.cpu_core,
                                 spell=True)
    checked_data.reset_index(drop=True, inplace=True)

    # tokenizing
    print('Tokenizing the document...')
    komoran = Komoran(userdic=args.token_dict)
    checked_data['tokenized_contents'] = checked_data['contents'].apply(
        lambda x: komoran.morphs(x))

    # filter documents
    checked_data['doc_length'] = checked_data['tokenized_contents'].apply(
        lambda x: len(x))
    final_data = checked_data.loc[checked_data['doc_length'] > args.token_cnt]
    final_data.reset_index(drop=True, inplace=True)

    # save the output data
    os.makedirs(args.save_path, exist_ok=True)
    with open(os.path.join(args.save_path, 'preprocessed_data.pickle'),
              'wb') as f:
        pickle.dump(final_data, f)
Example #27
def main(corpora, output):
    filelist = os.listdir(corpora)
    tagger_stan = Komoran()
    tagger_jeju = Komoran(userdic='userdic.txt')  # TODO: If not userdic

    for file in filelist:
        book = openpyxl.load_workbook(os.path.join(corpora, file))
        sheet = book.get_sheet_by_name("Sheet")

        tagged = (bool(sheet.cell(row=1, column=3).value)
                  and bool(sheet.cell(row=1, column=4).value))

        if not tagged:
            for sample in sheet.rows:
                index = sample[0].row
                try:
                    stan = sample[0].value
                    pos_stan = ' '.join(tagger_stan.morphs(stan))
                    jeju = sample[1].value
                    pos_jeju = ' '.join(tagger_jeju.morphs(jeju))
                except:
                    continue
                else:
                    sheet.cell(row=index, column=3).value = pos_stan
                    sheet.cell(row=index, column=4).value = pos_jeju

            book.save(os.path.join(corpora, file))

        filename = file[:file.find('.')]
        if not os.path.exists(output):
            os.makedirs(output)
        output_dir = os.path.join(output, filename +
                                  '.txt')  # Exception: Output Dir not Exists
        output_file = open(output_dir, 'w')

        for sample in sheet.rows:
            try:
                line = '\t'.join([
                    s.value for s in sample[:5]
                ]) + '\n'  # Exception: s.value can be no string
            except TypeError:
                continue
            else:
                output_file.write(line)

        output_file.close()
        book.close()
Example #28
def parse_sentence_pos(line):
    komoran = Komoran()
    idx, raw, label = line.split('\t')
    pos = ""
    for elem in komoran.pos(raw):  # tag the raw sentence, not the whole tab-separated record
        pos += elem[0] + '/' + elem[1] + '|'
    pos = pos[:-1]  # drop the trailing '|'
    return idx, pos, raw, label
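A usage sketch for parse_sentence_pos, assuming NSMC-style tab-separated input of id, sentence, and label:

idx, pos, raw, label = parse_sentence_pos('1\t배우 연기가 좋았다\t1')
print(pos)  # e.g. 배우/NNG|연기/NNG|가/JKS|좋/VA|았/EP|다/EC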
Example #29
 def __init__(self, file_path, tagger=None):
     if tagger:
         self.tagger = tagger
     else:
         from konlpy.tag import Komoran
         self.tagger = Komoran(userdic='./text_rank/dic.txt')
     self.file_path = file_path
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
Example #30
File: views.py Project: exid0429/apactory
 def __init__(self, textIter, tagger=None):
     if tagger:
         self.tagger = tagger
     else:
         self.tagger = Komoran()
     if type(textIter) == str: self.textIter = textIter.split('\n')
     else: self.textIter = textIter
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')