Example No. 1
def tokenizer_test():
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    regex_tokenizer = RegexTokenizer()
    if not (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') 
            == ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']):
        raise ValueError("regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}".format(
            regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')))

    ltokenizer = LTokenizer({'데이터':0.4, '데이':0.35, '데이터센터':0.38})
    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이') 
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)
            == ['데이터', '는', '데이터센터', '의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)))

    maxscore_tokenizer = MaxScoreTokenizer({'데이터':0.4, '데이':0.35, '데이터센터':0.38})
    if not (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') 
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    print('all tokenizer tests passed\n\n')
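
The tolerance behaviour exercised above can be shown in isolation; a minimal sketch using the same scores as the test (the output comments mirror the expectations asserted above):

from soynlp.tokenizer import LTokenizer

scores = {'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38}
tokenizer = LTokenizer(scores)

# the highest-scoring prefix wins: ['데이터', '센터의']
print(tokenizer.tokenize('데이터센터의'))

# with tolerance=0.05 the longer prefix '데이터센터' (0.38) is kept,
# since it is within 0.05 of the best score (0.40): ['데이터센터', '의']
print(tokenizer.tokenize('데이터센터의', tolerance=0.05))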
Example No. 2
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer


def data_tokenize(news_title, tdm_vocab):

    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)

    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    tokenizer = LTokenizer(scores=cohesion_score)

    cluster_data = []
    bert_null_list = []

    for title in news_title:
        title = test(title)  # 'test' is assumed to be a title-cleaning helper defined elsewhere in the project
        sent = tokenizer.tokenize(title, flatten=False)
        sentence = []
        for i in sent:
            if i[0] in tdm_vocab:
                sentence.append(i[0])

        cluster_data.append(sentence)

    return cluster_data
Example No. 3
def prediction(text):
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_kor = open('pickles/kor.pickle', 'rb')
    kor = pickle.load(pickle_kor)
    pickle_eng = open('pickles/eng.pickle', 'rb')
    eng = pickle.load(pickle_eng)
    eos_idx = eng.vocab.stoi['<eos>']

    # select model and load trained model
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(text)
    indexed = [kor.vocab.stoi[token] for token in tokenized]


    source = torch.LongTensor(indexed).unsqueeze(0).to(params.device)  # [1, source_len]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)       # [1, max_len]

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    for i in range(0, params.max_len):
        if next_symbol == eos_idx:
            break
        target[0][i] = next_symbol
        decoder_output, _ = model.decoder(target, source, encoder_output)  # [1, target length, output dim]
        prob = decoder_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    # truncate the generated sequence at the first <eos> position, if one was produced
    eos_positions = torch.where(target[0] == eos_idx)[0]
    if len(eos_positions) > 0:
        target = target[0][:eos_positions[0].item()].unsqueeze(0)
    else:
        target = target[0].unsqueeze(0)

    # translation_tensor = [target length] filed with word indices
    target, attention_map = model(source, target)
    target = target.squeeze(0).max(dim=-1)[1]

    reply_token = [eng.vocab.itos[token] for token in target if token != 3]
    print(reply_token)
    #translation = translated_token[:translated_token.index('<eos>')]
    #translation = ''.join(translation)
    reply = ' '.join(reply_token)
    #print(reply)

    #display_attention(tokenized, reply_token, attention_map[4].squeeze(0)[:-1])
    return reply 
Example No. 4
class LTokenizerKorean(SpecialTokenizer):  # stem(L-part)-oriented tokenizer
    def __init__(self, scores=None):
        from soynlp.tokenizer import LTokenizer
        self.inst = LTokenizer(scores=scores)  # scores can be supplied as a preference; if omitted, cohesion scores are computed automatically
        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        tokens = self.inst.tokenize(args[0])
        return tokens
Example No. 5
class LTokenizerKorean(SpecialTokenizer):
    def __init__(self, scores=None):
        from soynlp.tokenizer import LTokenizer
        self.inst = LTokenizer(scores=scores)

        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        tokens = self.inst.tokenize(args[0])
        return tokens
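
A minimal usage sketch for the wrapper above, assuming the SpecialTokenizer base class is importable from the surrounding project; the scores and the expected output are illustrative:

tok = LTokenizerKorean(scores={'데이터': 0.4, '데이': 0.35})
print(tok('데이터는 크다'))  # e.g. ['데이터', '는', '크다']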
Example No. 6
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 10 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue

            if is_all_nng(mcab.pos(word)):
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue

            if len(soy_tokens) > 1:
                continue

            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))

            words = re.findall(' '.join(tokens), sentences)
            if len(words) < (cnt * 0.05):
                # if the split form's frequency is under 5% of the unsplit word's frequency, treat it as a morpheme-splitting error
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.writelines(dic_line + '\n')
                f3.writelines("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n')
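
The user-dictionary line built above depends on whether the last syllable has a final consonant (jongsung); a minimal illustration of that check, assuming hgtk is installed:

import hgtk

for word in ['서울', '데이터']:
    cho, jung, jong = hgtk.letter.decompose(word[-1])
    print(word, 'T' if 'ㄱ' <= jong <= 'ㅎ' else 'F')
# '서울' ends in 'ㄹ' -> T, '데이터' has no final consonant -> F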
Example No. 7
def predict(config):
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_kor = open('pickles/kor.pickle', 'rb')
    kor = pickle.load(pickle_kor)

    pickle_eng = open('pickles/eng.pickle', 'rb')
    eng = pickle.load(pickle_eng)

    # select model and load trained model
    model = Transformer(params)

    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    input = clean_text(config.input)

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(input)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source = torch.LongTensor(indexed).unsqueeze(0).to(
        params.device)  # [1, source length]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    for i in range(0, params.max_len):
        target[0][i] = next_symbol
        dec_output = model.decoder(target, source, encoder_output)
        # dec_output = [1, target length, output dim]
        prob = dec_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    # translation_tensor = [target length] filed with word indices
    target = model(source, target)
    target = torch.argmax(target.squeeze(0), -1)
    # target = target.squeeze(0).max(dim=-1, keepdim=False)
    translation = [eng.vocab.itos[token] for token in target][1:]

    translation = ' '.join(translation)
    print(f'kor> {config.input}')
    print(f'eng> {translation.capitalize()}')
Example No. 8
def tokenizer_test():
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    regex_tokenizer = RegexTokenizer()
    if not (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')
            == ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']):
        raise ValueError(
            "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}".format(
                regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')))

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError(
            "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
                ltokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)
            == ['데이터', '는', '데이터센터', '의', '데이', '데이']):
        raise ValueError(
            "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}".
            format(ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)))

    maxscore_tokenizer = MaxScoreTokenizer({
        '데이터': 0.4,
        '데이': 0.35,
        '데이터센터': 0.38
    })
    if not (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError(
            "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
                maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    print('all tokenizer tests passed\n')
Example No. 9
def soy_tokenize(model_fname, input_sentence):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # (1) the characters of the word are tightly connected and frequently appear together, and
    # (2) a variety of josa (particles), endings, or other words appear to its right,
    #     i.e. the word's right branching entropy is high
    scores = {
        key: (scores[key].cohesion_forward *
              math.exp(scores[key].right_branching_entropy))
        for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    tokens = tokenizer.tokenize(input_sentence)
    tokenized_sent = ' '.join(tokens)

    return tokenized_sent
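
The comments above describe the L-tokenizer score as cohesion_forward multiplied by exp(right branching entropy); a minimal numeric sketch of that combination, with made-up values:

import math

cohesion_forward = 0.45          # hypothetical WordExtractor output
right_branching_entropy = 1.2    # hypothetical WordExtractor output

word_score = cohesion_forward * math.exp(right_branching_entropy)
print(round(word_score, 3))  # 0.45 * e^1.2 ≈ 1.494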
Example No. 10
def predict_sequential(config):
    # load tokenizer and torchtext Field
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_vocab = open('pickles/text.pickle', 'rb')
    text = pickle.load(pickle_vocab)
    pad_idx = text.vocab.stoi[text.pad_token]

    model_type = {
        'vanilla_rnn': RNN(config, pad_idx),
        'bidirectional_lstm': BidirectionalLSTM(config, pad_idx),
    }

    # select model and load trained model
    model = model_type[config.model]
    model.load_state_dict(torch.load(config.save_model))
    model.eval()

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(config.input)
    indexed = [text.vocab.stoi[token] for token in tokenized]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)  # [input length]
    tensor = tensor.unsqueeze(
        1)  # [input length, 1] for adding batch dimension
    length_tensor = torch.LongTensor(length)

    prediction = torch.sigmoid(model(tensor, length_tensor))
    label = torch.round(prediction)

    if label == 1:
        label = 'Positive'
    else:
        label = 'Negative'

    sentiment_percent = prediction.item()
    print(f'[in]  >> {config.input}')
    print(f'[out] >> {sentiment_percent*100:.2f} % : {label}')
Example No. 11
def predict_cnn(config):
    # load tokenizer and torchtext Field
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_vocab = open('pickles/text.pickle', 'rb')
    text = pickle.load(pickle_vocab)

    model = CNN(config)

    model.load_state_dict(torch.load(config.save_model))
    model.to(device)
    model.eval()

    tokenized = tokenizer.tokenize(config.input)

    min_len = config.filter_sizes[-1]

    # if user's input sentence is shorter than the largest filter size, add pad tokens to input sentence
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))

    indexed = [text.vocab.stoi[token] for token in tokenized]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)

    prediction = torch.sigmoid(model(tensor, length_tensor))
    label = torch.round(prediction)

    if label == 1:
        label = 'Positive'
    else:
        label = 'Negative'

    sentiment_percent = prediction.item()
    print(f'[in]  >> {config.input}')
    print(f'[out] >> {sentiment_percent*100:.2f} % : {label}')
Example No. 12
def content_to_token(text_file_name):
    print("opening file " + text_file_name)
    with open(text_file_name, 'r', encoding="utf-8") as f:
        lines = f.read().splitlines()
    re.sub(r"[\[\]<>~]", ' ', lines[0])
    re.sub(r"['~]", ' ', lines[0])
    re.sub(r'"', ' ', lines[0])

    text = []
    for line in lines:
        line = re.sub(r"[\[\]<>~]", ' ', line)
        line = re.sub(r"['~]", ' ', line)
        line = re.sub(r'"', ' ', line)
        line = re.sub('\\W', ' ', line)
        text.append(line)

    ltokenizer = LTokenizer(scores=scores_dictionary)  # scores_dictionary is assumed to be built elsewhere at module level

    print("making list of words")
    words = []
    for sent in text:
        conclude_sent = []
        # with flatten=False, each word is returned as an [L-part(noun), R-part(josa)] pair
        pre_list = ltokenizer.tokenize(sent, flatten=False)
        for LR_list in pre_list:
            word = LR_list[0]
            if word in word_dict:
                word = word_dict[word]
            if word not in exception_list:
                conclude_sent.append(word)
        words.append(conclude_sent)

    token_file_name = text_file_name[:-4] + '.csv'

    f = open(token_file_name, 'w', newline="")
    wr = csv.writer(f)
    for word in words:
        wr.writerow(word)
    f.close()
Example No. 13
def pad_sentence(dataframe, min_len):
    """
    To use the CNN, every input must be at least as long as the largest filter size.
    If an input is shorter than the largest CNN filter size, it is padded here with pad tokens.
    Args:
        dataframe: (DataFrame) dataframe used to train and validate the model
        min_len: (integer) the largest CNN filter size, used as the minimum sentence length

    Returns:
        dataframe: (DataFrame) the same dataframe with every document padded to at least min_len tokens
    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    for i, row in dataframe.iterrows():
        tokenized = tokenizer.tokenize(row.document)
        if len(tokenized) < min_len:
            tokenized += ['<pad>'] * (min_len - len(tokenized))
        padded_sent = ' '.join(tokenized)
        dataframe.at[i, 'document'] = padded_sent

    return dataframe
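
A minimal illustration of the padding rule described in the docstring above; the tokens and the filter size are made up:

tokens = ['오늘', '날씨', '맑음']
min_len = 5  # largest CNN filter size
if len(tokens) < min_len:
    tokens += ['<pad>'] * (min_len - len(tokens))
print(tokens)  # ['오늘', '날씨', '맑음', '<pad>', '<pad>']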
Example No. 14
def soy_tokenize(corpus_fname, model_fname, output_fname):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0
                                   )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()

    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # (1) the characters of the word are tightly connected and frequently appear together, and
    # (2) a variety of josa (particles), endings, or other words appear to its right,
    #     i.e. the word's right branching entropy is high
    scores = {
        key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            tokens = tokenizer.tokenize(normalized_sent)
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
Example No. 15
import os
import re

import dill
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer


class SoyTokenizer:
    def __init__(self, model_path: str = None):
        self.word_extractor = WordExtractor(min_frequency=5,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)
        self.unk = 0
        self.pad = 1
        self.sos = 2
        self.eos = 3

        if model_path:
            with open(model_path, 'rb') as readFile:
                self.cohesion_score = dill.load(readFile)
        else:
            self.cohesion_score = {}
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def tokenize(self, sent: str):
        return self.tokenizer.tokenize(sent)

    def text_to_id(self, sent: str):
        toks = self.tokenize(sent)
        outp = []
        for s in toks:
            try:
                outp.append(self.tok_to_id[s])
            except KeyError:
                outp.append(self.unk)
        return outp

    def id_to_text(self, idxs: list):
        return [self.id_to_tok[i] for i in idxs]

    def train(self, sentences, add_whitespace: bool = False):
        sentences = self.preprocess(sentences)
        self.word_extractor.train(sentences)
        words = self.word_extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in words.items()
        }

        # add whitespace tokens
        if add_whitespace:
            whitetokens = []
            for s in sentences:
                whitetokens += s.split(' ')
            whitetokens = list(set(whitetokens))

            for t in whitetokens:
                self.cohesion_score.update({t: 1.0})

        # rebuild the tokenizer and the vocabulary with the newly trained scores
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def save_model(self, model_path: str, model_prefix: str):
        with open(os.path.join(model_path, model_prefix + '.model'),
                  'wb') as saveFile:
            dill.dump(self.cohesion_score, saveFile)

    def _build_dict(self):
        tok_to_id = {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3}
        id_to_tok = {0: '<unk>', 1: '<pad>', 2: '<sos>', 3: '<eos>'}
        for i, key in enumerate(self.cohesion_score.keys()):
            tok_to_id[key] = i + 4
            id_to_tok[i + 4] = key
        return tok_to_id, id_to_tok

    def preprocess(self, sents: list):
        n_str_pattern = re.compile(pattern='[\\d\\-?/_!\\.,]')
        doublespacing = re.compile(pattern='\\s\\s+')

        sents = [n_str_pattern.sub(repl=' ', string=w) for w in sents]
        sents = [doublespacing.sub(repl=' ', string=w).strip() for w in sents]
        sents = [u.lower() for u in sents]
        return sents

    def __len__(self):
        return len(self.cohesion_score)
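
A minimal usage sketch for the SoyTokenizer class above; the toy corpus is illustrative only, and a real corpus should be large enough for WordExtractor's frequency thresholds:

corpus = ['오늘 날씨가 좋다', '내일 날씨는 흐리다'] * 10
tok = SoyTokenizer()
tok.train(corpus, add_whitespace=True)

ids = tok.text_to_id('오늘 날씨')
print(ids)
print(tok.id_to_text(ids))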
Example No. 16
    for noun, score in noun_scores.items()
}
print("update combining scores")
combined_scores.update({
    subword: cohesion
    for subword, cohesion in cohesion_scores.items()
    if not (subword in combined_scores)
})

## tokenizing

#%%
print("making tokenizer object")
ltokenizer = LTokenizer(scores=combined_scores)

#%%
print("making list of words")
words = [ltokenizer.tokenize(sent) for sent in text]
word_list = []

f = open('token_words.csv', 'w', newline="")
wr = csv.writer(f)
for word in words:
    wr.writerow(word)
    word_list.append(word)
f.close()

with open('token_word_list.pickle', 'wb') as fw:
    pickle.dump(word_list, fw)
    print("dumping complete")
Example No. 17
            trans_raw.append(chat2)
        else:
            trans_raw.append(chat)
    return trans_raw

raw_time, raw_chat = read_data("399807785.csv")
raw_chat = laugh_trans(raw_chat)

word_extractor.train(raw_chat)
test_words = word_extractor.extract()
test_score = {word:score.cohesion_forward for word, score in test_words.items()}
tokenizer = LTokenizer(scores=test_score)
test_list = []
cnt = 0
for sent in raw_chat:
    test_list.append([tokenizer.tokenize(sent)])
    cnt += 1

test_tokens = [token for data in test_list for token in data[0]]

test_text = nltk.Text(test_tokens)
selected_tokens= [t[0] for t in test_text.vocab().most_common(500)]
def term_frequency(data):
    return [data.count(word) for word in selected_tokens]
test_x = [term_frequency(d) for d in test_list]
X_test = np.asarray(test_x).astype('float32')

cnt = 0
print(len(model.predict_classes(X_test))) #13374
for i in range(len(X_test)):
    if model.predict_classes(X_test)[i] == [0]:
Example No. 18
from math import exp

from soynlp.word import WordExtractor
from soynlp.utils import check_corpus
from soynlp.utils import DoublespaceLineCorpus
from soynlp.tokenizer import LTokenizer


class KoreanTokenizer:
    '''
    A class to tokenize a Korean sentence.

    Attributes
    ----------
    **kwargs
        | Keyword arguments for WordExtractor object (see soynlp.word.WordExtractor)

    Methods
    -------
    train
        | Trains KoreanTokenizer on a corpus
    tokenize
        | Tokenizes the input sentence and returns its tokens
    
    '''

    def __init__(self, **kwargs):
        if 'sents' in kwargs.keys():
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored.")

        self.WordExtractor = WordExtractor(**kwargs)
        self.word_score = {}

    def train(self, text, **kwargs):
        '''
        A method to train the KoreanTokenizer object.

        Attributes
        ----------
        text : iterable or DoublespaceLineCorpus
            | A input text in any iterable type (e.g. list)
            | or a DoublespaceLineCorpus object (see soynlp.utils.DoublespaceLineCorpus)
        **kwargs
            | Keyword arguments for WordExtractor.train() method (see soynlp.word.WordExtractor.train)
        '''

        if 'sents' in kwargs.keys():
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored; WordExtractor is trained on 'text' argument only.")
        
        self.WordExtractor.train(text, **kwargs)
        self.words = self.WordExtractor.extract()

        def calculate_word_score(word, score):
            cohesion = score.cohesion_forward
            branching_entropy = score.right_branching_entropy
            
            word_score = cohesion * exp(branching_entropy)

            return word_score

        self.word_score = {word: calculate_word_score(word, score) for word, score in self.words.items()}

    def tokenize(self, text, **kwargs):
        '''
        A method to tokenize the input text

        Attributes
        ----------
        text : str
            | An input text in str type

        **kwargs
            | Keyword arguments for LTokenizer.tokenize() method (see soynlp.tokenizer.LTokenizer.tokenize)
        '''
        
        if 'sentence' in kwargs.keys():
            del kwargs['sentence']
            print("WARNING: 'sentence' argument is ignored; word_tokenizer tokenizes 'text' argument only.")

        if not self.word_score:
            print('KoreanTokenizer should be trained first, before tokenizing.')
            return
        
        self.tokenizer = LTokenizer(scores=self.word_score)
        
        result = self.tokenizer.tokenize(text, **kwargs)

        return result
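
A minimal usage sketch for the class above; the corpus is illustrative and far smaller than what WordExtractor is normally trained on:

corpus = ['오늘 날씨가 맑다', '내일 날씨는 흐리다'] * 50
korean_tokenizer = KoreanTokenizer(min_frequency=2)
korean_tokenizer.train(corpus)
print(korean_tokenizer.tokenize('오늘 날씨'))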
Example No. 19
    '있다', '수', '에', '이', '한다', '있습니다', '것으로', '있는', '것', '할', '및', 'the',
    'http', 'https', 'sunday', 'joins', 'co', 'and', 'kr', '고', '것이다', '한',
    'is', 'www', 'for', 'a', 'of', 'in', 'on', '중', '더', '대', '통해'
]
## noun score
# nouns only
noun_extractor = LRNounExtractor_v2(verbose=True,
                                    extract_compound=False)  # do not extract compound words
nouns = noun_extractor.train_extract(text)  # list of str like
noun_scores = {noun: score.score for noun, score in nouns.items()}
print("extracting noun")

ltokenizer = LTokenizer(scores=noun_scores)

print("making list of words")
words = []
for sent in text:
    conclude_sent = []
    # with flatten=False, each word is returned as an [L-part(noun), R-part(josa)] pair
    pre_list = ltokenizer.tokenize(sent, flatten=False)
    for LR_list in pre_list:
        if LR_list[0] not in exception_list:
            conclude_sent.append(LR_list[0])
    words.append(conclude_sent)

f = open('noun_token_words_4.csv', 'w', newline="")
wr = csv.writer(f)
for word in words:
    wr.writerow(word)
f.close()
Example No. 20
for noun in nouns:
    nounScore[noun] = nouns[noun].score
    dictionary[noun] = index
    index += 1

del index
# the number of nouns extracted here seems to differ from the number of words used later for vectorization and keyword extraction
nounData = list(dictionary.keys())
'''
writefp = open("C:\\users\\user\Desktop\SWProject\SWproject_getKeywords\WordScore_pebble.txt", "w", encoding = "utf-8")
writefp.write(str(nounScore))
writefp.close()
'''

tokenizer = LTokenizer(scores=nounScore)
tokenized_text = tokenizer.tokenize(text)

vectorizer = BaseVectorizer(tokenizer=tokenizer,
                            min_tf=0,
                            max_tf=10000,
                            min_df=0,
                            max_df=1.0,
                            stopwords=None,
                            lowercase=True,
                            verbose=True)

sents.iter_sent = False
#x = vectorizer.fit_transform(sents)
import tensorflow as tf

vectorPath = "./vectorizedPebble.mtx"
Example No. 21
word_extractor.train(news_title)
words = word_extractor.extract()

cohesion_score = {
    word: score.cohesion_forward
    for word, score in words.items()
}
tokenizer = LTokenizer(scores=cohesion_score)

# -------------------------- word2vec data preprocessing --------------------

cluster_data = []

for k, title in enumerate(news_title):
    title = test(title)
    sent = tokenizer.tokenize(title, flatten=False)
    sentence = []
    # sent -> [L-part('word'), R-part('')] pairs
    for i in sent:
        sentence.append(i[0])

    cluster_data.append(sentence)

# --------------------------Word2Vec embedding--------------------

from gensim.models import Word2Vec

model = Word2Vec(cluster_data,
                 size=100,
                 window=3,
                 min_count=5,
Example No. 22
def predict(config):
    params_dict = {
        'seq2seq': Params('configs/params.json'),
        'seq2seq_gru': Params('configs/params_gru.json'),
        'seq2seq_attention': Params('configs/params_attention.json'),
    }

    params = params_dict[config.model]

    # load tokenizer and torchtext Fields
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_kor = open('pickles/kor.pickle', 'rb')
    kor = pickle.load(pickle_kor)

    pickle_eng = open('pickles/eng.pickle', 'rb')
    eng = pickle.load(pickle_eng)

    model_type = {
        'seq2seq': Seq2Seq,
        'seq2seq_gru': Seq2SeqGRU,
        'seq2seq_attention': Seq2SeqAttention,
    }

    # select model and load trained model
    model = model_type[config.model](params)

    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    input = clean_text(config.input)

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(input)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source_length = torch.LongTensor([len(indexed)]).to(params.device)

    tensor = torch.LongTensor(indexed).unsqueeze(1).to(
        params.device)  # [source length, 1]: unsqueeze to add batch size

    if config.model == 'seq2seq_attention':
        translation_tensor_logits, attention = model(tensor, source_length,
                                                     None, 0)
        # translation_tensor_logits = [target length, 1, output dim]

        translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1),
                                          1)
        # translation_tensor = [target length] filed with word indices

        translation = [eng.vocab.itos[token]
                       for token in translation_tensor][1:]
        attention = attention[1:]

        display_attention(tokenized, translation, attention)
    else:
        translation_tensor_logits = model(tensor, source_length, None, 0)
        translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1),
                                          1)
        translation = [eng.vocab.itos[token]
                       for token in translation_tensor][1:]

    translation = ' '.join(translation)
    print(f'kor> {config.input}')
    print(f'eng> {translation.capitalize()}')
Example No. 23
class Embedding:

    MODEL_SAVED_DIR = "saved_model/fasttext.model"
    TOKENIZER_SAVED_DIR = "saved_model\\tokenizer.pkl"

    def __init__(self, dataset: pd.DataFrame, word_train: bool):
        self.dataset = dataset
        self.corpus = dataset["TITLE"] + dataset["TEXTCONTENT"]

        if word_train == False:
            self.fasttext = FastText.load(self.MODEL_SAVED_DIR)
            self._load_tokenizer()
            self._tokenize()
        else:
            self._extracte()
            self._tokenize()
            self._save_tokenizer()
            self._train()

        self.idx_word_dict = dict(
            zip(np.arange(4,
                          len(self.fasttext.wv.vectors) + 4),
                self.fasttext.wv.index2word))
        self.idx_word_dict[0] = '<PAD>'
        self.idx_word_dict[1] = '<STA>'
        self.idx_word_dict[2] = '<EOS>'
        self.idx_word_dict[3] = '<UNK>'

    def _extracte(self) -> None:
        self.extractor = WordExtractor()
        self.extractor.train(self.corpus)
        self.words = self.extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in self.words.items()
        }
        self.tokenizer = LTokenizer(scores=self.cohesion_score)

    def _tokenize(self) -> pd.DataFrame:
        self.corpus = self.corpus.apply(
            lambda text: self.tokenizer.tokenize(text))
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            lambda text: self.tokenizer.tokenize(text))
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            lambda text: self.tokenizer.tokenize(text))

    def _save_tokenizer(self) -> None:
        with open(self.TOKENIZER_SAVED_DIR, "wb") as f:
            pickle.dump(self.tokenizer, f, pickle.HIGHEST_PROTOCOL)

    def _load_tokenizer(self) -> None:
        with open(self.TOKENIZER_SAVED_DIR, "rb") as f:
            self.tokenizer = pickle.load(f)

    def _train(self) -> None:
        self.fasttext = FastText(sentences=self.corpus,
                                 size=100,
                                 window=5,
                                 min_count=1,
                                 iter=100)
        self.fasttext.save(self.MODEL_SAVED_DIR)

    def dataset_to_embedding(self) -> pd.DataFrame:
        self.dataset["TITLE_IDX"] = self.dataset["TITLE"].apply(
            self._sentence_length_fix, args=[10])
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            self._sentence_length_fix, args=[10])
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            self._sentence_length_fix, args=[32])

        for index, value in self.dataset["TITLE_IDX"].iteritems():
            assert len(value) == 10

        for index, value in self.dataset["TITLE"].iteritems():
            assert len(value) == 10

        for index, value in self.dataset["TEXTCONTENT"].iteritems():
            assert len(value) == 32

        self.dataset["TITLE_IDX"] = self.dataset["TITLE_IDX"].apply(
            lambda tokenized: np.array(
                [self._word_to_idx(token) for token in tokenized]))
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            lambda tokenized: np.array(
                [self._word_to_vec(token) for token in tokenized]))
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            lambda tokenized: np.array(
                [self._word_to_vec(token) for token in tokenized]))

        return self.dataset

    def embedding_to_sentence(self, target: list or np.array) -> list:
        return [self._vec_to_word(vector) for vector in target]

    def _sentence_length_fix(self, sentence: list or np.array,
                             length: int) -> list or np.array:
        sentence_length = len(sentence)
        if sentence_length < length:
            while len(sentence) < length:
                sentence.append('<PAD>')
        elif sentence_length > length:
            sentence = sentence[:length]
        return sentence

    def _vec_to_word(self, vector) -> str:
        if np.array_equal(vector, np.eye(100, dtype=np.float32)[0]):
            return '<PAD>'
        elif np.array_equal(vector, np.eye(100, dtype=np.float32)[1]):
            return '<STA>'
        elif np.array_equal(vector, np.eye(100, dtype=np.float32)[2]):
            return '<EOS>'
        elif np.array_equal(vector, np.eye(100, dtype=np.float32)[3]):
            return '<UNK>'
        return self.fasttext.wv.most_similar(positive=[vector], topn=1)[0][0]

    def _word_to_vec(self, word) -> np.array:
        try:
            if word == '<PAD>': return np.eye(100, dtype=np.float32)[0]
            elif word == '<STA>': return np.eye(100, dtype=np.float32)[1]
            elif word == '<EOS>': return np.eye(100, dtype=np.float32)[2]
            elif word == '<UNK>': return np.eye(100, dtype=np.float32)[3]
            return self.fasttext.wv.word_vec(word)
        except:
            return np.eye(100, dtype=np.float32)[3]

    def _word_to_idx(self, word) -> int:
        try:
            return list(self.idx_word_dict.keys())[list(
                self.idx_word_dict.values()).index(word)]
        except:
            return 3

    def _idx_to_word(self, idx) -> str:
        return self.idx_word_dict[idx]
Example No. 24
from math import exp

from konlpy import tag
from soynlp.word import WordExtractor
from soynlp.noun import LRNounExtractor
from soynlp.tokenizer import LTokenizer


class KoreanTokenizer:
    '''
    A class to tokenize a Korean sentence.

    Attributes
    ----------
    pre_trained : bool
        | If True, one of pre-trained Korean analyzer, provided by KoNLPy, will be used (default : True)
        | If False, an unsupervised KoreanTokenizer is initialized, based on the soynlp L-Tokenizer. The 'analyzer' argument is then ignored.
    analyzer : str
        | Type of KoNLPy analyzer (default : Hannanum)
        | Available analyzers are: Hannanum, Kkma, Komoran, Mecab, Okt
        | Note: Mecab needs to be installed separately before being used.

    Methods
    -------
    train
        | Trains KoreanTokenizer on a corpus, only when 'pre_trained' argument is False.
    tokenize
        | Tokenizes the input sentence and returns its tokens.
    extract_noun
        | Extracts nouns from the input sentence.
    
    '''

    def __init__(self, pre_trained=True, analyzer='Hannanum'):
        self.pre_trained = pre_trained

        if analyzer == 'Hannanum':
            self.analyzer = tag.Hannanum()
        elif analyzer == 'Kkma':
            self.analyzer = tag.Kkma()
        elif analyzer == 'Komoran':
            self.analyzer = tag.Komoran()
        elif analyzer == 'Mecab':
            self.analyzer = tag.Mecab()
        elif analyzer == 'Okt':
            self.analyzer = tag.Okt()
        else:
            if pre_trained == False:
                pass
            else:
                print('Enter a valid KoNLPy analyzer name.\n\tavailable: Hannanum, Kkma, Komoran, Mecab, Okt')

        self.WordExtractor = WordExtractor(min_frequency=0)
        self.noun_extractor = LRNounExtractor(verbose=False)
        self.word_score = {}

    def train(self, text):
        '''
        A method to train the KoreanTokenizer on a corpus.
        If KoreanTokenizer.pre_trained == False, this method does nothing.

        Attributes
        ----------
        text : str
            | An input text in str type
        '''

        if self.pre_trained == True:
            print('A pre-trained KoreanTokenizer is being used. No need to train it.')
            return

        else:
            self.WordExtractor.train(text)
            self.words = self.WordExtractor.extract()

            def calculate_word_score(word, score):
                cohesion = score.cohesion_forward
                branching_entropy = score.right_branching_entropy
                
                word_score = cohesion * exp(branching_entropy)

                return word_score

            self.word_score = {word:calculate_word_score(word, score) for word, score in self.words.items()}

    def tokenize(self, text):
        '''
        A method to tokenize input text.

        Attributes
        -----------
        text : str
            | An input text to be tokenized

        Output
        ------
        tokens : list
            | List of tokens (in str) that consist of the input text

        '''

        if self.pre_trained == True:
            return self.analyzer.morphs(text)

        else:
            if not self.word_score:
                print('An unsupervised KoreanTokenizer should be trained first, before tokenizing.')
                return
            
            self.tokenizer = LTokenizer(scores=self.word_score)

            result = self.tokenizer.tokenize(text)

            return result

    def extract_noun(self, text):
        '''
        A method to extract nouns from input text

        Attributes
        ----------
        text : str
            | An input text from which nouns will be extracted

        Output
        ------
        nouns : list
            | List of noun tokens (in str) in the input text
        '''

        if self.pre_trained == True:
            return self.analyzer.nouns(text)
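
A minimal usage sketch for the class above in its pre-trained mode, assuming KoNLPy and its Hannanum analyzer are installed:

korean_tokenizer = KoreanTokenizer(pre_trained=True, analyzer='Hannanum')
print(korean_tokenizer.tokenize('오늘 날씨가 맑다'))
print(korean_tokenizer.extract_noun('오늘 날씨가 맑다'))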
Example No. 25
class Tokenizer:
    """Tokenizer class"""
    def __init__(self):
        # load noun cohesion score
        with open('utils/words.p', 'rb') as rf:
            words = pickle.load(rf)
            cohesion_score = {
                word: score.cohesion_forward
                for word, score in words.items()
            }
            cohesion_score = {
                k: v
                for k, v in sorted(cohesion_score.items(),
                                   key=lambda item: item[1],
                                   reverse=True) if v > 0
            }
        with open('utils/nouns.p', 'rb') as rf:
            nouns = pickle.load(rf)
            noun_score = {noun: score.score for noun, score in nouns.items()}
            noun_cohesion_score = {
                noun: score + cohesion_score.get(noun, 0)
                for noun, score in noun_score.items()
            }
            self._noun_cohesion_score = {
                k: v
                for k, v in sorted(noun_cohesion_score.items(),
                                   key=lambda item: item[1],
                                   reverse=True) if v > 0
            }

        self._soy = LTokenizer(scores=self._noun_cohesion_score)
        self._is_flatten = False  # no_flatten
        self._is_remove_r = False  # no_remove
        self._emo = get_emoji_regexp()  # re compiled

    def _preprocess(self, doc: str) -> str:
        """전처리 로직"""
        doc = str(doc).lower().strip()  # make string, lower and strip
        doc = re.sub(rf'([^{self._emo.pattern}{string.punctuation}\s\w]+)',
                     ' ',
                     doc)  # drop every character that is not a digit, letter, whitespace, emoji, or common punctuation
        doc = re.sub(r'\s', ' ', doc)  # normalize whitespace characters
        doc = re.sub('&nbsp;', ' ', doc)  # drop &nbsp;
        doc = re.sub('&lt;', '<', doc)  # other HTML entities
        doc = re.sub('&gt;', '>', doc)  # other HTML entities
        doc = re.sub('&amp;', '&', doc)  # other HTML entities
        doc = re.sub('&quot;', '""', doc)  # other HTML entities
        doc = re.sub(r'(http\S+[^가-힣])|([a-zA-Z]+.\S+.\S+[^가-힣])', r' [URL] ',
                     doc)  # replace URLs
        doc = re.sub(r'(\[image#0\d\])', r' [IMAGE] ', doc)  # image tag
        doc = re.sub(r'([0-9a-zA-Z_]|[^\s^\w])+(@)[a-zA-Z]+.[a-zA-Z)]+',
                     r' [EMAIL] ', doc)  # email
        doc = re.sub(r'#(\w+)', r' [HASHTAG] ', doc)  # hashtag
        doc = re.sub(r'@(\w+)', r' [MENTION] ', doc)  # mention
        doc = emojize(demojize(doc, delimiters=(' :', ': '))).strip()
        return doc

    def _postprocess(self, doc: List[str]) -> List[Tuple[str]]:
        """후처리 로직"""
        processed_doc = []

        for l_part, r_part in doc:

            ## l_part
            l_part = repeat_normalize(l_part, num_repeats=3)
            sub_l_part = re.findall(r"[\w]+|[\W]+", l_part)
            if len(sub_l_part) == 2:
                processed_doc += [(sub, 'L') for sub in sub_l_part]
            else:
                processed_doc.append((l_part, 'L'))

            ## r_part
            if r_part != '':
                r_part = repeat_normalize(r_part, num_repeats=3)
                sub_r_part = re.findall(r"[\w]+|[\W]+", r_part)
                if len(sub_r_part) == 2:
                    processed_doc += [(sub, 'R') for sub in sub_r_part]
                else:
                    processed_doc.append((r_part, 'R'))

        return processed_doc

    def tokenize(self, doc: str, media_type: str = None) -> List[Tuple[str]]:
        """tokenize function
        Use noun cohesion score with soynlp
        
        doc :
        media_type :
        """

        doc = self._soy.tokenize(
            self._preprocess(doc),
            flatten=self._is_flatten,
            remove_r=self._is_remove_r)  # returns list of tuple
        doc = self._postprocess(doc)

        return doc
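
A minimal usage sketch for the Tokenizer class above, assuming the module's imports and the pickled word/noun statistics (utils/words.p, utils/nouns.p) are in place:

tokenizer = Tokenizer()
print(tokenizer.tokenize('오늘 날씨가 좋네요!! https://example.com'))
# returns a list of (token, 'L'/'R') tuples after preprocessing and postprocessing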
Example No. 26
    return s
etri_processed_data["title"] = etri_processed_data["title"].progress_apply(concat_text_with_pos)


word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)
soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model'
word_extractor.load(soynlp_model_fname)
scores = word_extractor.word_scores()
scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()}
soyToken = LTokenizer(scores=scores)
# soyToken.tokenize(soynlp_processed_data["title"].values[0])
soynlp_processed_data["title"] = soynlp_processed_data["title"].progress_apply(lambda x: " ".join(soyToken.tokenize(x)))

token = spm.SentencePieceProcessor()
token.Load("./backend/textengines/data/tokenizer_model/sentencepice.model")
spm_processed_data["title"] = spm_processed_data["title"].progress_apply(lambda x: " ".join(token.EncodeAsPieces(x)))
#############################################################################

td = etri_processed_data.copy()

ratio_train = 0.8
ratio_val = 0.1
ratio_test = 0.1

# Produces test split.
x_remaining, x_test, y_remaining, y_test = train_test_split(
    td["title"], 
Example No. 27
from soynlp.tokenizer import LTokenizer

scores = {
    '날씨': 0.5,
    '맑다': 0.5,
    '흐리다': 0.5,
    '흐림': 0.45,
    '오늘': 0.4,
    '내일': 0.4,
    '대체로': 0.2,
    '것': 0.01
}

tokenizer = LTokenizer(scores=scores)

sent = '오늘의 날씨는 대체로 맑고, 내일의 날씨는 흐릴 것이다.'

print(tokenizer.tokenize(sent, flatten=False))
print(tokenizer.tokenize(sent))
Example No. 28
word_extractor = WordExtractor(min_frequency=150,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)

word_extractor.train(x_train)
train_words = word_extractor.extract()
train_score = {
    word: score.cohesion_forward
    for word, score in train_words.items()
}
tokenizer = LTokenizer(scores=train_score)
train_list = []
cnt = 0
for sent in x_train:
    train_list.append([tokenizer.tokenize(sent), y_train[cnt]])
    cnt += 1

word_extractor.train(x_test)
test_words = word_extractor.extract()
test_score = {
    word: score.cohesion_forward
    for word, score in test_words.items()
}
tokenizer = LTokenizer(scores=test_score)
test_list = []
cnt = 0
for sent in x_test:
    test_list.append([tokenizer.tokenize(sent)])
    cnt += 1
Example No. 29
def Makegraph_Wordcloud_Soynlp(target):
    try:
        if flag_login == 0 or flag_login == None or flag_login == '':
            Login()
        #elif flag_prepro == 0:
        #messagebox.showwarning('주의', '데이터 전처리 후 실행해주세요.')
        #return
        else:
            data_wordcloud_soynlp = pd.DataFrame(data_origin[target],
                                                 columns=['contents'])
            data_wordcloud_soynlp['contents'] = data_origin[target].apply(
                lambda x: re.sub('[^가-힣]', ' ', x))

            word_extractor = WordExtractor(
                min_frequency=10,  # make this configurable (e.g. proportional to the size of data_origin)
                min_cohesion_forward=0.05,
                min_right_branching_entropy=0.0)
            word_extractor.train(data_wordcloud_soynlp['contents'].values)
            words = word_extractor.extract()

            cohesion_score = {
                word: score.cohesion_forward
                for word, score in words.items()
            }  # force-joined words: is this the right place?
            # force join words
            cohesion_score['숙소제공'] = 1
            cohesion_score['교통비지급'] = 1
            cohesion_score['인센티브'] = 1
            cohesion_score['초과근무시간확대'] = 1
            cohesion_score['복지포인트'] = 1
            cohesion_score['인사우대'] = 1
            cohesion_score['근평가점'] = 1
            cohesion_score['주거이전수당'] = 1

            tokenizer = LTokenizer(scores=cohesion_score)
            data_wordcloud_soynlp['tokenizer'] = data_wordcloud_soynlp[
                'contents'].apply(
                    lambda x: tokenizer.tokenize(x, remove_r=True))

            words = list()
            for i in data_wordcloud_soynlp['tokenizer'].values:
                for j in i:
                    words.append(j)

            count_soynlp = Counter(words)
            words_dict_soynlp = dict(count_soynlp.most_common(100))  # top n most frequent words

            csv_stopwords = pd.read_csv('stopwords.csv',
                                        encoding='cp949',
                                        skiprows=0)  # TODO: switch to a with-open block
            stopwords = list()
            for i in csv_stopwords.values:
                for j in i:
                    stopwords.append(j)

            for word in stopwords:
                words_dict_soynlp.pop(word, None)

            wordcloud = WordCloud(
                font_path='NanumGothic.ttf',
                width=500,
                height=500,
                background_color='white').generate_from_frequencies(
                    words_dict_soynlp)

            plt.clf()
            plt.figure(figsize=(20, 20))
            plt.imshow(wordcloud)
            plt.axis('off')
            #plt.show()
            plt.savefig(resultdir + filename_dateflag + target +
                        ' - wordcloud_soynlp.png',
                        dpi=100)
            '''
            # 빈도그래프(temp)
            plt.clf()
            plt.style.use('ggplot')
            plt.figure(figsize = (len(list(words_dict_soynlp.keys())[:20])*0.6, 10)) # grid size 가변화
            plt.title('상위 10개 빈출단어')
            plt.bar(list(words_dict_soynlp.keys())[:20], list(words_dict_soynlp.values())[:20])
            plt.xticks(rotation = 45, ha = 'right') # x축 라벨 회전
            plt.savefig(resultdir + filename_dateflag + target + ' - wordfrequency.png', dpi = 200)
            '''

        messagebox.showinfo(
            '작업', '워드클라우드(Soynlp) 생성이 완료되었습니다.\n\nresult폴더에 결과물이 저장되었습니다.')
    except Exception as e:
        Log(desc=e)
        messagebox.showerror('경고', str(e) + ' 열을 찾을 수 없습니다.')
Example No. 30
word_extractor = WordExtractor(min_frequency=100,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)

word_extractor.train(train_data)
train_words = word_extractor.extract()
train_score = {
    word: score.cohesion_forward
    for word, score in train_words.items()
}
tokenizer = LTokenizer(scores=train_score)
train_list = []
cnt = 0
for sent in train_data:
    train_list.append([tokenizer.tokenize(sent), train_label[cnt]])
    cnt += 1

word_extractor.train(test_data)
test_words = word_extractor.extract()
test_score = {
    word: score.cohesion_forward
    for word, score in test_words.items()
}
tokenizer = LTokenizer(scores=test_score)
test_list = []
cnt = 0
for sent in test_data:
    test_list.append([tokenizer.tokenize(sent), test_label[cnt]])
    cnt += 1