def do_words_suit(a, b):
    """Tell whether the first characters of *a* and *b* match.

    Identical first characters match outright. Otherwise both characters are
    decomposed with ``jamo.decompose``: when both decompositions start with
    'ᄋ' (the "dummy initial" of the original comment) their second elements
    are compared, in every other case their first elements are compared.
    """
    head_a, head_b = a[0], b[0]
    if head_a == head_b:
        return True

    jamo_a = jamo.decompose(head_a)
    jamo_b = jamo.decompose(head_b)

    # Skip the dummy initial only when *both* sides carry it.
    both_dummy = jamo_a[0] == 'ᄋ' and jamo_b[0] == 'ᄋ'
    position = 1 if both_dummy else 0
    return jamo_a[position] == jamo_b[position]
def vocab(conll_path):
    """Collect jamo, character, word, POS and relation statistics from a file.

    Every sentence yielded by ``read_conll(file, True)`` is scanned. Nodes
    whose ``norm`` equals "*root*" contribute no characters or jamos, but they
    are still included in the word, POS and relation counts.

    Returns an 8-tuple:
        (jamo counts, jamo -> index, char counts, char -> index,
         word counts, word -> index, POS keys, relation keys)
    Each index mapping numbers the keys of its counter in iteration order.
    """
    jamo_freq = Counter()
    char_freq = Counter()
    word_freq = Counter()
    pos_freq = Counter()
    rel_freq = Counter()

    def index_of(counter):
        # Number the counter's keys in their iteration order.
        return dict((key, idx) for idx, key in enumerate(counter.keys()))

    with open(conll_path, 'r') as conll_file:
        for sentence in read_conll(conll_file, True):
            sentence_chars = []
            for node in sentence:
                if node.norm != "*root*":  # No morphology on the root node
                    for char in unicode(node.norm, "utf-8"):
                        jamo_freq.update(decompose(char))
                        sentence_chars.append(char)
            char_freq.update(sentence_chars)
            word_freq.update([entry.norm for entry in sentence])
            pos_freq.update([entry.pos for entry in sentence])
            rel_freq.update([entry.relation for entry in sentence])

    return (jamo_freq, index_of(jamo_freq),
            char_freq, index_of(char_freq),
            word_freq, index_of(word_freq),
            pos_freq.keys(), rel_freq.keys())
def get_present_determiner(word):
    """Build the form of *word* that ends in '는'.

    The stem comes from ``stem2.get_stem1``. When its last syllable
    decomposes into three jamo and the third equals ``stem2.final_l``, that
    final is removed before '는' is appended; otherwise '는' is appended to
    the stem unchanged.
    """
    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])

    drops_final_l = len(parts) == 3 and parts[2] == stem2.final_l
    if not drops_final_l:
        return base + '는'

    # Recompose the last syllable without its final consonant.
    return base[:-1] + jamo.compose(parts[0], parts[1], None) + '는'
def get_plain_interrogative(word: str):
    """Build the form of *word* that ends in '니'.

    The stem comes from ``stem2.get_stem1``. When its last syllable
    decomposes into three jamo and the third equals ``stem2.final_l``, that
    final is removed before '니' is appended; otherwise '니' is appended to
    the stem unchanged. The original comments mention '냐' (and '으' + '냐')
    as alternative endings; they are not produced here.
    """
    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])

    drops_final_l = len(parts) == 3 and parts[2] == stem2.final_l
    if not drops_final_l:
        return base + '니'  # '으' + '냐'

    # Recompose the last syllable without its final consonant.
    return base[:-1] + jamo.compose(parts[0], parts[1], None) + '니'  # '냐'
def get_leu_irregular_stem2(prefix):
    """Build the second stem for a word of the shape ``prefix + '르다'``.

    ``final_l`` is attached to the last syllable of *prefix* and either '라'
    or '러' is appended, chosen by ``is_bright_vowel`` on that syllable's
    vowel.

    Raises:
        RuntimeError: if *prefix* is empty, or if its last syllable already
            decomposes into three jamo (i.e. it has a final consonant).
    """
    if not len(prefix):
        raise RuntimeError('르다 is not a verb')

    parts = jamo.decompose(prefix[-1])
    if len(parts) == 3:
        raise RuntimeError(f'{prefix}르다 is not a 르 verb')

    head = prefix[:-1] + jamo.compose(parts[0], parts[1], final_l)
    if is_bright_vowel(parts[1]):
        return head + '라'
    return head + '러'
def get_eupsi(word):
    """Build the '읍시' / 'ㅂ시' form of *word*.

    The stem comes from ``stem2.get_stem1``. A stem whose last syllable has a
    final consonant other than ``stem2.final_l`` gets '읍시' appended. In all
    other cases the last syllable is recomposed with ``stem2.final_p`` as its
    final (replacing a final ``stem2.final_l`` if present) and '시' follows.
    """
    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])

    if len(parts) != 3 or parts[2] == stem2.final_l:
        merged = jamo.compose(parts[0], parts[1], stem2.final_p)
        return base[:-1] + merged + '시'
    return base + '읍시'
def get_seumni(word):
    """Build the polite formal base of *word*.

    The stem comes from ``stem2.get_stem1``. A stem whose last syllable has a
    final consonant other than ``stem2.final_l`` gets ``polite_formal_suffix``
    appended. In all other cases the last syllable is recomposed with
    ``stem2.final_p`` as its final (replacing a final ``stem2.final_l`` if
    present) and '니' follows.
    """
    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])

    if len(parts) != 3 or parts[2] == stem2.final_l:
        merged = jamo.compose(parts[0], parts[1], stem2.final_p)
        return base[:-1] + merged + '니'
    return base + polite_formal_suffix
def get_eu_stem2(stem1, initial):
    """Build the second stem of a stem whose last syllable ends in the vowel ㅡ.

    The ㅡ is replaced by ㅏ or ㅓ according to vowel harmony with the
    preceding syllable.

    Args:
        stem1: the first stem; its last syllable is the one being rewritten.
        initial: initial consonant jamo of that last syllable, passed to
            ``jamo.compose`` for the rewritten syllable.

    Returns:
        ``stem1`` with its last syllable replaced.

    Changes from the previous version:
      * ㅗ and ㅑ in the preceding syllable now select ㅏ as well
        (고프다 -> 고파, 모으다 -> 모아, 가냘프다 -> 가냘파); before, only ㅏ did.
      * A one-syllable stem other than 쓰 (크, 끄, 뜨, ...) used to fall off
        the end and return None; it now takes ㅓ (크다 -> 커).
    """
    if stem1[-1] == '쓰':  # 쓰다 derivatives always conjugate as 쓰다
        return stem1[:-1] + '써'

    # With no preceding syllable there is nothing to harmonise with: use ㅓ.
    new_vowel = 'ᅥ'
    if len(stem1) > 1:
        previous_vowel = jamo.decompose(stem1[-2])[1]
        if previous_vowel in ('ᅡ', 'ᅩ', 'ᅣ'):  # bright vowels take ㅏ
            new_vowel = 'ᅡ'
    return stem1[:-1] + jamo.compose(initial, new_vowel, None)
def get_plain(word: str, adjective: bool):
    """Build the plain form of *word*.

    Adjectives are returned unchanged. For anything else the stem from
    ``stem2.get_stem1`` is used: if its last syllable has no final consonant,
    or has ``stem2.final_l`` as final, the syllable is recomposed with
    ``stem2.final_n`` as its final; otherwise '는' is appended to the stem.
    ``word_ending`` is appended in both cases.
    """
    if adjective:
        return word

    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])

    if len(parts) != 2 and parts[2] != stem2.final_l:
        return base + '는' + word_ending

    # final l -> n (or n attached to an open syllable)
    merged = jamo.compose(parts[0], parts[1], stem2.final_n)
    return base[:-1] + merged + word_ending
def get(word, tense: int, irregular: bool):
    """Return the two noun forms of *word* for the given tense.

    Args:
        word: the word to nominalise.
        tense: ``NounForm.PRESENT`` or ``NounForm.PAST``.
        irregular: forwarded to ``get_past``; it is not consulted for the
            present tense.

    Returns:
        A two-element list ``[ㅁ/음 form, 기 form]``. For any other *tense*
        value nothing is returned (implicit ``None``), as before.

    Present tense rules for the first element:
      * open last syllable: ㅁ becomes its final (가다 -> 감). The previous
        version appended '음' here, producing forms such as '가음'.
      * last syllable ending in ``stem2.final_l``: the final becomes 'ᆱ'.
      * any other last syllable: '음' is appended.
    """
    if tense == NounForm.PRESENT:
        st1 = stem2.get_stem1(word)
        letters = jamo.decompose(st1[-1])
        if len(letters) == 2:
            # '\u11b7' is HANGUL JONGSEONG MIEUM, the final-position ㅁ.
            nominalization = st1[:-1] + jamo.compose(
                letters[0], letters[1], '\u11b7')
        elif len(letters) == 3 and letters[2] == stem2.final_l:
            nominalization = st1[:-1] + jamo.compose(
                letters[0], letters[1], 'ᆱ')
        else:
            nominalization = st1 + '음'
        return [nominalization, st1 + '기']
    elif tense == NounForm.PAST:
        past = get_past(word, irregular)
        return [past + '음', past + '기']
def get_regular_stem2(stem1):
    """Build the second stem of a regular *stem1*.

    * Last syllable with a final consonant: '아' or '어' is appended, chosen
      by ``is_bright_vowel`` on the syllable's vowel.
    * Open last syllable with vowel 'ᅳ': delegated to ``get_eu_stem2``.
    * Open last syllable with vowel 'ᅩ', 'ᅮ' or 'ᅵ': the vowel is replaced by
      'ᅪ', 'ᅯ' or 'ᅧ' respectively.
    * Any other open syllable: *stem1* is returned unchanged.
    """
    parts = jamo.decompose(stem1[-1])
    nucleus = parts[1]

    if len(parts) == 3:
        suffix = '아' if is_bright_vowel(nucleus) else '어'
        return stem1 + suffix

    if nucleus == 'ᅳ':  # consider not irregular
        return get_eu_stem2(stem1, parts[0])

    contracted = {'ᅩ': 'ᅪ', 'ᅮ': 'ᅯ', 'ᅵ': 'ᅧ'}.get(nucleus)
    if not contracted:
        return stem1
    return stem1[:-1] + jamo.compose(parts[0], contracted, None)
def get_irregular_stem3(stem1):
    """Build the third stem of an irregular *stem1*.

    The final consonant of the last syllable decides the result:
      * ``stem2.final_s``: final removed, ``stem3_final`` appended.
      * ``stem2.final_t``: final replaced by ``stem2.final_l``,
        ``stem3_final`` appended.
      * ``stem2.final_p``: final removed, '우' appended.
      * ``stem2.final_l``: final removed, nothing appended.

    Raises:
        RuntimeError: if the last syllable has no final consonant or a final
            consonant other than the four above.
    """
    parts = jamo.decompose(stem1[-1])

    if len(parts) == 3:
        head = stem1[:-1]
        initial, vowel, coda = parts[0], parts[1], parts[2]
        if coda == stem2.final_s:
            return head + jamo.compose(initial, vowel, None) + stem3_final
        if coda == stem2.final_t:
            return (head + jamo.compose(initial, vowel, stem2.final_l)
                    + stem3_final)
        if coda == stem2.final_p:
            return head + jamo.compose(initial, vowel, None) + '우'
        if coda == stem2.final_l:
            return head + jamo.compose(initial, vowel, None)

    raise RuntimeError(
        f'{stem2.stem1_to_word(stem1)} cannot be irregular verb')
def get_irregular_stem2(stem1):
    """Build the second stem of an irregular *stem1*.

    Dispatch, in order:
      * last syllable is '르': ``get_leu_irregular_stem2``.
      * open last syllable with vowel 'ᅳ': ``get_eu_stem2``.
      * final ``final_t`` / ``final_p`` / ``final_s`` / ``final_h``: the
        matching ``get_*_irregular_stem2`` helper.
      * final ``final_l``: ``get_regular_stem2``.

    Raises:
        RuntimeError: for every stem that matches none of the cases above.
            The previous version attached the ``raise`` to a single ``else``
            branch, so part of the unmatched stems silently returned None;
            now all of them raise the same error.
    """
    last = stem1[-1]
    letters = jamo.decompose(last)

    if last == '르':
        return get_leu_irregular_stem2(stem1[:-1])
    if len(letters) == 2 and letters[1] == 'ᅳ':
        return get_eu_stem2(stem1, letters[0])
    if len(letters) == 3:
        prefix = stem1[:-1]
        if letters[2] == final_t:
            return get_t_irregular_stem2(prefix, letters[0], letters[1])
        elif letters[2] == final_l:
            return get_regular_stem2(stem1)
        elif letters[2] == final_p:
            return get_p_irregular_stem2(prefix, letters[0], letters[1])
        elif letters[2] == final_s:
            return get_s_irregular_stem2(prefix, letters[0], letters[1])
        elif letters[2] == final_h:
            return get_h_irregular_stem2(prefix, letters[0])

    # Nothing matched: never fall through with an implicit None.
    raise RuntimeError(f'{stem1}다 cannot be irregular')
def get_past_and_future_determiner(word, irregular, regular_ending,
                                   p_irregular_ending, ending_final):
    """Build a determiner form of *word* from the supplied endings.

    Args:
        word: the word to conjugate; its stem comes from ``stem2.get_stem1``.
        irregular: whether *word* conjugates irregularly.
        regular_ending: ending appended after a syllable that keeps (or has
            just lost / changed) its final consonant.
        p_irregular_ending: ending used for irregular stems ending in
            ``stem2.final_p``.
        ending_final: final consonant jamo merged into the last syllable when
            that syllable is open or loses its final.

    Rules, in order:
      * irregular + final ``stem2.final_s``: final removed, regular ending.
      * irregular + final ``stem2.final_t``: final -> ``stem2.final_l``,
        regular ending.
      * irregular + final ``stem2.final_p``: final removed,
        ``p_irregular_ending``.
      * open syllable, final ``stem2.final_l``, or *irregular* final
        ``stem2.final_h``: last syllable recomposed with ``ending_final``.
      * otherwise: regular ending appended to the stem.

    The ``stem2.final_h`` case used to apply regardless of *irregular*, which
    also stripped the final from regular ㅎ stems (좋다 -> 존 instead of 좋은).
    It is now restricted to irregular stems.
    """
    stem1 = stem2.get_stem1(word)
    letters = jamo.decompose(stem1[-1])

    if irregular and len(letters) == 3:
        if letters[2] == stem2.final_s:
            return stem1[:-1] + jamo.compose(
                letters[0], letters[1], None) + regular_ending  # s removed
        elif letters[2] == stem2.final_t:
            return stem1[:-1] + jamo.compose(
                letters[0], letters[1], stem2.final_l) + regular_ending  # t -> l
        elif letters[2] == stem2.final_p:
            return stem1[:-1] + jamo.compose(
                letters[0], letters[1], None) + p_irregular_ending  # p -> un/ul

    merges_ending = (
        len(letters) == 2
        or letters[2] == stem2.final_l
        # Only irregular stems drop a final ㅎ; regular ones keep it.
        or (irregular and letters[2] == stem2.final_h)
    )
    if merges_ending:
        return stem1[:-1] + jamo.compose(letters[0], letters[1], ending_final)
    return stem1 + regular_ending
def getJamoVec(self, char, train):
    """Return the vector for *char* built from its jamo decomposition.

    Decompositions are memoised in ``self.jamo_cache``.

    * One-element decomposition (non-Hangul per the original comment, e.g.
      '@' or 'Q'): the symbol's row of ``self.jamoLookup`` is returned. While
      training, the symbol is kept with probability count / (0.25 + count)
      and otherwise mapped to index 0 (the unknown symbol); outside training
      it is always kept.
    * Otherwise: the first two jamos, and the third if present, go through
      ``self.keepOrDropJamo``. A missing third jamo uses
      ``self.jamoLookup[2]`` (the empty consonant). The three vectors are
      concatenated and passed through the jamo layer, bias and activation.
    """
    cache = self.jamo_cache
    if char not in cache:
        cache[char] = decompose(char)
    jamos = cache[char]

    if len(jamos) == 1:  # Non-Hangul (ex: @, Q)
        symbol = jamos[0]
        count = float(self.jamosCount.get(symbol, 0))
        if train:
            keep = random.random() < (count / (0.25 + count))
        else:
            keep = True
        # 0: unknown symbol
        index = int(self.jvocab.get(symbol, 0)) if keep else 0
        return self.jamoLookup[index]

    # Hangul character
    first_vec = self.keepOrDropJamo(jamos[0], train)
    second_vec = self.keepOrDropJamo(jamos[1], train)
    if len(jamos) > 2:
        third_vec = self.keepOrDropJamo(jamos[2], train)
    else:
        third_vec = self.jamoLookup[2]  # 2: empty consonant

    stacked = concatenate([first_vec, second_vec, third_vec])
    return self.activation(
        self.jamoLayer.expr() * stacked + self.jamoBias.expr())
} if __name__ == '__main__': jamos, j2i, chars, c2i, words, w2i, pos, rels = vocab(sys.argv[1]) print print '# words: ', len(w2i) print ' '.join(words.keys()[-min(100, len(words)):]) print print '# chars: ', len(c2i) print ' '.join(chars.keys()[-min(100, len(chars)):]) print hangul_chars = {} for char in chars: if len(decompose(char)) > 1: hangul_chars[char] = True print '# Hangul chars: ', len(hangul_chars) print ' '.join(hangul_chars.keys()[-min(100, len(hangul_chars)):]) print print '# jamos: ', len(j2i) print ' '.join(jamos.keys()[-min(100, len(jamos)):]) print hangul_jamos = {} for jamo in jamos: if is_jamo(jamo): hangul_jamos[jamo] = True
jamos_train, j2i_train, chars_train, c2i_train, words_train, w2i_train, pos_train, rels_train = utils.vocab( sys.argv[1]) jamos_dev, j2i_dev, chars_dev, c2i_dev, words_dev, w2i_dev, pos_dev, rels_dev = utils.vocab( sys.argv[2]) oov_word = 0 for word in words_dev: if not word in words_train: oov_word += 1 print 'OOV word: ', oov_word, ' / ', len( words_dev), ' ', float(oov_word) / len(words_dev) * 100 hangul_chars_train = {} for char in chars_train: if len(jpack.decompose(char)) > 1: hangul_chars_train[char] = True hangul_chars_dev = {} for char in chars_dev: if len(jpack.decompose(char)) > 1: hangul_chars_dev[char] = True oov_char = 0 for char in hangul_chars_dev: if not char in hangul_chars_train: oov_char += 1 print 'OOV char: ', oov_char, ' / ', len( hangul_chars_dev), ' ', float(oov_char) / len(hangul_chars_dev) * 100 hangul_jamos_train = {} for jamo in jamos_train:
num_new_jamos = 0 for word in external_embedding: # expand word vocab if not word in words: num_new_words += 1 words[word] = 1 new_w = len(w2i) w2i[word] = new_w for char in unicode(word, "utf-8"): # expand char vocab if not char in chars: num_new_chars += 1 chars[char] = 1 new_c = len(c2i) c2i[char] = new_c for jamo in decompose(char): # expand jamo vocab if not jamo in jamos: num_new_jamos += 1 jamos[jamo] = 1 new_j = len(j2i) j2i[jamo] = new_j print 'Have {0} new words, {1} new chars, {2} new jamos from pretrained embeddings'.format( num_new_words, num_new_chars, num_new_jamos) if not os.path.exists(options.output): os.makedirs(options.output) # Make directory if needed with open(os.path.join(options.output, "params.pickle"), 'w') as paramsfp: pickle.dump( (jamos, j2i, chars, c2i, words, w2i, pos, rels, options),
def get_regular_stem3(stem1):
    """Build the third stem of a regular *stem1*.

    ``stem3_final`` is appended when the last syllable decomposes into three
    jamo (it has a final consonant); otherwise *stem1* is returned unchanged.
    """
    has_final = len(jamo.decompose(stem1[-1])) == 3
    return stem1 + stem3_final if has_final else stem1
def recover_particle(word):
    """Return 'ᆻ' if the last syllable of *word* ends in 'ᆻ', else that syllable.

    The previous version unpacked ``jamo.decompose`` into exactly three
    names, which raises ValueError whenever fewer than three jamo come back.
    NOTE(review): presumably this is the same ``jamo.decompose`` that returns
    two elements for an open syllable - confirm. The final consonant is now
    only inspected when a third element exists, so such input falls through
    to returning the last character.
    """
    last = word[-1]
    letters = jamo.decompose(last)
    if len(letters) == 3 and letters[2] == 'ᆻ':
        return 'ᆻ'
    return last
def get_past(word, irregular):
    """Build the past base of *word*.

    The second stem from ``stem2.get_stem2`` is taken and ``final_ss`` is
    attached as the final consonant of its last syllable. That syllable is
    asserted to decompose into exactly two jamo (no final consonant yet).
    """
    base = stem2.get_stem2(word, irregular)
    parts = jamo.decompose(base[-1])
    assert len(parts) == 2

    closed_syllable = jamo.compose(parts[0], parts[1], final_ss)
    return base[:-1] + closed_syllable