def pos_tag(sentence, kuro_server):
    tagged_words = []
    kuromoji = kuro_server.kuromoji
    tokenizer = kuromoji.Tokenizer.builder().build()
    tokens = tokenizer.tokenize(sentence)
    for elem in [[x.getBaseForm() if x.getBaseForm() is not None else x.getSurfaceForm(),
                  pos_interpreter(x.getPartOfSpeech()),
                  x.getSurfaceForm(),
                  x.getReading()] for x in tokens]:
        ru_or_u = ru_or_u_verb(elem)
        if ru_or_u:
            tagged_words.append([elem[0], [*elem[1], ru_or_u], elem[2],
                                 romkan.to_roma(elem[0]), elem[3],
                                 romkan.to_roma(elem[3])])
        else:
            try:
                tagged_words.append([elem[0], elem[1], elem[2],
                                     romkan.to_roma(elem[0]) if elem[0] else "",
                                     elem[3], romkan.to_roma(elem[3])])
            except:
                pass
    return tagged_words
def maybe_potential_form(self) -> Optional[str]:
    pos = self.pos_str()
    surface = self.surface()
    maybe_dform = None
    if (pos[0] == "v" and len(self.morphemes) == 1
            and self.morphemes[0].dictionary_form() == surface
            and romkan.to_roma(surface).endswith("eru")
            and not jmdict_lookup(surface).entries):
        suf = romkan.to_hiragana(romkan.to_roma(surface[-2:])[:-3] + "u")
        maybe_dform = surface[:-2] + suf
    elif (pos[0] == "v"
          and romkan.to_roma(self.morphemes[0].surface()).endswith("e")
          and not jmdict_lookup(surface).entries):
        suf = romkan.to_hiragana(
            romkan.to_roma(self.morphemes[0].surface()[-1])[:-1] + "u")
        maybe_dform = self.morphemes[0].surface()[:-1] + suf
    if not maybe_dform:
        return None
    maybe_pos: SudachiPos = parse(maybe_dform)[0].part_of_speech()
    if (surface not in merge_multi_dicts([
            flip_multi_dict(m)
            for m in all_conjugations(maybe_dform, maybe_pos).values()
    ]).keys()):
        return None
    if not jmdict_lookup(maybe_dform).entries:
        return None
    return maybe_dform
def slugify(data: str) -> str:
    slug = "".join(s for s in data.strip() if s in ALLOWED)
    # if it's just a date with underscores or spaces
    if set(slug.strip()).issubset(SLUG_DIGITS):
        # try to convert Japanese text to romaji to prevent image clashes
        slug = "".join(s for s in romkan.to_roma(data).strip() if s in ALLOWED)
    return slug.replace(" ", "_").casefold().strip()
def __init__(self, latin="", romaji="", kana="", display_name="", type="",
             hanabira_setsumei="", hana_tsukikata="", ha_tsukikata="",
             ha_katachi="", kyoshi="", iro="", hanabira_kazu=[0],
             shokudoku=[], kaki=[], seiikubasho=[], bunpu=[],
             kishibe_type=None):
    # NOTE: the mutable default arguments ([0], []) are shared between calls.
    kana = try_unicode(kana)
    if not romaji and kana:
        romaji = romkan.to_roma(kana).replace("n'", "nn")
    elif not kana and romaji:
        kana = romkan.to_kana(romaji)
    self.romaji = romaji
    self.kana = kana
    self.latin = latin
    #self.latin = latin if latin else ""
    self.display_name = display_name if display_name else self.kana
    self.kaki = kaki
    self.bunpu = bunpu
    #self.kaki = kaki if kaki else []
    #self.bunpu = bunpu if bunpu else []
    self.kishibe_type = kishibe_type
    self.seiikubasho = seiikubasho
    #self.seiikubasho = seiikubasho if seiikubasho else []
    self.type = type
    self.hanabira_kazu = hanabira_kazu
    self.hanabira_setsumei = hanabira_setsumei
    self.shokudoku = shokudoku
    self.hana_tsukikata = hana_tsukikata
    self.ha_tsukikata = ha_tsukikata
    self.ha_katachi = ha_katachi
    self.iro = iro
    self.kyoshi = kyoshi
def findMatch(line):
    alphabets = []
    nonalphabets = []
    romanized = []
    words = filter(lambda word: word != '' and not re.search(r'\d', word), line.split(' '))
    for word in words:
        if re.search('[a-zA-Z]', word) and len(word) > 1:
            alphabets.append(word)
        elif is_katakana(word):
            nonalphabets.append(word)
            romanized.append(romkan.to_roma(word))
    dim = (len(alphabets), len(romanized))
    similarity = numpy.zeros(dim)
    for i in range(len(alphabets)):
        for j in range(len(romanized)):
            alphabet_len = len(alphabets[i])
            romanized_len = len(romanized[j])
            max_len = max(alphabet_len, romanized_len)
            similarity[i][j] = numpy.linalg.norm(
                vectorize(alphabets[i], max_len) - vectorize(romanized[j], max_len))
            # similarity[i][j] = distance.euclidean(vectorize(alphabets[i], max_len), vectorize(romanized[j], max_len))
    ans = []
    for i in range(min(dim[0], dim[1])):
        # integer division (//) so the flat argmin maps back to valid (row, col) indices
        row_index = similarity.argmin() // similarity.shape[1]
        col_index = similarity.argmin() % similarity.shape[1]
        ans.append((alphabets[row_index], nonalphabets[col_index],
                    similarity[row_index, col_index], line))
        del alphabets[row_index]
        del nonalphabets[col_index]
        similarity = numpy.delete(similarity, row_index, 0)
        similarity = numpy.delete(similarity, col_index, 1)
    return ans
def kanji_to_romaji(self, text):
    convert = self.conv
    hiragana_text = convert.do(text)
    romaji_text = romkan.to_roma(hiragana_text)
    return (hiragana_text, romaji_text)
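# Standalone sketch (not taken from any snippet here): the core romkan calls
# that the examples in this collection rely on, round-tripping a single word.
# Expected outputs, per the romkan documentation, are shown in the comments.
import romkan

print(romkan.to_roma("にんじゃ"))    # 'ninja'   (kana -> Hepburn romaji)
print(romkan.to_hiragana("ninja"))   # 'にんじゃ' (romaji -> hiragana)
print(romkan.to_katakana("ninja"))   # 'ニンジャ' (romaji -> katakana)
print(romkan.to_kunrei("しゃしん"))  # 'syasin'  (Kunrei-shiki romanisation)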
def findMatch(line):
    alphabets = []
    nonalphabets = []
    romanized = []
    words = filter(lambda word: word != '' and not re.search(r'\d', word), line.split(' '))
    for word in words:
        if re.search('[a-zA-Z]', word) and len(word) > 1:
            alphabets.append(word)
        elif has_katakana(word):
            nonalphabets.append(word)
            romanized.append(romkan.to_roma(word))
    dim = (len(alphabets), len(romanized))
    similarity = numpy.zeros(dim)
    for i in range(len(alphabets)):
        for j in range(len(romanized)):
            similarity[i][j] = distance.euclidean(vectorize(alphabets[i]),
                                                  vectorize(romanized[j]))
    ans = []
    if dim[1] > 0:
        for i in range(dim[0]):
            if min(similarity[i, :]) < 0.5:
                j = numpy.argmin(similarity[i, :])
                ans.append((alphabets[i], nonalphabets[j]))
    return ans
def map_dict_form_to_different_ending(verb, romaji_ending, *special_endings):
    '''Generates Godan verb stem and computes the correct particle to attach
    based on the verb's last kana

    Args:
        verb (str): Japanese verb in kana, might contain kanji
        romaji_ending (str): target sound of the particle to append to the verb
        *special_endings: Variable length argument list. Based on the target
            Godan particle class (-a, -e, -i, -o). Order of particles is
            -u / -tsu / -su.

    Returns:
        str: verb stem with the correct particle attached depending on the
            last kana particle of the Godan verb
    '''
    last_kana = splice_verb(verb, VerbClass.GODAN, False)
    verb_stem = splice_verb(verb, VerbClass.GODAN)

    if last_kana == U_PARTICLE:
        return "{}{}".format(verb_stem, special_endings[0])
    elif last_kana == TSU_PARTICLE:
        return "{}{}".format(verb_stem, special_endings[1])
    elif last_kana == SU_PARTICLE:
        return "{}{}".format(verb_stem, special_endings[2])
    else:
        transformed_last_kana_as_romaji = "{}{}".format(
            romkan.to_roma(last_kana)[:-1], romaji_ending)
        return "{}{}".format(
            verb_stem, romkan.to_hiragana(transformed_last_kana_as_romaji))
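# Illustration only (not part of the conjugator above): the kana-vowel swap that
# map_dict_form_to_different_ending performs via romkan, isolated for one kana.
# Example: 書く -> negative stem 書か, by dropping the 'u' of the romanised last
# kana and converting back to hiragana.
import romkan

last_kana = "く"
stem_romaji = romkan.to_roma(last_kana)[:-1]   # 'ku' -> 'k'
print(romkan.to_hiragana(stem_romaji + "a"))   # 'か'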
def output_text_per_hom(OutJsonFP, Max=3000):
    def is_of_interest(Orths):
        #Bool=True
        KanjiOnly = False
        # at least 2 variations in orth
        if len(Orths) < 2:
            return False, None
        # at least two are kanji orths
        else:
            KanjiOrths = [
                Orth for Orth in Orths
                if myModule.at_least_one_of_chartypes_p(Orth, ['han'])
            ]
            if len(KanjiOrths) < 2:
                return False, None
            elif len(KanjiOrths) == len(Orths):
                KanjiOnly = True
        return True, KanjiOnly

    LCnt = 7900  #myModule.get_linecount(OutJsonFP)
    TxtDir = os.path.join(os.path.dirname(OutJsonFP),
                          os.path.basename(OutJsonFP) + '_txt')
    if not os.path.isdir(TxtDir):
        os.mkdir(TxtDir)
    CntSoFar = 0
    Cntr = 0
    CntThresh = LCnt / 1000
    with open(OutJsonFP) as FSr:
        #print('retrieving homvecs for '+Hom+'...')
        while FSr or Cntr < Max:
            Ret = get_hom_in_file(FSr, OutJsonFP, FstPosition=CntSoFar)
            if Ret:
                FSr, OrthsVecs, Hom, Cnt, MultiToks = Ret
            else:
                break
            #except:
            #    get_hom_in_file(FSr,OutJsonFP,FstPosition=CntSoFar)
            Orths = list(OrthsVecs.keys())
            print('For ' + Hom + ', we found the following orths, ' + str(Cnt) + ' items')
            print(Orths)
            print(CntThresh)
            IsOfInt, KanjiOnly = is_of_interest(Orths)
            if not (Cnt > CntThresh and len(Hom.split(':')[0]) >= 2 and IsOfInt):
                print('not selected for printing\n')
            else:
                print('writing out...')
                RomHom = romkan.to_roma(Hom)
                OutHomFP = os.path.join(TxtDir, 'homvecs_' + RomHom)
                with open(OutHomFP, 'wt') as FSw:
                    FSw.write(stringify_hom_vecs(OrthsVecs))
                print('... done, fp: ' + OutHomFP)
                if KanjiOnly:
                    RefClusterFP = OutHomFP + '.refclusters'
                    with open(RefClusterFP, 'wt') as FSw:
                        FSw.write('\t'.join(get_cluster_ref(OrthsVecs)))
            CntSoFar += Cnt
            Cntr += 1
def verify_reading(self, guess, readings=None):
    guess = romkan.to_roma(romkan.to_kana(guess.replace(' ', '')))
    if not readings:
        readings = set(
            Association.objects.filter(
                expression=self.expression).values_list('reading', flat=True))
    readings = map(romkan.to_roma, readings)
    return guess in readings
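# Side note (an assumption, not from the snippet above): the to_kana/to_roma
# round trip normalises variant romanisations, so 'si', 'shi' and 'シ' compare equal.
import romkan

print(romkan.to_roma(romkan.to_kana("si")))   # 'shi'
print(romkan.to_roma(romkan.to_kana("shi")))  # 'shi'
print(romkan.to_roma("シ"))                   # 'shi'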
def make_roma_subname(name_ja):
    subname = re.sub(r'[=]', '', name_ja)
    subname = re.sub(r'[「」]', '・', subname)
    adjusted_subname = ''
    for part in subname.split('・'):
        roma_part = romkan.to_roma(part)
        if part != roma_part and not contains_ja(roma_part):
            adjusted_subname += ' ' + roma_part.strip('-')
    return adjusted_subname.strip()
def calculatePartsKana(self, aString):
    s = set()
    # Assumption: each space-separated element should be converted on its own;
    # the original converted the whole string once and ignored the loop variable `e`.
    for e in aString.split(" "):
        kana = romkan.to_katakana(romkan.to_roma(e))
        s |= self.calculatePartsElement(kana)
    return self.serializeSet(s)
def get_romaji(sent):
    t = Tokenizer()
    readings = ""
    for token in t.tokenize(sent):
        surface = regex.split("[\t,]", str(token).decode('utf8'))[0]
        reading = regex.split("[\t,]", str(token).decode('utf8'))[-2]
        reading = surface if reading == "*" else reading
        readings += reading
    romaji = romkan.to_roma(readings)
    return romaji
def clean(self):
    # check "both missing" first; in the original ordering this branch was
    # unreachable because `not self.hiragana` was already handled above it
    if not self.hiragana and not self.romaji:
        raise ValidationError(
            "You have to enter either the Hiragana or Romaji of a Word")
    if not self.hiragana:
        self.hiragana = romkan.to_hiragana(self.romaji)
    elif not self.romaji:
        self.romaji = romkan.to_roma(self.hiragana).capitalize()
    self.slug = slugify(self.romaji)
def lemmatize_with_mecab(expression, kanji):
    '''Find the first word containing kanji; return (lemma, reading).'''
    nodes = mecab_tagger.parseToNode(expression)
    while nodes:
        features = nodes.feature.split(',')
        if kanji in features[10]:
            lemma = features[10]
            reading = romkan.to_hiragana(romkan.to_roma(features[6]))
            return (lemma, reading)
        nodes = nodes.next
    raise ValueError("Mecab failed: %s, %s" % (expression, kanji))
def split_alpha(line):
    out = []
    for char in line:
        if char in ('ァ', 'ィ', 'ゥ', 'ェ', 'ォ', 'ャ', 'ュ', 'ョ'):
            if out:
                out[-1] += char
            else:
                out.append(char)
        else:
            out.append(char)
    return [("ッ".join("ン".join(kana.split("N")).split("q")), romkan.to_roma(kana))
            for kana in out]
def index():
    text = request.query.getunicode('text', '')
    words = mc.parse(text).strip().split(' ')
    pronunciation = [jmdict.get(w, ([], []))[0] for w in words]
    english = [jmdict.get(w, ([], []))[1] for w in words]
    romaji = [[romkan.to_roma(w) for w in p] for p in pronunciation]
    return {"text": text, "words": words, "pronunciation": pronunciation,
            "romaji": romaji, "english": english}
def listen_japanese(self):
    pygame.mixer.init()
    for i, item in enumerate(self.get_japanese_items()):
        file_name = '%s.mp3' % romkan.to_roma(item.text().replace(' ', '_'))
        path = os.path.join(self.window().japanese_path(), file_name)
        self.soundfiles.append(path)
        if not os.path.exists(path):
            tts = gTTS(text=item.text(), lang='ja')
            tts.save(path)
        pygame.mixer.music.load(path)
        pygame.mixer.music.play()
        while pygame.mixer.music.get_busy():
            pygame.time.Clock().tick(20000)
def slugify(text):
    """Version of slugify that supports Japanese characters"""
    if not text:
        return ""
    slug = django_slugify(text)
    if not slug:
        # Title may be in Japanese
        slug = django_slugify(romkan.to_roma(text))
    if not slug:
        # Title may be in Chinese
        pinyin = Pinyin()
        slug = django_slugify(pinyin.get_pinyin(text))
    return slug[:50]
def train(features):
    print "START TRAINING"
    worddic = dict()
    for linenum, line in enumerate(features):
        items = line.strip().split("\t")
        string = items[0].decode("utf8")
        string_roma = unicodedata.normalize("NFKC", romkan.to_roma(string))
        freq = float(items[1])
        worddic[string_roma] = freq
        if linenum % 10000 == 0:
            print "{:>2}%".format(linenum / 10000)
    print "FINISH TRAINING\n"
    return worddic
def read_kanjidic(filepath):
    """Given path to kanjidic file, returns a dictionary of character readings
    by language, eg:
        char_dict["犬"] >> (['quan3', 'quan2'], ['gyeon'], ['ken'])
    """
    char_dict = {}  # Should have 6355 characters
    with open(filepath, encoding="u-jis") as f:
        for line in f:
            han = re.findall(re_han, line)
            if len(han) == 1:  # Skip non dictionary entry lines
                char = han[0]  # Character itself
                mandarin = re.findall(re_mandarin, line)
                hanja = re.findall(re_hanja, line)
                # Note: In Japanese, some characters have on-yomi but not kun-yomi, and vice-versa
                jp_onyomi = re.findall(re_katakana, line)   # Sino-Japanese reading(s)
                jp_kunyomi = re.findall(re_hiragana, line)  # Native Japanese reading(s)
                # Convert to Latin alphabet
                jp_onyomi = [romkan.to_roma(x) for x in jp_onyomi]
                jp_kunyomi = [romkan.to_roma(x) for x in jp_kunyomi]
                # Fix things like 瓩:キログラム being interpreted as onyomi b/c katakana usage
                for x in jp_onyomi:
                    if len(x) > 6:
                        jp_kunyomi += [x]
                        jp_onyomi.remove(x)
                # Remove leading identifier character, eg: Ywo3 -> wo3
                hanja = [x[1:] for x in hanja]
                mandarin = [x[1:] for x in mandarin]
                # Provide dummy values if one training language is missing a reading
                # eg: learn Mandarin pronunciation from just the hangul
                # (Assumes Mandarin is training objective)
                if len(hanja) < 1:
                    hanja = ["*"]
                if len(jp_onyomi) < 1:
                    jp_onyomi = ["*"]
                char_dict[char] = (mandarin, hanja, jp_onyomi)  # Don't care about kunyomi
    return char_dict
def get_inserts(max_chunk=10000):
    inserts = []
    parser = Parser(PATH_TO_EDICT2)
    i = 0
    for e in parser.parse():
        i += 1
        e['english'] = [g['english'] for g in e['glosses']]
        e['romaji'] = romkan.to_roma(e['furigana'])
        e['common_boost'] = 2.0 if e['common'] is True else 1.0
        inserts.append(e)
        if i % max_chunk == 0:
            yield inserts
            inserts = []
    yield inserts
def default_to_hiragana(self, row, col):
    items = self.tableWidget.selectedItems()
    if items:
        for item in items:
            if item.column():
                if not self.lang_mode:
                    item.setText(romkan.to_hiragana(item.text()))
    item = self.tableWidget.item(row, col)
    if item:
        self.page_data['%s,%s' % (row, col)] = item.text()
        if col:
            self.page_data['%s,%s' % (row, col)] = romkan.to_roma(item.text())
        self.data['page_%s' % self.window().pageLab.text()] = self.page_data
        self.update_config(self.data)
def ru_or_u_verb(pos_tags):
    if 'verb' in pos_tags[1]:
        if pos_tags[0] in ["する", "くる"]:
            return "exception"
        if pos_tags[0][-1] != "る":
            return 'u-verb'
        roma = romkan.to_roma(pos_tags[0].replace(pos_tags[2], pos_tags[3]))
        if roma[-3] not in ['i', 'e']:
            return 'u-verb'
        else:
            if pos_tags[0] not in ru_verb_exceptions:
                return 'ru-verb'
            else:
                return 'u-verb'
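# Illustration only (not part of the snippet above): the roma[-3] vowel test used
# by ru_or_u_verb, applied to two dictionary forms.
import romkan

print(romkan.to_roma("たべる")[-3])  # 'e' -> classified as ru-verb (ichidan)
print(romkan.to_roma("わかる")[-3])  # 'a' -> classified as u-verb (godan)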
def to_romaji(token):
    replace_dict = {'': ['a', 'e', 'i', 'o', 'u'],
                    'b': ['v'],
                    'p': ['f', 'h'],
                    'c': ['k'],
                    'l': ['r'],
                    's': ['z'],
                    'g': ['j']}
    token = re.sub('[^a-z]', '', romkan.to_roma(token).lower())
    for newtokens, tokens in replace_dict.iteritems():
        for oldtoken in tokens:
            token = token.replace(oldtoken, newtokens)
    return token
def slugify(text):
    """Version of slugify that supports Japanese and Chinese characters"""
    if not text:
        return ""
    slug = django_slugify(text)
    if not slug:
        # Title may be in Japanese
        slug = django_slugify(romkan.to_roma(text))
    if not slug:
        # Title may be in Chinese
        pinyin = Pinyin()
        slug = django_slugify(pinyin.get_pinyin(text))
    if not slug:
        # Try transliterate, which supports Cyrillic, Greek and other alphabets
        slug = django_slugify(translit(text, reversed=True))
    return slug[:50]
def decompose_hiraganas(hiraganas):
    '''Decompose the hiragana str into consonants & vowels'''
    # Convert the hiraganas into Romaji,
    # then divide the string (e.g. "abc") into a list (e.g. ["a", "b", "c"])
    alphabets = list(romkan.to_roma(hiraganas))
    # Popping from the list while enumerating it (as the original did) skips the
    # letter after each vowel, so build the two lists separately instead.
    vowels = [a for a in alphabets if a in ["a", "e", "i", "o", "u"]]
    consonants = [a for a in alphabets if a not in ["a", "e", "i", "o", "u"]]
    return {"consonants": consonants, "vowels": vowels}
def phonemize(x):
    if x not in cachedPhonemization:
        phonemized = romkan.to_roma(x)
        if max([ord(y) for y in phonemized]) > 200:  # contains Kanji
            cachedPhonemization[x] = x
        else:
            if x.endswith("っ"):
                assert phonemized.endswith("xtsu")
                phonemized = phonemized.replace("xtsu", "G")  # G for `geminate'
            phonemized = phonemized.replace("ch", "C")
            phonemized = phonemized.replace("sh", "S")
            phonemized = phonemized.replace("ts", "T")
            cachedPhonemization[x] = phonemized
    phonemized = cachedPhonemization[x]
    return phonemized
def scrape_japanese_definitions(html, max_results=3):
    '''
    Extract japanese kanji, kana, english definitions and parts of speech
    from html off jisho.org. Return the values as a list of strings.
    If nothing found, return None.
    '''
    results = []
    lookups = []  # initialised up front so the except path still has something to return
    try:
        soup = BeautifulSoup(html)
        kanji = soup.findAll('td', {'class': 'kanji_column'})
        kana = soup.findAll('td', {'class': 'kana_column'})
        engrish = soup.findAll('td', {'class': 'meanings_column'})
        if not kanji or not kana or not engrish:
            return None
        kanji = [' '.join(x.stripped_strings) for x in kanji]
        kana = [' '.join(x.stripped_strings) for x in kana]
        romaji = [romkan.to_roma(x) for x in kana]
        engrish = [elipsize(' '.join(x.stripped_strings)) for x in engrish]
        results = zip(kanji, kana, romaji, engrish)
        '''
        #before forming final definitions string list from these sublists
        #we'll remove definitions which have identical english meanings???
        results = []
        for i, definition in enumerate(definitions):
            if len(results > 0) and definition[3] in results[:i-1][3]:
                pass
            else:
                results.append(definition)
        '''
        #form final results from zipped list and return
        '''
        results = [u'{white}{kanji}{white}{kana}{white}{romaji}{white}{engrish}'.format( \
            white=foreground(u'black') + background(u'white'), \
            black=style(u'normal'), \
            kanji=(u' '+x[0]+u' '+style(u'normal')+u' ' if x[0] else u''), \
            kana=(u' '+x[1]+u' '+style(u'normal')+u' ' if x[1] else u''), \
            romaji=(u' '+x[2]+u' '+style(u'normal')+u' ' if x[2] else u''), \
            engrish=(u' '+x[3]+u' '+style(u'normal')+u' ' if x[3] else u'')) for x in results[:max_results]]
        '''
        lookups = [Lookup(x[0], x[1], x[2], x[3]) for x in results[:max_results]]
    except:
        log.err()
    return lookups
def __init__(self, latin="", ka="", zoku="", romaji="", kana="", rarity=3,
             display_name="", masuda=None, takatsu=None):
    kana = try_unicode(kana)
    if not romaji and kana:
        romaji = romkan.to_roma(kana).replace("n'", "nn")
    elif not kana and romaji:
        kana = romkan.to_kana(romaji)
    self.romaji = romaji
    self.kana = kana
    self.masuda = masuda if masuda else []
    self.takatsu = takatsu if takatsu else []
    self.latin = latin if latin else ""
    self.ka = ka if ka else ""
    self.zoku = zoku if zoku else ""
    self.rarity = rarity if rarity else 3
    self.display_name = display_name if display_name else self.kana
def command_ja(self, event):
    '''Usage: ~ja <k/h/r> <arg> displays katakana/hiragana/romaji for a given
    argument, converting between romaji and kana'''
    try:
        dest, phrase = event.params.split(' ', 1)
        dest = dest.lower()
        if dest == 'k':
            resp = romkan.to_katakana(phrase)
        elif dest == 'h':
            resp = romkan.to_hiragana(phrase)
        elif dest == 'r':
            resp = romkan.to_roma(phrase.decode('utf-8'))
        else:
            raise
        self.send_message(event.respond, resp)
    except:
        self.send_message(event.respond, 'Invalid input, please check syntax.')
        raise
def main(SMecabCorpusDir, HomStats, Model, ModelType='cbow', Window=5,
         UpToPercent=None, OutDir=None, DoRTextP=True):
    OutFNStem = os.path.basename(SMecabCorpusDir) + '_contexts_mvecs'
    OutFN = OutFNStem + '_' + ModelType + '.json'
    PickedTokenStatsFN = OutFNStem + '_pickedtokenstats.pickle'
    OutJsonFP, PickedTokenStatsFP = [
        (SMecabCorpusDir if OutDir is None else OutDir) + '/' + FN
        for FN in (OutFN, PickedTokenStatsFN)
    ]
    # print('finding mean vectors for contexts...')
    myModule.ask_filenoexist_execute(
        [OutJsonFP, PickedTokenStatsFP], get_homs_contexts_mvecs,
        ([SMecabCorpusDir, HomStats, Model, Window], {
            'OutJsonFP': OutJsonFP
        }))
    if DoRTextP:
        TxtDir = os.path.join(os.path.dirname(OutJsonFP), OutFNStem + '_txtfiles')
        if not os.path.isdir(TxtDir):
            os.mkdir(TxtDir)
        SortedP = json_sorted_p(OutJsonFP)
        HomsOrthsCnts = myModule.load_pickle(PickedTokenStatsFP)
        HomsCnts = sorted([(Hom, sum(OrthsCnts.values()))
                           for (Hom, OrthsCnts) in HomsOrthsCnts.items()])
        CntSoFar = 0
        for Cntr, (Hom, Cnt) in enumerate(HomsCnts):
            if Cntr > 1000:
                break
            OrthsVecs = get_hom_in_file(Hom, OutJsonFP, FstPosition=CntSoFar,
                                        AssumeSortedP=SortedP)
            RomHom = romkan.to_roma(Hom)
            OutHomFP = os.path.join(TxtDir, 'homvecs_' + RomHom)
            with open(OutHomFP, 'wt') as FSw:
                FSw.write(stringify_hom_vecs(OrthsVecs))
            CntSoFar += Cnt
def create_parts(self, sentence, romas):
    func = "_noname_"
    analyzer = CaboChaAnalyzer()
    tree = analyzer.parse(sentence)
    l = []
    mainPart = 0
    for chunk in tree:
        for token in chunk:
            kan = token.feature.split(',')[-2]
            if kan == '*':
                kan = token.surface
            romas.append(romkan.to_roma(kan))
        if chunk.link == -1:
            mainPart = chunk.id
            func = self.get_first_token(chunk)
    for chunk in tree:
        curword = chunk.tokens[0].surface
        curfeature = chunk.tokens[0].feature
        feat = self.analyse_feature(curfeature)
        if feat == '@num' or feat == '@n':
            curword = self.join_tokens(chunk)
        elif feat == '@nc':
            curword = self.join_nc_tokens(chunk)
        elif feat == '@v':
            parts = curfeature.split(',')
            raw = parts[-3]
            if raw != '*':
                curword = raw
        ## main part
        if chunk.link == -1:
            prefix = ""
            if feat == '@v':
                prefix = "act:"
            elif feat == '@adj':
                prefix = "desc:"
            elif feat == '@n':
                prefix = "prop:"
            l.append(prefix + "*" + curword + feat)
        elif chunk.link == mainPart:
            l.append(self.get_prefix(chunk) + "+" + curword + feat)
        else:
            l.append("." + curword + feat)
    result = func + '(' + ", ".join(l) + ')'
    return result
def hiragana_candidates(word, num):
    if not isinstance(word, unicode):  # unicode check
        word = word.decode("utf8")
    romaji = unicodedata.normalize("NFKC", romkan.to_roma(word))
    print "romaji:{}".format(romaji)
    candidates = prob(romaji) + edit1_prob(romaji) + edit2_prob(romaji)
    if candidates:
        for i, word_prob_tuple in enumerate(
                sorted(candidates, key=lambda x: x[1], reverse=True)[:num]):
            romaji = word_prob_tuple[0]
            p = word_prob_tuple[1]
            kana = romkan.to_hiragana(romaji).encode("utf8")
            print "  {} : {:<10}{:<20} {:<}".format(i + 1, kana, "(" + romaji + ")", p)
    else:
        print "NO RESULT"
def getSongTitle(url):
    print(str(datetime.now()), ": Downloading song page", url)
    r = requests.get(url).text
    # Parse these formats, regardless of whitespace:
    # > 01 SONGNAME
    # > 1 SONGNAME
    # > 1. SONGNAME
    # > 01. SONGNAME
    # TODO: super fragile, replace with something more robust
    try:
        name = regex.findall(r'(?<=>[\s0]*1[.\s]+).+?(?=<)', r)[0]
    except Exception as e:
        print(url, e)
        name = "Unparsed"
    name = html.unescape(name.strip())
    name = romkan.to_roma(name)
    return name
def findMatch(line):
    words = phrases.splitWord(line)
    alphabets = phrases.get_english_phrase(words)
    nonalphabets = phrases.generate_katakana_phrase(words)
    romanized = []
    for nonalphabet in nonalphabets:
        romanized.append(romkan.to_roma(nonalphabet))
    dim = (len(alphabets), len(romanized))
    similarity = numpy.zeros(dim)
    for i in range(len(alphabets)):
        for j in range(len(romanized)):
            similarity[i][j] = distance.euclidean(vectorize(alphabets[i]),
                                                  vectorize(romanized[j]))
    ans = []
    if dim[1] > 0:
        for i in range(dim[0]):
            if min(similarity[i, :]) < 0.5:
                j = numpy.argmin(similarity[i, :])
                ans.append((alphabets[i], nonalphabets[j], line))
    return ans
def get_path(item_type, file_name=None, data=None):
    """Generates full path for the generated file using configuration and
    explicitly specified name or RSS item data. At least one argument should
    be specified. @file_name has higher priority during output path generation.

    Arguments:
        item_type -- 'post' or 'page'
        file_name -- explicitly defined correct file name.
        data -- preprocessed RSS item data dictionary."""
    if not file_name and type(data) is not dict:
        raise Exception('File name or RSS item data dict should be defined')

    root = conf['dump_path']
    root = root.format(date=time.strftime(conf['file_date_fmt']),
                       year=time.strftime("%Y"),
                       month=time.strftime("%m"),
                       day=time.strftime("%d"),
                       source=os.path.basename(conf['source_file']))

    if file_name:
        relpath = file_name
    else:
        transf = CConvert()
        name = romkan.to_roma(transf.convert(data.get('title', '').strip()))
        name = name or data.get('post_id', UNTITLED)
        relpath = get_path_fmt(item_type, data)
        field = FIELD_MAP.get('post_date', 'post_date')
        post_date = data[field]
        relpath = relpath.format(year=time.strftime("%Y", post_date),
                                 month=time.strftime("%m", post_date),
                                 day=time.strftime("%d", post_date),
                                 name=name,
                                 title=name)

    return uniquify(os.path.join(os.path.abspath(root), relpath))
def to_hiragana(self):
    """Return the reading as hiragana, even if it's On.

    >>> k = Kanji('柔')
    >>> r = Reading(k, 'ニュウ')
    >>> r.to_hiragana()
    'にゅう'

    If it's not On, it's idempotent.

    >>> k = Kanji('最')
    >>> r = Reading(k, 'もっとも')
    >>> r.add_examples('最も')
    >>> r.reading
    'もっと.も'
    >>> r.to_hiragana()
    'もっと.も'
    """
    if self.kind == 'On':
        return romkan.to_hiragana(romkan.to_roma(self.reading))
    else:
        return self.reading
            correct = roma[:i]
        elif i - 1 == current[1]:
            current[1] = i
        else:
            result += "[" + answer[current[0]:(current[1] + 1)] + "]" + answer[(current[1] + 1):i]
            correct += " " + roma[current[0]:(current[1] + 1)] + " " + roma[(current[1] + 1):i]
            current = [i, i]
    if current is not None:
        result += "[" + answer[current[0]:(current[1] + 1)] + "]" + answer[(current[1] + 1):]
        correct += " " + roma[current[0]:(current[1] + 1)] + " " + roma[(current[1] + 1):]
    return result, correct


while True:
    # random.randint is inclusive at both ends; the original randint(0, len(words))
    # could return len(words) and raise IndexError, so the upper bound is len(words) - 1
    i = random.randint(0, len(words) - 1)
    to_write = words[i]
    roma = romkan.to_roma(to_write).strip()
    if mode == 2:
        to_write = romkan.to_katakana(roma) + "\n"
    tries = 0
    while tries < max_tries:
        answer = input(to_write + "> ").strip()
        if answer == roma:
            print("\tcorrect!")
            break
        else:
            print("\tWRONG!")
            tries += 1
    if tries == max_tries:
        errors, correct = find_error(answer, roma)
        print("\tAnswer was " + correct + "\n\tYou wrote: " + errors)
def to_romanji(self):
    self.lang_mode = 2
    for item in self.get_japanese_items():
        item.setText(romkan.to_roma(item.text()))
def multiscrape(self, name, shy=False):
    if shy and self.config.has_key(name) and self.config[name]['ja']:
        return
    if name != u'名前' and name != u'ふりがな':
        nodes = self.root.xpath(
            "//_:li/_:strong[contains(text(), '%s')]/following-sibling::_:ul/_:li|//_:h4[contains(text(), '%s')]/following-sibling::_:p" % (name, name),
            namespaces=NS)
    else:
        nodes = self.root.xpath("//_:h3", namespaces=NS)
    if not nodes:
        return
    iterator = nodes[0].itertext()
    val = ''
    l = []
    while 1:
        try:
            val = iterator.next()
            val = re.sub(u'^[ \r\n]+', '', val)
            val = re.sub(u'[ \r\n]+$', '', val)
            if val:
                l.append(val)
        except:
            break
    val = re.sub('^[ \n]*(.*?)[ \n]*$', '\\1', '\n'.join(l))
    val = val.strip()
    val = makeHankaku(val)
    if name == u'名前':
        lst = val.split('\n')
        if not self.config.has_key(name):
            self.config[name] = {}
        self.config[name]['ja'] = lst[0]
    elif name == u'ふりがな' and not shy:
        if not self.config.has_key(u'名前'):
            self.config[u'名前'] = {}
        lst = val.split('\n')
        if len(lst) > 1:
            suzure = lst[1].replace(u'　', '').replace(' ', '')
            self.config[u'名前']['kana'] = lst[1]
            self.config[u'名前']['en'] = titleCase(lst[1])
            self.config[u'並べ替え']['ja'] = romkan.to_katakana(romkan.to_kunrei(suzure))
            self.config[u'並べ替え']['en'] = romkan.to_roma(suzure)
        else:
            self.config[u'名前']['kana'] = ''
    elif name == u'所属':
        if not self.config.has_key(u'所属'):
            self.config[u'所属'] = {}
        if self.staffType == 'LS' or self.staffType == 'PRO':
            self.config[u'所属']['ja'] = u'法科大学院'
            self.config[u'所属']['en'] = 'Law School (professional course)'
    elif name == u'役職':
        if not self.config.has_key(u'役職'):
            self.config[u'役職'] = {}
        self.config[u'役職']['ja'] = getPostJapanese(val)
        self.config[u'役職']['en'] = getPostEnglish(self.config[u'役職']['ja'])
    elif name == u'所属学会':
        if not self.config.has_key(u'学会'):
            self.config[u'学会'] = {}
        if len(val.split('\n')) > 1:
            self.config[u'学会']['ja'] = val.split('\n')
        else:
            self.config[u'学会']['ja'] = val
    elif name == u'教員からのメッセージ':
        if not self.config.has_key(u'法科大学院メッセージ'):
            self.config[u'法科大学院メッセージ'] = {}
        self.config[u'法科大学院メッセージ']['ja'] = val.split('\n')
    elif name == u'リンク':
        for node in nodes:
            subnode = node.xpath('.//_:a[@href]', namespaces=NS)
            if subnode and len(subnode):
                self.config[u'ホームページ']['ja'] = subnode[0].text
                self.config[u'ホームページ'][u'リンク'] = subnode[0].attrib['href']
                break
    else:
        if not self.config.has_key(name):
            self.config[name] = {}
        if len(val.split('\n')) > 1:
            self.config[name]['ja'] = val.split('\n')
            if name == u'専門分野' and self.config[name]['ja'][0]:
                self.config[name]['en'] = fieldsMap[self.config[name]['ja'][0]]
        else:
            self.config[name]['ja'] = val
            if name == u'専門分野' and self.config[name]['ja']:
                self.config[name]['en'] = fieldsMap[self.config[name]['ja']]
import re
import romkan

entries = set()
for i, entry in enumerate(open("edict2", encoding="euc-jp")):
    if i == 0:
        continue
    m = re.search("^[^/]*\\[([ぁ-んァ-ン]*)\\]", entry)
    if not m:
        continue
    entries.add(romkan.to_hiragana(romkan.to_roma(m.groups(1)[0])))

w = open("./hira.list", "w")
for e in entries:
    w.write(e + "\n")
w.close()
def ship_info():
    # open the api_start2.json file
    with open("api_start2.json", "r") as f:
        json_data = json.load(f)

    # open the extra Ship.json file for evasion, LOS, antisub, and others
    # sort by "index" (API ID, not card ID)
    with open("Ship.json", "r") as f:
        extra_ship_data = sorted(json.load(f), key=lambda k: k['index'])

    # loop through and rewrite ship info
    ships = json_data['api_data']['api_mst_ship']
    new_ships = []
    for ship in ships:
        # なし (nashi) means Null, an unused ID
        if romkan.to_roma(ship['api_name']) == "nashi":
            ships.remove(ship)
            continue

        # renaming keys, based on: https://kancolletool.github.io/docs/api/
        mvk(ship, 'api_sortno', 'id')   # use card number as apparent ID
        mvk(ship, 'api_id', 'api_id')   # use API number as primary ID
        mvk(ship, 'api_name', 'name')
        mvk(ship, 'api_yomi', 'kana')

        # don't create romanizations of ships without kana
        if ship['kana'] != "":
            ship['name_roma'] = romkan.to_roma(ship['kana'])
        else:
            ship['name_roma'] = ""

        mvk(ship, 'api_stype', 'ship_class')
        mvk(ship, 'api_afterlv', 'remodel_min_lv')
        mvk(ship, 'api_aftershipid', 'remodel_ship_id')
        ship['remodel_ship_id'] = int(ship['remodel_ship_id'])

        # split up (base, max) stats
        sbm(ship, 'api_taik', 'hp')
        sbm(ship, 'api_souk', 'armor')
        sbm(ship, 'api_houg', 'firepower')
        sbm(ship, 'api_raig', 'torpedo')
        sbm(ship, 'api_tyku', 'antiair')
        sbm(ship, 'api_luck', 'luck')

        # derived variables from Ship.json
        # look through extra_ship_data for matching index, then grab data from there
        found = False
        for extra_ship in extra_ship_data:
            if extra_ship['index'] == ship['api_id']:
                # ASW: anti-sub
                ship['antisub'] = extra_ship['antisub']
                # LOS: line-of-sight
                ship['line_of_sight'] = extra_ship['lineOfSight']
                # evasion
                ship['evasion'] = extra_ship['evasion']
                # illustrator
                if 'illustrator' in extra_ship.keys():
                    if extra_ship['illustrator'] != 0:
                        ship['illustrator'] = extra_ship['illustrator']
                    else:
                        ship['illustrator'] = ""
                else:
                    ship['illustrator'] = ""
                # seiyuu: voice actor
                if 'cv' in extra_ship.keys():
                    if extra_ship['cv'] != 0:
                        ship['seiyuu'] = extra_ship['cv']
                    else:
                        ship['seiyuu'] = ""
                else:
                    ship['seiyuu'] = ""
                # ship found, stop searching
                found = True
                break

        if found == False:
            # give default values if info not found
            ship['antisub'] = 0
            ship['line_of_sight'] = 0
            ship['evasion'] = 0
            ship['illustrator'] = ""
            ship['seiyuu'] = ""

        #print(ship['api_id'], ship['name_roma'], extra_ship_data[ship['api_id'] - 1])

        """
        # optional variables, set to [] or 0 if nonexistent
        if 'api_tais' in ship:
            sbm(ship, 'api_tais', 'antisub')
        else:
            ship['antisub'] = [0, 0]
        if 'api_saku' in ship:
            sbm(ship, 'api_saku', 'line_of_sight')
        else:
            ship['line_of_sight'] = [0, 0]
        if 'api_kaih' in ship:
            sbm(ship, 'api_kaih', 'evasion')
        else:
            ship['evasion'] = [0, 0]
        """

        mvk(ship, 'api_leng', 'range')
        mvk(ship, 'api_slot_num', 'equip_slots')
        mvk(ship, 'api_buildtime', 'build_time')
        mvk(ship, 'api_broken', 'scrap_value')
        mvk(ship, 'api_powup', 'feed_value')  # stat power-ups when fed for modernization
        mvk(ship, 'api_backs', 'rarity')
        mvk(ship, 'api_getmes', 'get_message')
        mvk(ship, 'api_afterfuel', 'remodel_fuel_cost')  # apparently this is steel, not fuel; the kancolle devs themselves may have misspelled it and neglected to fix it
        mvk(ship, 'api_afterbull', 'remodel_ammo_cost')
        mvk(ship, 'api_fuel_max', 'max_fuel')
        mvk(ship, 'api_bull_max', 'max_ammo')
        mvk(ship, 'api_voicef', 'extra_voice_clips')

        # carrier data
        mvk(ship, 'api_maxeq', 'plane_capacity')

        # add to new JSON array
        new_ships.append(ship)

    return json.dumps(new_ships, indent=2, ensure_ascii=False)
def hira_toroma(word):
    return unicodedata.normalize("NFKC", romkan.to_roma(word.decode("utf8")))
def conv_line(line):
    try:
        return sub3(',', sub2('.', sub('', romkan.to_kunrei(
            romkan.to_roma(unicode(line, ('utf8'))).encode('utf8')))))
    except:
        return ''
import psycopg2

execfile('jsonify.py')

f = open('pwd.txt')
user = f.readline().strip()
pwd = f.readline().strip()
f.close()  # the original was missing the call parentheses, so the file was never closed

#ships = master ship list
connect_name = "dbname='kancolle' user='******' host='localhost' password='******'"
conn = psycopg2.connect(connect_name)
cur = conn.cursor()
stypes = final['api_mst_stype']
for ind in range(0, 383):
    idnum = ships[ind]['api_id']
    sortno = ships[ind]['api_sortno']
    name = ships[ind]['api_name']
    yomi = ships[ind]['api_yomi']
    eego = romkan.to_roma(yomi)
    classind = ships[ind]['api_stype'] - 1
    print 'INSERTING: ', idnum, ' ', name
    if classind == 7:
        shipclass = u'\u9ad8\u901f\u6226\u8266'
    else:
        shipclass = stypes[classind]['api_name']
    cur.execute('insert into kanmusu values(%s,%s,%s,%s,%s,%s);',
                (idnum, name, yomi, eego, sortno, shipclass))
conn.commit()
cur.close()
conn.close()
def _to_romaji(self):
    return romkan.to_roma(self.word.decode("utf-8"))
def to_romaji(word):
    """Convert a Japanese word in hiragana or katakana to romaji."""
    return romkan.to_roma(hiragana_to_katakana(word))
            # print >>f, cleanedWord
            kanji = cleanedWord
            # meaning = getMeaning(cleanedWord,f)
            # print >>f, meaning
        if i == 1:
            hiragana = cleanedWord
            # NOTE: the original condition `"(" in hiragana or ")" or "、" or "・" in hiragana`
            # was always true; the intent appears to be to skip entries containing any of these marks
            if any(mark in hiragana for mark in ("(", ")", "、", "・")):
                hiragana = ""
                kanji = ""
        i = i + 1
        if not (hiragana == "" and kanji == ""):
            print >>f, "<tr>"
            romaji = romkan.to_roma(hiragana.decode("utf-8"))
            if kanji == "":
                meaning = getMeaning(hiragana)
            else:
                meaning = getMeaning(kanji)
            print >>f, "<td>" + kanji + "</td>"
            print >>f, "<td>" + hiragana + "</td>"
            print >>f, "<td>" + romaji + "</td>"
            print >>f, "<td>" + meaning + "</td>"
            print >>f, "</tr>"

print >>f, "</table>"
f.close()
def multiscrape(self, name, shy=False):
    if shy and self.config.has_key(name) and self.config[name]['ja']:
        return
    if name != u'名前' and name != u'ふりがな':
        nodes = self.root.xpath(
            "//_:h4[contains(text(), '%s')]/following-sibling::_:p" % name,
            namespaces=NS)
    else:
        nodes = self.root.xpath("//_:h3", namespaces=NS)
    if not nodes:
        return
    iterator = nodes[0].itertext()
    val = ''
    l = []
    while 1:
        try:
            val = iterator.next()
            val = re.sub(u'^[ \r\n]+', '', val)
            val = re.sub(u'[ \r\n]+$', '', val)
            if val:
                l.append(val)
        except:
            break
    val = re.sub('^[ \n]*(.*?)[ \n]*$', '\\1', '\n'.join(l))
    val = val.strip()
    val = makeHankaku(val)
    if name == u'名前':
        lst = val.split('\n')
        if not self.config.has_key(name):
            self.config[name] = {}
        self.config[name]['ja'] = lst[0]
    elif name == u'ふりがな' and not shy:
        if not self.config.has_key(u'名前'):
            self.config[u'名前'] = {}
        lst = val.split('\n')
        if len(lst) > 1:
            suzure = lst[1].replace(u'　', '').replace(' ', '')
            self.config[u'名前']['kana'] = lst[1]
            self.config[u'名前']['en'] = titleCase(romkan.to_hepburn(lst[1].replace(u'　', ' ')))
            self.config[u'並べ替え']['ja'] = romkan.to_katakana(romkan.to_kunrei(suzure))
            self.config[u'並べ替え']['en'] = romkan.to_roma(suzure)
        else:
            self.config[u'名前']['kana'] = ''
    elif name == u'教員からのメッセージ':
        # NOTE: the original checked has_key(u'学部メセージ') (missing ッ), which never
        # matches the key assigned below; corrected to the same key
        if not self.config.has_key(u'学部メッセージ'):
            self.config[u'学部メッセージ'] = {}
        self.config[u'学部メッセージ']['ja'] = val.split('\n')
    elif name == u'役職':
        if not self.config.has_key(u'役職'):
            self.config[u'役職'] = {}
        self.config[u'役職']['ja'] = getPostJapanese(val)
        self.config[u'役職']['en'] = getPostEnglish(self.config[u'役職']['ja'])
    elif name == u'主要':
        if len(val.split('\n')) > 1:
            self.config[u'主要業績']['ja'] = val.split('\n')
        else:
            self.config[u'主要業績']['ja'] = val
    else:
        if not self.config.has_key(name):
            self.config[name] = {}
        if len(val.split('\n')) > 1:
            self.config[name]['ja'] = val.split('\n')
            if name == u'専門分野' and self.config[name]['ja'][0]:
                self.config[name]['en'] = fieldsMap[self.config[name]['ja'][0]]
        else:
            self.config[name]['ja'] = val
            if name == u'専門分野' and self.config[name]['ja']:
                self.config[name]['en'] = fieldsMap[self.config[name]['ja']]
    playlist_tracks = []
    for raw_info in playlist_track_infos:
        info = raw_info.split('!MAESTRO!')
        track_artist = info[0]
        track_name = info[1]
        album_name = info[2]
        track_id = info[3]
        playlist_tracks.append({"artist": track_artist, "name": track_name,
                                "album": album_name, "track_id": track_id})

    for track in playlist_tracks:
        match_in_name = handler.query.lower() in track["name"].lower()
        match_in_artist = handler.query.lower() in track["artist"].lower()
        match_in_album = handler.query.lower() in track["album"].lower()
        match_in_name_kana = handler.query.lower() in romkan.to_roma(unicode(track["name"], "utf-8"))
        match_in_artist_kana = handler.query.lower() in romkan.to_roma(unicode(track["artist"], "utf-8"))
        match_in_album_kana = handler.query.lower() in romkan.to_roma(unicode(track["album"], "utf-8"))
        if (match_in_name or match_in_artist or match_in_album or
                match_in_name_kana or match_in_artist_kana or match_in_album_kana):
            if len(track["album"]) > 0:
                subtitle = "%s [%s]" % (track["artist"], track["album"])
            else:
                subtitle = track["artist"]
            handler.add_new_item(title=track["name"], subtitle=subtitle,
                                 arg=track["track_id"],
                                 icon=get_artwork(track["name"]))
            anything_matched = True
except:
    pass