def text2InstaDmSmiling(text):
    """Convert *text* to hiragana and append a smiley after every character
    that originally carried dakuten/handakuten or was a small kana.

    Voiced kana (e.g. が) become their unvoiced base (か), semi-voiced kana
    (ぱ) become their plain forms (は), and small kana (ゃ, っ) become their
    full-size forms; ``^_^`` is appended right after each converted char.

    :param text: input string (kanji/katakana allowed; converted via the
        module-level ``kakasi`` instance — assumed to be pykakasi)
    :return: decorated hiragana string
    """
    EMOJI = '^_^'
    # Source characters -> plain replacements.  The three groups are
    # disjoint and no replacement output is itself a key, so a single
    # translate() pass is equivalent to the original three passes.
    conversions = [
        ("ゔがぎぐげござじずぜぞだぢづでどばびぶべぼ",
         "うかきくけこさしすせそたちつてとはひふへほ"),    # dakuten
        ("ぱぴぷぺぽ", "はひふへほ"),                      # handakuten
        ("ぁぃぅぇぉゃゅょっ", "あいうえおやゆよつ"),      # small kana
    ]
    table = {}
    for src, dst in conversions:
        for s, d in zip(src, dst):
            table[s] = d + EMOJI
    # Kanji and katakana -> hiragana first, then decorate in one pass.
    kakasi.setMode('J', 'H')
    kakasi.setMode('K', 'H')
    conv = kakasi.getConverter()
    text_hiragana = conv.do(text)
    return text_hiragana.translate(str.maketrans(table))
def calc_speak_time(df, kakasi, speaker_id=None):
    """Score each row of *df* by seconds per hiragana character.

    :param df: Pandas DataFrame with start_time/end_time/transcript/speaker
    :param kakasi: Kakasi Instance
    :param speaker_id: String — when given, the frame is filtered to that
        speaker and its index reset
    :return: Pandas DataFrame (with an added ``speak_scores`` column)
    """
    # Converter rendering kanji as hiragana so the transcript length
    # approximates the number of spoken characters.
    kakasi.setMode('J', 'H')
    to_hiragana = kakasi.getConverter().do
    df['speak_scores'] = [
        (float(df.end_time[idx]) - float(df.start_time[idx]))
        / len(to_hiragana(df.transcript[idx]))
        for idx in df.index
    ]
    if speaker_id is None:
        return df
    if speaker_id not in df.speaker.unique():
        raise Exception(
            str(speaker_id) + ' does not exist in speaker column ')
    df = df[df.speaker == str(speaker_id)]
    df.reset_index(drop=True, inplace=True)
    return df
def parse_topics(self, response):
    """Yield one BirdfanItem for a topic page, romanising the species name."""
    # Full-width Japanese -> romaji converter (pykakasi).
    from pykakasi import kakasi
    kakasi = kakasi()
    for mode in ('H', 'K', 'J'):
        kakasi.setMode(mode, 'a')
    romanize = kakasi.getConverter().do

    item = BirdfanItem()
    # URL of the page that hosts the image file.
    item['url'] = response.url
    # Human-readable page title.
    item['title'] = response.xpath(
        '//*[@id="contents"]/div[3]/h2/a/text()').extract_first()
    # Bird species name (full-width), stored as romaji.
    item['birdname'] = romanize(
        response.xpath('//*[@id="contents"]/div[3]/div/div/h3/a/text()').
        extract_first())
    # Absolute URL of the image file itself.
    image_path = response.xpath(
        '//*[@id="contents"]/div[3]/div/p/img/@src').extract_first()
    item['jpgurl'] = response.urljoin(image_path)
    yield item
def run_kakasi(text_input):
    """Romanise Japanese text: mecab for word spacing, kakasi for romaji.

    The topic particle は is rewritten to わ before conversion so it
    romanises as "wa" rather than "ha".
    """
    global conv
    if not conv:
        # Lazily build the converter once.  kakasi set up from the pip page
        # for this module: using only the setMode(J, a), (r, Hepburn),
        # (s, False) opts is buggy, hence every mode is set explicitly.
        from pykakasi import kakasi
        kakasi = kakasi()
        for key, value in (("H", "a"), ("K", "a"), ("J", "a"),
                           ("r", "Hepburn"), ("s", False)):
            kakasi.setMode(key, value)
        conv = kakasi.getConverter()
    # Use mecab instead of kakasi's wakati feature for spacing between
    # kanji — mecab has better spacing prediction.
    tokens = mecab(text_input).split(' ')
    fixed = ['わ' if token == 'は' else token for token in tokens]
    return conv.do(' '.join(fixed))
def exchange_word(text):
    """Transliterate *text* (hiragana, katakana, kanji) to ascii romaji."""
    for script in ("H", "K", "J"):
        kakasi.setMode(script, "a")
    return kakasi.getConverter().do(text)
def readDict():
    """Load the Japanese word list, caching it as a pickle.

    Reads ./dictionary/nihongolist.xlsx on first use, adds a "romaji"
    column (hiragana transliterated to ascii with every vowel tripled)
    and a "score" column, then pickles the frame to
    ./dictionary/nihongolist.binaryfile.  Subsequent calls load the
    pickle directly.  Delete the binary file after updating the
    dictionary so the cache gets rebuilt.

    :return: pandas DataFrame
    """
    cache_path = './dictionary/nihongolist.binaryfile'
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as web:
            df = pickle.load(web)
    else:
        df = pd.read_excel("./dictionary/nihongolist.xlsx")
        df["romaji"] = "_"
        df["score"] = 0.000
        from pykakasi import kakasi
        kakasi = kakasi()
        kakasi.setMode('H', 'a')  # hiragana -> ascii
        conv = kakasi.getConverter()
        # One translate() pass replaces the original chain of five
        # .replace() calls: each vowel is tripled (a -> aaa, ...).
        triple_vowels = str.maketrans({v: v * 3 for v in "aiueo"})
        # NOTE(review): columns are addressed by position — iat[n, 1] is
        # assumed to be the word column and iat[n, 5] the "romaji" column
        # added above; verify against the spreadsheet layout.
        for n in range(len(df)):
            df.iat[n, 5] = conv.do(df.iat[n, 1]).translate(triple_vowels)
        with open(cache_path, 'wb') as web:
            pickle.dump(df, web)
    return df
#readDict()
def dialog_nlp(input_txt, version):
    # Run the dialogue NLP model inside a Docker container and post-process
    # its reply into a pipe-separated line for the downstream TTS stage.
    # :param input_txt: user utterance, written to the model's question file
    # :param version: model selector; only 't5' is handled here
    if version == 't5':
        # NOTE(review): container id is hard-coded — confirm it exists on
        # the host this runs on.
        DIALOG_NLP_CONTAINER_NAME = "0e3490a65e84"
        proc = subprocess.run(
            f"docker start {DIALOG_NLP_CONTAINER_NAME}", shell=True)
        with open("dialogue/t5/question/question.txt", "w") as question_txt:
            question_txt.write(input_txt)
        with open("intermediate/nlp_out.txt", "w") as output_txt:
            # import os
            # os.system(
            #     f"docker exec -w /t5 {DIALOG_NLP_CONTAINER_NAME} python3 test.py")
            proc_1 = subprocess.run(
                f"docker exec -w /t5 {DIALOG_NLP_CONTAINER_NAME} python3 test.py",
                shell=True, stdout=output_txt, text=True)
            print(proc_1.stdout)
        # TODO: text outprocess(extract only answer in english)
        # Romaji converter for the model's Japanese reply.
        from pykakasi import kakasi
        kakasi = kakasi()
        kakasi.setMode('H', 'a')
        kakasi.setMode('K', 'a')
        kakasi.setMode('J', 'a')
        conv = kakasi.getConverter()
        with open("intermediate/nlp_out.txt", "r") as f:
            # Second output line holds the answer; strip T5 sentinel tokens.
            responce = f.readlines()[1].replace(
                "<pad>", '').replace('</s>', '')
            print(responce)
            responce = conv.do(responce)
            print(responce)
        # Emit "<wav path>|<romaji answer>.|1" for the TTS pipeline; the
        # leading character of the response is dropped (leftover separator).
        with open("intermediate/nlp_out_fixed.txt", "w") as f:
            f.write(
                "jsut_ver1.1/onomatopee300/wav/ONOMATOPEE300_300.wav|"+responce[1:].replace('\n', '')+".|1")
def to_romaji(text_jpn):
    """Tokenise *text_jpn* with tinysegmenter and return Hepburn romaji."""
    segmented = ' '.join(tinysegmenter.tokenize(text_jpn))
    for mode, target in (("H", "a"), ("K", "a"), ("J", "a"),
                         ("r", "Hepburn")):
        kakasi.setMode(mode, target)
    converter = kakasi.getConverter()
    return converter.do(segmented)
def jp_to_romen(text):
    """Transliterate Japanese *text* to romaji and capitalise the result."""
    for script in ('H', 'K', 'J'):
        kakasi.setMode(script, 'a')
    romaji = kakasi.getConverter().do(text)
    return romaji.capitalize()
def WordConvert(word):
    """Return *word* with kanji and katakana rendered as hiragana."""
    from pykakasi import kakasi
    factory = kakasi()
    factory.setMode('J', 'H')   # kanji -> hiragana
    factory.setMode('K', 'H')   # katakana -> hiragana
    return factory.getConverter().do(word)
def to_katakana(self, text):
    # Convert *text* to katakana with pykakasi: katakana passes through,
    # hiragana/English/kanji/ascii are all mapped to katakana.
    # NOTE(review): text.decode('utf-8') implies *text* is a byte string —
    # this looks like Python 2 code (str has no .decode on Python 3);
    # confirm the runtime before reusing.
    from pykakasi import kakasi
    kakasi = kakasi()
    kakasi.setMode('K', 'K')  # katakana: no conversion
    kakasi.setMode('H', 'K')  # hiragana -> katakana
    kakasi.setMode('E', 'K')  # English -> katakana
    kakasi.setMode('J', 'K')  # kanji -> katakana
    kakasi.setMode('a', 'K')  # ascii -> katakana
    converter = kakasi.getConverter()
    return converter.do(text.decode('utf-8'))
def change_word(read_data):
    """Romanise *read_data*: hiragana, katakana and kanji become ascii."""
    from pykakasi import kakasi
    factory = kakasi()
    for script in ('H', 'K', 'J'):
        factory.setMode(script, 'a')
    conv = factory.getConverter()
    return conv.do(read_data)
def formats(key_world):
    """Normalise a keyword: trim, drop punctuation, transliterate to romaji."""
    cleaned = re.sub(r'[^\w\s]', '', key_world.strip())
    from pykakasi import kakasi
    factory = kakasi()
    for script in ('H', 'K', 'J'):
        factory.setMode(script, 'a')
    return factory.getConverter().do(cleaned)
def get_reading_kakasi(word):
    """Gets reading for a given Japanese word by using kakasi.

    The reading in hiragana is returned by this function."""
    import pykakasi.kakasi as kakasi
    kakasi = kakasi()
    kakasi.setMode("J", "H")   # kanji -> hiragana
    kakasi.setMode("C", True)  # default: Separator
    kakasi.setMode("c", False) # default: no Capitalize
    reading = kakasi.getConverter().do(word)
    return reading
def getConverter(): import sys # reload()之前必须要引入模块 reload(sys) sys.setdefaultencoding('utf-8') # 防止UTF8出问题 from pykakasi import kakasi kakasi = kakasi() kakasi.setMode('H', 'a') kakasi.setMode('K', 'a') kakasi.setMode('J', 'a') conv = kakasi.getConverter() return conv
class Phrase(pygame.sprite.Sprite):
    # A scrolling typing-game phrase: the Japanese text on top, its romaji
    # (Kunrei-shiki, via pykakasi) underneath; the player types the romaji.
    font = pygame.font.Font('migu-1m-regular.ttf', 32)
    # Class-level converter shared by all phrases: kana/kanji -> romaji.
    kakasi = kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Kunrei")
    conv = kakasi.getConverter()

    def __init__(self, y, string):
        # :param y: vertical centre of the sprite
        # :param string: Japanese phrase to display and type
        pygame.sprite.Sprite.__init__(self, self.containers)
        # One list entry per romaji character; typed entries are blanked.
        self.characters_roman = [c for c in self.conv.do(string)]
        # Match the image width to the wider of the Japanese / romaji forms.
        character_size = self.font.size(
            max(self.conv.do(string), string, key=lambda x: len(x)))
        surface = pygame.Surface((character_size[0], character_size[1] * 2))
        surface.set_colorkey((0, 0, 0))
        self.characters = self.font.render(string, True, (1, 1, 1),
                                           (255, 255, 255))
        self.image = surface
        self.rect = self.image.get_rect()
        self.rect.midleft = (640, y)
        self.speed = -2.0  # scrolls right-to-left
        # Reference position of the next character to type in the string.
        self.next_character_pos = 0

    def update(self):
        # Scroll left, redraw, and kill the sprite once every romaji
        # character has been typed.
        self.rect.move_ip(self.speed, 0)
        self.image.fill((255, 255, 255))
        self.image.blit(self.characters, (0, 0))
        if len(self.characters_roman) == self.next_character_pos:
            self.kill()
        for i, c in enumerate(self.characters_roman):
            if not c:
                continue  # already typed (blanked by input())
            self.image.blit(
                self.font.render(c, True, (1, 1, 1), (255, 255, 255)),
                (i * 16, 32))
        if self.rect.right < 0:
            # Wrap around once fully off-screen.
            self.rect.left = 640

    def input(self, character):
        # Consume *character* when it matches the next expected romaji
        # letter: blank it, advance, and spawn an Explosion at the spot.
        if self.characters_roman[self.next_character_pos] == character:
            self.characters_roman[self.next_character_pos] = ''
            self.next_character_pos += 1
            Explosion((self.rect.left + (self.next_character_pos * 16),
                       self.rect.centery))
def make_filename(title):
    """Build a filesystem-safe name: romaji, underscores, half-width chars."""
    from pykakasi import kakasi, wakati
    import zen2han
    factory = kakasi()
    for script in ("H", "K", "J"):
        factory.setMode(script, "a")
    romaji = factory.getConverter().do(title).replace(" ", "_")
    return zen2han.zen2han(romaji)
def main():
    """Stream the current iTunes track (romaji title/artist plus an MMSS
    position string) to a serial display, refreshing twice a second.

    Fixes over the original:
    - the hand-rolled zero-padding left ``time_str`` unbound (NameError)
      whenever minute >= 10; ``divmod`` + format covers every case;
    - ``bytes + '\\n'`` raised TypeError on Python 3; the whole line is
      encoded instead;
    - the unreachable ``device.close()`` after ``while True`` was dropped
      (the ``with`` block owns the port anyway).
    """
    kakasi.setMode('H', 'a')
    kakasi.setMode('K', 'a')
    kakasi.setMode('J', 'a')
    conv = kakasi.getConverter()
    with serial.Serial(port="/dev/tty.usbmodem141141", baudrate=9600,
                       timeout=1) as device:
        while True:
            title = itunes.current_track.name.get()
            artist = itunes.current_track.artist.get()
            position = int(itunes.player_position.get())
            minute, second = divmod(position, 60)
            time_str = f"{minute:02d}{second:02d}"
            sleep(0.5)
            device.write((conv.do(title) + '\n').encode(errors="ignore"))
            device.write((conv.do(artist) + '\n').encode(errors="ignore"))
            device.write((time_str + '\n').encode())
            os.system('clear')
            print("TITLE = " + title)
            print("ARTIST = " + artist)
            print("TIME = " + time_str)
def toKatakana(str):
    """Convert *str* to katakana (kanji, hiragana and ascii all mapped).

    Note: the parameter shadows the builtin ``str``; the name is kept for
    interface compatibility with existing callers.
    """
    # Mode targets: a/H/K/None = roman, hiragana, katakana, no conversion.
    for source in ("J", "H", "a", "K"):
        kakasi.setMode(source, "K")
    return kakasi.getConverter().do(str)
def toFurigana(str):
    """Annotate *str* with romaji furigana via pykakasi's 'aF' output mode.

    Mode targets: a/H/K/aF/None = roman, hiragana, katakana, furigana, or
    no conversion (default: no conversion).
    """
    factory = pykakasi.kakasi()
    for source in ("J", "H", "K"):
        factory.setMode(source, "aF")
    conv = factory.getConverter()
    return tokenTranslation(conv, str)
def __init__(self, dictionary_type='alphabet', max_len=1000, length=1024):
    """Initialise the encoder.

    :param dictionary_type: 'alphabet', 'katakana' or 'katakana_small' —
        selects the character dictionary used for encoding
    :param max_len: maximum input length
    :param length: output length
    :raises ValueError: for an unknown *dictionary_type*.  The original
        code silently left ``self.dictionary`` unset, deferring the
        failure to an AttributeError far from the cause.
    """
    if dictionary_type == 'alphabet':
        self.dictionary = alphabet.alphabet_dict
    elif dictionary_type == 'katakana':
        self.dictionary = katakana.katakana_dict
    elif dictionary_type == 'katakana_small':
        self.dictionary = katakana_small.katakana_small_dict
    else:
        raise ValueError(f"unknown dictionary_type: {dictionary_type!r}")
    self.dictionary_type = dictionary_type
    self.max_len = max_len
    self.length = length
    # Shared romaji converter: hiragana/katakana/kanji -> ascii.
    kakasi = pykakasi.kakasi()
    kakasi.setMode('H', 'a')
    kakasi.setMode('K', 'a')
    kakasi.setMode('J', 'a')
    self.kakasi_conv = kakasi.getConverter()
def get_initials(words):
    """Return the upper-cased romaji initial of every word in *words*."""
    for script in ('H', 'K', 'J'):
        kakasi.setMode(script, 'a')
    conv = kakasi.getConverter()
    # First romaji letter of each word; empty words yield ''.
    return [conv.do(word)[:1].upper() for word in words]
def VoiceRecodeAndRecongnize():
    # Record ~3 seconds from the microphone, save it as a WAV file, run it
    # through Google speech recognition (Japanese), and return the result
    # converted to hiragana.
    # Relies on module-level FORMAT/CHANNELS/RATE/chunk/WAVE_OUTPUT_FILENAME
    # settings and a module-level pykakasi ``kakasi`` instance.
    p = pyaudio.PyAudio()
    start = input("録音開始 [Enter]>>")
    print("録音中...")
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=chunk)
    sequence = []
    # Earlier key-triggered variant, kept for reference:
    # while True:
    #     data = stream.read(chunk)
    #     sequence.append(data)
    #     if keyboard.is_pressed("esc"):
    #         break
    # Fixed-length recording: RATE / chunk reads per second, for 3 seconds.
    for i in range(0, int(RATE / chunk * 3)):
        data = stream.read(chunk)
        sequence.append(data)
    print("録音終了")
    stream.close()
    p.terminate()
    # Dump the raw frames into a WAV container for the recogniser.
    wavFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wavFile.setnchannels(CHANNELS)
    wavFile.setsampwidth(p.get_sample_size(FORMAT))
    wavFile.setframerate(RATE)
    wavFile.writeframes(b"".join(sequence))
    wavFile.close()
    r = sr.Recognizer()
    with sr.AudioFile(WAVE_OUTPUT_FILENAME) as source:
        audio = r.record(source)
    word = r.recognize_google(audio, language='ja')
    #kakasi = kakasi()
    # Kanji and katakana -> hiragana.
    kakasi.setMode('J', 'H')
    kakasi.setMode('K', 'H')
    conv = kakasi.getConverter()
    word_hiragana = conv.do(word)
    print("----------------------------------------")
    print("認識結果:" + word)
    print("認識結果(ひらがなver):" + word_hiragana)
    print("----------------------------------------")
    return word_hiragana
def furiganaLineTrans(text):
    # Tokenise *text* and rebuild it with inline furigana: each word whose
    # reading differs from the word itself becomes "word { reading } ".
    # Dictionary entries in transDict take precedence over kakasi's reading.
    # NOTE(review): dict.has_key is Python 2 only — this function will not
    # run unchanged on Python 3.
    kakasi.setMode("J", "H")        # kanji -> hiragana
    kakasi.setMode("r", "Hepburn")
    conv = kakasi.getConverter()
    stitchStr = ''
    for segWord in segmenter.tokenize(text):
        result = ""
        if transDict.has_key(segWord):
            result = transDict[segWord]
        else:
            result = conv.do(segWord)
        if segWord == result:
            # Reading equals the word (already kana) — no annotation needed.
            stitchStr += segWord
        else:
            result = result.strip().replace("\n", "")
            useStr = segWord + ' { ' + result + ' } '
            stitchStr += useStr
    return stitchStr
def change_char(tweet_text, kakasi):
    '''change tweet_text, Kanji -> Hiragana'''
    # Split into sentences on the Japanese full stop; the empty fragment
    # after the final "。" is discarded, then the stop is restored.
    seperator = "。"
    sentences = [part + seperator
                 for part in tweet_text.split(seperator)[:-1]]
    kakasi = kakasi()            # *kakasi* is the pykakasi factory callable
    kakasi.setMode("J", "H")     # J (kanji) -> H (hiragana)
    conv = kakasi.getConverter()
    # Preview each sentence next to its kana rendering.
    for sentence in sentences:
        print(sentence)
        print(conv.do(sentence))
        print()
    kana_text = conv.do(tweet_text)
    with open("prepare_text.txt", mode="w", encoding="utf-8") as f:
        f.write(kana_text)
    return kana_text
import sys
import os
import tweepy
sys.path.append('/app')
import models  # noqa
import jaconv  # noqa
from util import morpheme  # noqa
import services  # noqa

# Firestore client and a module-wide kanji -> hiragana converter.
# NOTE(review): `firestore` and `kakasi` are not imported above — they are
# assumed to be provided earlier in this file; confirm before reuse.
db = firestore.Client()
kakasi = kakasi()
kakasi.setMode("J", "H")
conv = kakasi.getConverter()

# Service singletons used by the handlers in this module.
system_service = services.system_instance
word_service = services.word_instance
user_service = services.user_instance
tag_service = services.tag_instance

# Manual OAuth bootstrap flow, kept for reference:
# user_service.get_oauth_url()
# user_data = models.UserUpdate()
# user_data.twitter_id = 'user_id'
# user_data.twitter_name= 'screen_name'
# user_data.twitter_key= 'oauth_token'
# user_data.twitter_secret= 'oauth_token_secret'
# user_data.session_id = "aaaa"
# print(user_data)
incorrect_answers.append(output_string) student_output_filepath = os.path.join(output_directory, student_name) with open(student_output_filepath, "w+") as student_corrections_file: student_corrections_file.write("\n".join(incorrect_answers)) # ------------------------------------------------------------------------------ current_directory = os.path.dirname(os.path.abspath(__file__)) # CHANGE THIS BIT! master_answer_key = os.path.join(current_directory, "answer_keys/2017_06_08_lesson_03_part_02.txt") student_answer_sets = os.path.join(current_directory, "answer_sets") # CHANGE THIS BIT! date_string = "2017_06_08" current_student_answer_sets = os.path.join(student_answer_sets, date_string, "student_answers") output_directory = os.path.join(student_answer_sets, date_string, "graded_answers") # --- kakasi = kakasi() kakasi.setMode("J", "H") # default: Japanese no conversion japanese_text_normalizer = kakasi.getConverter() # --- perform_grading(master_answer_key, current_student_answer_sets, output_directory, japanese_text_normalizer)
def dabiaoqian(path):
    # "Label maker" (symbol.txt variant): walks every sub-directory of
    # *path*, aligns Julius recognition output against the reference
    # transcripts and prepends a label to every feature-vector row:
    # '0' = correctly recognised, '1' = wrong, '9' = leading silence or the
    # final full stop (rows labelled '9' are deleted at the end).
    # Relies on project helpers mulu, pi, cz, shanchu, make_kana_convertor
    # and strQ2B defined elsewhere in this package.
    from pykakasi import kakasi
    import csv, os
    name_tezheng = 'mizhichuli_log'  # directory holding the feature files
    xinde = 'xinde_mizhichuli'       # directory for the new labelled features
    name1 = 'align1'
    # Marker-file name; when align1 is unusable switch to symbol.txt — the
    # reader call below must be switched accordingly.
    name2 = 'symbol.txt'
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()
    for i in os.listdir(path):
        path_1 = os.path.join(path, i)
        path_out = os.path.join(path_1, 'keka')
        path_tezheng = os.path.join(path_1, name_tezheng)
        # Alternative when the marker file is align1 (EUC-JP encoded):
        # biaozhiwenjian = csv.reader(open(os.path.join(path_1, name1), 'r', encoding='EUC-JP'))
        # Marker file as .txt:
        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name2), 'r', encoding='utf-8'))
        # Materialise as a list of one-element rows, e.g.
        # [['id: l_8840_9810_T1_F_01'], ['REF: そう です か 、 はい 。 '],
        #  ['HYP: そう です か はい 。 '], ['EVAL: C C C D C C '], [], ...]
        biaozhiwenjian_1 = [i for i in biaozhiwenjian]
        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)
        # Each 'id:' row starts the block that labels one audio file.
        for i in range(0, len(biaozhiwenjian_1)):
            try:
                biaozhi = biaozhiwenjian_1[i][0]
            except:
                continue  # blank separator rows have no column 0
            if 'id:' in biaozhi:
                ID = ''
                l_biaozhi = []       # EVAL markers (C/D/I/S)
                l_zhengjie = []      # reference (correct) words
                l_zhengjie_1 = []    # reference words aligned 1:1 to markers
                l_jieguo = []        # recognised words
                l_jieguo_1 = []      # recognised words aligned 1:1 to markers
                ID = biaozhiwenjian_1[i][0].replace('id: ', '')
                l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
                l_zhengjie.pop(0)    # drop the 'REF:' tag
                l_jieguo = biaozhiwenjian_1[i + 2][0].split()
                l_jieguo.pop(0)      # drop the 'HYP:' tag
                l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
                l_biaozhi.pop(0)     # drop the 'EVAL:' tag
                # Build strictly aligned reference/result lists: a deleted
                # word (D) leaves an empty slot on the result side, an
                # inserted word (I) an empty slot on the reference side.
                jishuqi_jieguo = 0
                jishuqi_zhengjie = 0
                jishuqi_biaozhi = 0
                for i in l_biaozhi:
                    if i == "D":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append('')
                        jishuqi_zhengjie += 1
                        jishuqi_biaozhi += 1
                    if i == "C":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1
                    if i == "I":
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        l_zhengjie_1.append('')
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1
                    if i == "S":
                        # Substitution: compare the two words after romaji
                        # conversion; if they match, downgrade S to C.
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        zhengjie_hanzi = l_zhengjie[jishuqi_zhengjie]
                        jieguo_hanzi = l_jieguo[jishuqi_jieguo]
                        # Recognised word first: an unchanged conv.do()
                        # result means it is latin letters — half-width it,
                        # turn it into kana, then romanise.
                        if conv.do(
                                jieguo_hanzi
                        ) == jieguo_hanzi and jieguo_hanzi != '、':
                            try:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(jieguo_hanzi)))
                            except:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        jieguo_hanzi))
                        else:
                            zhuanhuan_jieguo = conv.do(jieguo_hanzi)
                        # Then the reference word, same treatment.
                        if conv.do(
                                zhengjie_hanzi
                        ) == zhengjie_hanzi and zhengjie_hanzi != '、':
                            try:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(zhengjie_hanzi)))
                            except:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        zhengjie_hanzi))
                        else:
                            zhuanhuan_zhengjie = conv.do(zhengjie_hanzi)
                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:
                            # (debug prints elided) same reading -> treat
                            # the substitution as correct.
                            l_biaozhi[jishuqi_biaozhi] = 'C'
                        jishuqi_biaozhi += 1
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1
                # Read the .out alignment file; shape after read_out:
                # [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]],
                #  ['三', [50, 82]], ['。', [83, 86]]]
                path_out_1 = os.path.join(path_out, ID + '.out')
                dianout = pi.read_out(path_out_1)
                # Label the leading silence with 9; pop that first element.
                start = dianout.pop(0)[1][1]
                # Label the trailing full stop with 9.
                start_1 = dianout[-1][1][0]
                # end_1 = dianout.pop(-1)[1][1]
                path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
                tezhengzhi = csv.reader(
                    open(path_tezheng_1, 'r', encoding='utf-8'))
                t_file_list = [i for i in tezhengzhi]
                end_1 = len(t_file_list) - 1
                # Leading silent frames all get label 9 (treated as correct).
                for i in range(start + 1):
                    t_file_list[i].insert(0, '9')
                for i in range(start_1, end_1 + 1):
                    t_file_list[i].insert(0, '9')
                # The final full stop is already labelled — drop it here.
                l_jieguo_1.pop(-1)
                print("ID")
                print(ID)
                print("l_biaozhi")
                print(l_biaozhi)
                print("l_jieguo_1")
                print(l_jieguo_1)
                print("dianout")
                print(dianout)
                # Rebuild the dianout list with a C/other verdict per word;
                # everything below depends on this structure, e.g.:
                # [['災害', [3, 40], 'C'], ['で', [41, 48], 'C'], ...]
                dianout_chongzao = cz.chongzao(l_biaozhi, l_jieguo_1, dianout,
                                               ID)
                print('dianout_chongzao')
                print(dianout_chongzao)
                # Label frame ranges: 0 for correct words, 1 otherwise.
                for i in dianout_chongzao:
                    start, end = i[1]
                    if i[2] == 'C':
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                    else:
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '1')
                path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')
                with open(path_xinde_tezhengzhi, 'w+',
                          encoding='utf-8') as mergen_file:
                    for i in t_file_list:
                        mergen_file.write('%s\n' % ','.join(i))
        # Delete every feature row that carries marker 9.
        shanchu.shanchuhang(path_xinde)
# Demo: romanise a Japanese sentence with pykakasi.
from pykakasi import kakasi

kakasi = kakasi()
for script in ('H', 'K', 'J'):
    kakasi.setMode(script, 'a')
conv = kakasi.getConverter()
print(conv.do('本日は晴天なり'))
def convert(self, inputText):
    # Romanise *inputText* line by line: mecab segments each line, kakasi
    # converts each token (preferring the mecab reading field when present),
    # then a long fix-up pass collapses っ geminates, repairs particles,
    # long vowels and punctuation.  Returns a list with one
    # {original_line: romanized_line} dict per input line.
    input = inputText
    # Protect real spaces from the later whitespace collapsing.
    input = input.replace(" ", "**SPACE**")
    lines = input.splitlines()
    ## Prepare response with dict
    romanized = []
    for line in lines:
        text = line
        # mecab output: one "surface\tfeature,feature,..." line per token.
        chunklines = mecab_tagger.parse(text).splitlines()[:-1]
        parsed = [[chunk.split('\t')[0],
                   tuple(chunk.split('\t')[1].split(','))]
                  for chunk in chunklines]
        ## Parse
        romanizedLine = []
        for i in parsed:
            # now for each i[0] do romaji
            conv = kakasi.getConverter()
            finalResult = None
            # ignore calculation if initial string is numeric
            if is_number(i[0]):
                finalResult = ""+i[0]
            # ignore calculation if string has non JP chars
            if finalResult == None and is_japanese(i[0]) == False:
                finalResult = i[0]
            if finalResult == None:
                result1 = None
                # Field 7 of the mecab feature tuple is the reading,
                # when mecab supplies one.
                if len(i) == 2 and len(i[1]) > 8:
                    result1 = conv.do(i[1][7])
                result2 = conv.do(i[0])
                # NOTE(review): all three branches below assign the same
                # value (result2 + " ") — result1 is computed but unused.
                if result1 == None:
                    finalResult = result2+" "
                elif result1 != None and result2 != result1:
                    finalResult = result2+" "
                else:
                    finalResult = result2+" "
                # print("r1 "+result1)
                # print("r2 "+result2)
            romanizedLine.append(finalResult)
        pair = {}
        romanizedLine = "".join(romanizedLine)
        # Topic particle は is pronounced "wa".
        romanizedLine = romanizedLine.replace(" ha ", " wa ")
        ## Collapse っ #k
        # NOTE(review): "tsuka" breaks the kka/kke/kki pattern of the other
        # entries — looks like a typo for "kka"; confirm intent.
        romanizedLine = romanizedLine.replace("tsu ka ", "tsuka")
        romanizedLine = romanizedLine.replace("tsu ke ", "kke")
        romanizedLine = romanizedLine.replace("tsu ki ", "kki")
        romanizedLine = romanizedLine.replace("tsu ko ", "kko")
        romanizedLine = romanizedLine.replace("tsu ku ", "kku")
        ## Collapse っ #s
        romanizedLine = romanizedLine.replace("tsu sa ", "ssa")
        romanizedLine = romanizedLine.replace("tsu se ", "sse")
        romanizedLine = romanizedLine.replace("tsu si ", "ssi")
        romanizedLine = romanizedLine.replace("tsu so ", "sso")
        romanizedLine = romanizedLine.replace("tsu su ", "ssu")
        ## Collapse っ #t
        romanizedLine = romanizedLine.replace("tsu ta ", "tta")
        romanizedLine = romanizedLine.replace("tsu te ", "tte")
        romanizedLine = romanizedLine.replace("tsu ti ", "tti")
        romanizedLine = romanizedLine.replace("tsu to ", "tto")
        romanizedLine = romanizedLine.replace("tsu tu ", "ttu")
        ## Collapse っ #p
        romanizedLine = romanizedLine.replace("tsu pa ", "ppa")
        romanizedLine = romanizedLine.replace("tsu pe ", "ppe")
        romanizedLine = romanizedLine.replace("tsu pi ", "ppi")
        romanizedLine = romanizedLine.replace("tsu po ", "ppo")
        ## Dangling letters
        romanizedLine = romanizedLine.replace(" u ", "u ")
        romanizedLine = romanizedLine.replace(" i ", "i ")
        ## Other fixes, after tsu particle
        romanizedLine = romanizedLine.replace(" nai ", "nai ")
        romanizedLine = romanizedLine.replace(" ta ", "ta ")
        romanizedLine = romanizedLine.replace(" te ", "te ")
        romanizedLine = romanizedLine.replace(" ten ", "ten ")
        romanizedLine = romanizedLine.replace(" ku ", "ku ")
        romanizedLine = romanizedLine.replace(" ba ", "ba ")
        romanizedLine = romanizedLine.replace(" ka ", "ka ")
        romanizedLine = romanizedLine.replace(" ze ", "ze ")
        romanizedLine = romanizedLine.replace(" ga ", "ga ")
        romanizedLine = romanizedLine.replace(" re ", "re ")
        ## Extended letters
        romanizedLine = romanizedLine.replace("a-", "ā")
        romanizedLine = romanizedLine.replace("e-", "ē")
        romanizedLine = romanizedLine.replace("i-", "ī")
        romanizedLine = romanizedLine.replace("o-", "ō")
        romanizedLine = romanizedLine.replace("u-", "ū")
        ## Special characters / Punctuation
        ## https://en.wikipedia.org/wiki/List_of_Japanese_typographic_symbols
        romanizedLine = romanizedLine.replace("「", "'")
        romanizedLine = romanizedLine.replace("」", "'")
        romanizedLine = romanizedLine.replace("『", "\"")
        romanizedLine = romanizedLine.replace("』", "\"")
        romanizedLine = romanizedLine.replace("(", "(")
        romanizedLine = romanizedLine.replace(")", ")")
        romanizedLine = romanizedLine.replace("〔", "[")
        romanizedLine = romanizedLine.replace("〕", "]")
        romanizedLine = romanizedLine.replace("[", "[")
        romanizedLine = romanizedLine.replace("]", "]")
        romanizedLine = romanizedLine.replace("{", "{")
        romanizedLine = romanizedLine.replace("}", "}")
        romanizedLine = romanizedLine.replace("⦅", "((")
        romanizedLine = romanizedLine.replace("⦆", "))")
        romanizedLine = romanizedLine.replace("〈", "‹")
        romanizedLine = romanizedLine.replace("〉", "›")
        romanizedLine = romanizedLine.replace("《", "«")
        romanizedLine = romanizedLine.replace("》", "»")
        romanizedLine = romanizedLine.replace("【", "[")
        romanizedLine = romanizedLine.replace("】", "]")
        romanizedLine = romanizedLine.replace("〖", "[")
        romanizedLine = romanizedLine.replace("〗", "]")
        romanizedLine = romanizedLine.replace("〘", "[")
        romanizedLine = romanizedLine.replace("〙", "]")
        romanizedLine = romanizedLine.replace("〚", "[")
        romanizedLine = romanizedLine.replace("〛", "]")
        romanizedLine = romanizedLine.replace("。", ".")
        romanizedLine = romanizedLine.replace("、", ",")
        romanizedLine = romanizedLine.replace("・", "·")
        romanizedLine = romanizedLine.replace("゠", "–")
        romanizedLine = romanizedLine.replace("=", "—")
        romanizedLine = romanizedLine.replace("…", "...")
        romanizedLine = romanizedLine.replace("‥", "..")
        ## Custom tokens and fixes
        romanizedLine = romanizedLine.replace("**SPACE**", " ")
        text = text.replace("**SPACE**", " ")
        ## Remove multiple spaces
        romanizedLine = romanizedLine.strip()
        romanizedLine = " ".join(romanizedLine.split())
        pair[text] = romanizedLine.strip()
        romanized.append(pair)
    return romanized
def dabiaoqian(path):
    # "Label maker" (align1 variant): walks each batch directory under
    # *path*, compares Julius recognition output against the reference
    # transcript per utterance id and prepends a label to every
    # feature-vector row: '0' = correctly recognised word, '1' = wrong,
    # '9' = leading silence / trailing full stop (rows labelled 9 are
    # deleted at the end).  Relies on project helpers mulu, zidian, pi,
    # shanchu, make_kana_convertor and strQ2B defined elsewhere.
    from pykakasi import kakasi
    BASE_DIRS = path               # batch root
    name_tezheng = 'log'           # directory holding the feature files
    xinde = 'xinde_log'            # directory for the new labelled features
    houzhui = '.wav.csv'           # feature-file suffix after the id
    name = 'align1'                # file carrying the C/D/I/S markers
    shibiejieguo = {}              # recognition results per id (unused here)
    # symbolcidian maps id -> marker list, e.g.
    #   id: C001L_086 -> ['S', 'S', 'S', 'C', 'S', 'D', 'D', 'D', 'C']
    symbolcidian = {}
    zhengjie = {}                  # reference transcript per id
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()
    for per_dirs in os.listdir(BASE_DIRS):  # per_dirs = C001L, C001R, ...
        d_9 = os.path.join(BASE_DIRS, per_dirs, xinde)
        d = os.path.join(BASE_DIRS, per_dirs, xinde)
        mulu.mkdir(d)
        # Fill the reference and marker dicts from the marker file.
        zhengjie, symbolcidian = zidian.zidian(per_dirs, BASE_DIRS)
        for id in os.listdir(os.path.join(BASE_DIRS, per_dirs, name_tezheng)):
            banyun_1 = []  # indices of 'C' markers
            banyun_2 = []  # correctly recognised words
            banyun_3 = []  # indices of 'S' (substituted) markers
            banyun_4 = []  # romaji of the (for now) wrong words
            dianout = []
            id = id.replace(houzhui, '')  # strip .wav.csv to get the bare id
            enumerate(symbolcidian[id])   # NOTE(review): result unused
            banyun_1 = [i for i, x in enumerate(symbolcidian[id]) if x == 'C']
            banyun_3 = [i for i, x in enumerate(symbolcidian[id]) if x == 'S']
            t_file = os.path.join(BASE_DIRS, per_dirs, name_tezheng,
                                  id + houzhui)
            a = csv.reader(open(t_file, 'r', encoding='utf-8'))
            t_file_list = [i for i in a]
            # if len(banyun_1) == 0:  # nothing correct: label every row 1
            #     for i in range(len(t_file_list)):
            #         t_file_list[i].insert(0, '1')
            # Collect the correct words; there may be fewer reference words
            # than markers, in which case manual fixing is required.
            for u in banyun_1:
                if u + 1 <= len(zhengjie[id]):
                    banyun_2.append(zhengjie[id][u])
                else:
                    print("手动调一下这个文件吧%s" % id)
                    print("它的正确单词是")
                    print(banyun_2)
                    os.system("pause")
            # Collect the romaji of the substituted words.
            for w in banyun_3:
                if w + 1 <= len(zhengjie[id]):
                    result = conv.do(zhengjie[id][w])
                    banyun_4.append(result)
                    # if result == zhengjie[id][w] and zhengjie[id][w] != '、':
                    #     banyun_4.append(conv.do(_make_kana_convertor(strQ2B(zhengjie[id][w]))))
                    # else:
                    #     banyun_4.append(result)
                else:
                    print("手动调一下这个文件吧%s" % id)
                    print("它的认识出现错误的单词是")
                    print(banyun_4)
                    os.system("pause")
            # Frame alignment from the .out file; same format as the Julius
            # result, e.g. [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]],
            # ['三', [50, 82]], ['。', [83, 86]]]
            dir_out = os.path.join(BASE_DIRS, per_dirs, 'keka', id + '.out')
            dianout = pi.read_out(dir_out)
            start = dianout.pop(0)[1][1]
            # Leading silent frames all get label 9 (treated as correct).
            for i in range(start + 1):
                t_file_list[i].insert(0, '9')
            # dianout pairs each recognised word with its frame range.
            for y in dianout:
                # Does the word's frame range fit within the feature rows?
                if y[1][1] + 1 <= len(t_file_list):
                    if y[0] == '':
                        continue  # skip leading-silence entries
                    if y[0] == dianout[-1][0]:
                        # Final full stop: label its whole range 9 (there is
                        # a mirrored branch below for the overflow case).
                        start, end = y[1]
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '9')
                        continue
                    if y[0] in banyun_2:
                        # Correct word: label its frames 0.
                        start, end = y[1]
                        print("正在为文件 %s 的单词 %s 打标签"
                              % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                        banyun_2.remove(y[0])  # consume the word once used
                    elif conv.do(y[0]) == y[0] and y[0] != '、':
                        # Unchanged by conv.do => latin letters: half-width
                        # them, convert to kana, then compare the romaji.
                        print("发现识别结果中的字母%s" % y[0])
                        print("它在文件%s" % dir_out)
                        try:
                            zhuanhuazhi = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    strQ2B.strQ2B(y[0])))
                        except:
                            zhuanhuazhi = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    y[0]))
                        if zhuanhuazhi in banyun_4:
                            print("转化之后的字母为%s" % zhuanhuazhi)
                            start, end = y[1]
                            print("正在为文件 %s 的单词 %s 打标签"
                                  % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '0')
                            banyun_4.remove(zhuanhuazhi)
                        else:
                            start, end = y[1]
                            print("正在为文件 %s 的单词 %s 打标签"
                                  % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '1')
                    elif conv.do(y[0]) in banyun_4:
                        start, end = y[1]
                        print("正在为文件 %s 的单词 %s 打标签"
                              % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                        banyun_4.remove(conv.do(y[0]))
                    else:
                        # Word not matched anywhere: label its frames 1.
                        start, end = y[1]
                        print("正在为文件 %s 的单词 %s 打标签"
                              % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '1')
                elif y[1][1] + 1 > len(t_file_list):
                    # The frame range overruns the feature rows: clamp end
                    # to the row count and repeat the same labelling logic.
                    if y[0] == '':
                        continue
                    if y[0] == dianout[-1][0]:
                        start = y[1][0]
                        end = len(t_file_list)
                        # end is exclusive here since the range is clamped.
                        for i in range(start, end):
                            t_file_list[i].insert(0, '9')
                        continue
                    if y[0] in banyun_2:
                        start = y[1][0]
                        end = len(t_file_list)
                        print("正在为文件 %s 的单词 %s 打标签"
                              % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end):
                            t_file_list[i].insert(0, '0')
                        banyun_2.remove(y[0])
                    elif conv.do(y[0]) == y[0] and y[0] != '、':
                        # NOTE(review): these two branches use
                        # range(start, end + 1) with end == len(t_file_list),
                        # which would raise IndexError if ever reached —
                        # likely a copy/paste slip from the non-clamped path.
                        if conv.do(make_kana_convertor._make_kana_convertor(
                                y[0])) in banyun_4:
                            start = y[1][0]
                            end = len(t_file_list)
                            print("正在为文件 %s 的单词 %s 打标签"
                                  % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '0')
                            banyun_4.remove(
                                conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        y[0])))
                        else:
                            start = y[1][0]
                            end = len(t_file_list)
                            print("正在为文件 %s 的单词 %s 打标签"
                                  % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '1')
                    else:
                        start = y[1][0]
                        end = len(t_file_list)
                        print("正在为文件 %s 的单词 %s 打标签"
                              % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end):
                            t_file_list[i].insert(0, '1')
            with open(os.path.join(BASE_DIRS, per_dirs, xinde, id + '.csv'),
                      'w+', encoding='utf-8') as mergen_file:
                for i in t_file_list:
                    mergen_file.write('%s\n' % ','.join(i))
        # Delete every feature row that carries marker 9.
        shanchu.shanchuhang(d_9)
def dabiaoqian(path):
    """Label acoustic-feature frames per word from ASR alignment results.

    For every session sub-directory of *path* this reads:
      * the sclite-style alignment file ``align1`` (REF/HYP/EVAL triples,
        EUC-JP encoded),
      * the per-utterance word/frame tables ``keka/<ID>.out`` (via
        ``pi.read_out``),
      * the per-utterance feature CSVs ``log/<ID>.wav.csv``,
    then prepends one label column to every feature row:
      ``'9'`` leading silence / trailing period region,
      ``'1'`` frames of a word whose romanized HYP differs from REF,
      ``'0'`` everything else (treated as correctly recognized),
    and writes the labelled CSVs to ``xinde_log/<ID>.csv``.  Finally
    ``shanchu.shanchuhang`` drops the rows labelled ``'9'``.

    NOTE(review): the original file was whitespace-mangled; the indentation
    below is a reconstruction — statement ORDER is preserved exactly.

    :param path: directory containing one sub-directory per recording session
    """
    from pykakasi import kakasi
    import csv, os
    name_tezheng = 'log'  # name of the folder that holds the feature-value files
    xinde = 'xinde_log'  # folder that receives the newly labelled feature files
    houzhui = '.wav.csv'  # suffix of a feature file after its id part
    name = 'align1'  # file recording the C/S/D/I alignment flags
    name1 = 'align1'
    name2 = 'align1.txt'
    shibiejieguo = {}  # dict holding the recognition results
    symbolcidian = {}  # flag dict, shaped like:
    # id: C001L_086
    # ['S', 'S', 'S', 'C', 'S', 'D', 'D', 'D', 'C']
    # id: C001L_087
    # ['S', 'D', 'D', 'C']
    # id: C001L_088
    # ['S', 'S', 'S', 'S', 'D', 'D', 'D', 'D', 'C', 'C']
    zhengjie = {}  # dict for the reference (correct) transcripts
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()
    for i in os.listdir(path):
        path_1 = os.path.join(path, i)
        path_out = os.path.join(path_1, 'keka')
        path_tezheng = os.path.join(path_1, name_tezheng)
        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name1), 'r',
                 encoding='EUC-JP'))  # read in the alignment-flag file
        # biaozhiwenjian = csv.reader(open(os.path.join(path_1, name2), 'r', encoding='utf-8'))  # use this when the flag file is a .txt file
        # Materialize as a list of single-element lists, e.g.
        # [['id: l_8840_9810_T1_F_01'], ['REF: そう です か 、 はい 。 '],
        #  ['HYP: そう です か はい 。 '], ['EVAL: C C C D C C '], [],
        #  ['id: l_10800_13190_T1_F_01']]
        biaozhiwenjian_1 = [i for i in biaozhiwenjian]
        # print(biaozhiwenjian_1)
        # os.system('pause')
        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)
        # Records come in groups of 5 lines (id/REF/HYP/EVAL/blank);
        # each iteration labels one audio file.
        for i in range(0, len(biaozhiwenjian_1), 5):
            ID = ''
            l_biaozhi = []
            l_zhengjie = []
            l_zhengjie_1 = []
            l_jieguo = []
            l_jieguo_1 = []
            ID = biaozhiwenjian_1[i][0].replace('id: ', '')
            l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
            l_zhengjie.pop(0)  # drop the 'REF:' token
            l_jieguo = biaozhiwenjian_1[i + 2][0].split()
            l_jieguo.pop(0)  # drop the 'HYP:' token
            l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
            l_biaozhi.pop(0)  # drop the 'EVAL:' token
            # try:
            #     ID = biaozhiwenjian_1[i].replace('id: ', '')
            #     l_zhengjie = biaozhiwenjian_1[i+1].split()
            #     l_zhengjie.pop(0)
            #     l_jieguo = biaozhiwenjian_1[i+2].split()
            #     l_jieguo.pop(0)
            #     l_biaozhi = biaozhiwenjian_1[i+3].split()
            #     l_biaozhi.pop(0)
            # except:
            #     print(biaozhiwenjian_1[i])
            #     os.system("pause")
            # Build strictly aligned reference / hypothesis lists: for a
            # deletion ('D') the hypothesis entry is empty, for an
            # insertion ('I') the reference entry is empty.
            jishuqi_jieguo = 0
            jishuqi_zhengjie = 0
            for i in l_biaozhi:
                if i == "D":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append('')
                    jishuqi_zhengjie += 1
                if i == "C":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    jishuqi_zhengjie += 1
                    jishuqi_jieguo += 1
                if i == "I":
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    l_zhengjie_1.append('')
                    jishuqi_jieguo += 1
                if i == "S":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    jishuqi_zhengjie += 1
                    jishuqi_jieguo += 1
            # print(l_jieguo_1)
            # print(l_zhengjie_1)
            # print(l_biaozhi)
            # os.system('pause')
            path_out_1 = os.path.join(path_out, ID + '.out')
            dianout = pi.read_out(path_out_1)
            # print(dianout)
            # os.system('pause')
            path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
            tezhengzhi = csv.reader(open(path_tezheng_1, 'r', encoding='utf-8'))
            t_file_list = [i for i in tezhengzhi]
            dimension = len(t_file_list[0])  # column count of an unlabelled row
            start = dianout.pop(0)[1][1]  # end frame of the leading silent interval
            for i in range(start + 1):
                # All leading silent frames get label 9 and are treated as
                # correctly recognized.
                t_file_list[i].insert(0, '9')
            zhenshubiao = {}  # frame-range table, one entry per word
            for i in dianout:
                zhenshubiao[i[0]] = i[1]  # map each word of the .out file to its frame range
            start, end = zhenshubiao['。']  # label the trailing period region with 9
            for i in range(start, end + 1):
                t_file_list[i].insert(0, '9')
            # print(dianout)
            # os.system('pause')
            # Resulting shape:
            # [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]
            # while 'D' in l_biaozhi:
            #     l_biaozhi.remove('D')  # remove() deletes only one 'D' per call, hence the while
            l_biaozhi_1 = [i for i, x in enumerate(l_biaozhi)
                           if x == 'S']  # indices of the 'S' (substitution) flags
            # print(len(l_biaozhi_1))
            # os.system('pause')
            if len(l_biaozhi_1) != 0:  # empty list means every word was recognized correctly
                # print('l_jieguo_1')
                # print(l_jieguo_1)
                # print('l_biaozhi_1')
                # print(l_biaozhi_1)
                # print('l_biaozhi')
                # print(l_biaozhi)
                # print('l_zhengjie_1')
                # print(l_zhengjie_1)
                # print(l_jieguo_1)
                # print(l_zhengjie_1)
                # print(l_biaozhi)
                # For each S-flagged word, romanize both the reference and
                # the hypothesis and compare them once more.
                for y in l_biaozhi_1:
                    # print("现在输出y的值")
                    # print(y)
                    # print('现在输出l_jieguo_1[y]')
                    # print(l_jieguo_1[y])
                    # print(ID)
                    # os.system('pause')
                    # First handle the recognition result.  A token left
                    # unchanged by romanization is already alphabetic, so it
                    # is converted to kana first, then romanized.
                    if conv.do(
                            l_jieguo_1[y]
                    ) == l_jieguo_1[y] and l_jieguo_1[y] != '、':
                        try:
                            zhuanhuan_jieguo = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    strQ2B.strQ2B(l_jieguo_1[y])))
                        except:
                            # fall back without full-width-to-half-width conversion
                            zhuanhuan_jieguo = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    l_jieguo_1[y]))
                    else:
                        zhuanhuan_jieguo = conv.do(l_jieguo_1[y])
                    # Then handle the reference transcript the same way.
                    if conv.do(
                            l_zhengjie_1[y]
                    ) == l_zhengjie_1[y] and l_zhengjie_1[y] != '、':
                        try:
                            zhuanhuan_zhengjie = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    strQ2B.strQ2B(l_zhengjie_1[y])))
                        except:
                            zhuanhuan_zhengjie = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    l_zhengjie_1[y]))
                    else:
                        zhuanhuan_zhengjie = conv.do(l_zhengjie_1[y])
                    # print('l_jieguo_1[y]')
                    # print(l_jieguo_1[y])
                    # os.system('pause')
                    guanjianzi = l_jieguo_1[y]  # the S-flagged word itself
                    # print('guanjianzi')
                    # print(guanjianzi)
                    # os.system('pause')
                    # print('zhenshubiao')
                    # print(zhenshubiao[guanjianzi])
                    # os.system('pause')
                    try:
                        start, end = zhenshubiao[guanjianzi]  # frame range of this word
                    except:
                        # NOTE(review): on a KeyError start/end keep their
                        # previous values and the loop below still runs —
                        # presumably intentional after the manual pause; confirm.
                        print('ID')
                        print(ID)
                        print('zhenshubiao')
                        print(zhenshubiao)
                        print('guanjianzi')
                        print(guanjianzi)
                        os.system('pause')
                    for i in range(start, end + 1):
                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:
                            t_file_list[i].insert(0, '0')
                        else:
                            t_file_list[i].insert(0, '1')
            # Label the remaining (correctly recognized) rows with 0.
            jishuqi_tezhengzhi = 0
            for i in t_file_list:
                # if i[0] != '0' and i[0] != '1' and i[0] != '9':
                # NOTE(review): this looks like it may be intended as
                # len(i) == dimension (row has not received a label column
                # yet) rather than len(i[0]) — confirm against the data.
                if len(i[0]) == dimension:
                    t_file_list[jishuqi_tezhengzhi].insert(0, '0')
                jishuqi_tezhengzhi += 1
            path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')
            with open(path_xinde_tezhengzhi, 'w+',
                      encoding='utf-8') as mergen_file:
                for i in t_file_list:
                    mergen_file.write('%s\n' % ','.join(i))
        # NOTE(review): placement reconstructed — assumed to run once per
        # session directory, after all of its utterances; confirm.
        shanchu.shanchuhang(path_xinde)  # delete every feature row labelled 9
import urllib.request
import json
from pykakasi import kakasi, wakati

# Source JSON mapping emoji to Japanese keywords, and the output TSV path.
EMOJI_JSON_URL = 'https://raw.githubusercontent.com/yagays/emoji-ja/20190726/data/emoji_ja.json'
EMOJI_DICT_PATH = 'tsv/emoji.tsv'

# Module-level converters (legacy pykakasi API; note this rebinds the
# imported `kakasi` name to an instance):
# conv_j2h: Kanji -> Hiragana, conv_k2h: Katakana -> Hiragana.
kakasi = kakasi()
kakasi.setMode("J", "H")
conv_j2h = kakasi.getConverter()
kakasi.setMode("K", "H")
conv_k2h = kakasi.getConverter()


def hiraganafy(keyword):
    """Return *keyword* upper-cased and converted to hiragana.

    Kanji is converted first, then katakana, using the module-level
    pykakasi converters.
    """
    k = keyword.upper()
    k = conv_j2h.do(k)
    k = conv_k2h.do(k)
    return k


def add_word_to_dict(emoji, keyword, emoji_dict):
    """Append one IME-dictionary line for *emoji* to *emoji_dict* (in place).

    'ゔ' is rewritten as 'う゛' to keep the keyword valid for the
    dictionary format; the line is reading<TAB>emoji<TAB>記号<TAB>.
    """
    valid_keyword = keyword.replace('ゔ', 'う゛')
    word = f':{valid_keyword}\t{emoji}\t記号\t'
    emoji_dict.append(word)


class EmojiDict():
    # Class-level state shared by the methods (defined beyond this chunk):
    # emoji_json — parsed emoji JSON, emoji_dict — accumulated TSV lines.
    emoji_json = None
    emoji_dict = []