import random

# 'hp' (hyper-parameters) and 'text' are modules provided by the surrounding project.

def __init__(self, meta_file, dataset_root_dir, known_unique_speakers=[]):
    random.seed(1234)
    self.root_dir = dataset_root_dir

    # Read the meta-file; every line has the format:
    # id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text
    self.unique_speakers = known_unique_speakers.copy()
    unique_speakers_set = set(self.unique_speakers)
    self.items = []
    with open(meta_file, 'r', encoding='utf-8') as f:
        for line in f:
            line_tokens = line[:-1].split('|')
            item = {
                'id': line_tokens[0],
                'speaker': line_tokens[1],
                'language': line_tokens[2],
                'audio': line_tokens[3],
                'spectrogram': line_tokens[4],
                'linear_spectrogram': line_tokens[5],
                'text': line_tokens[6],
                'phonemes': line_tokens[7]
            }
            # Keep only items whose language is enabled; register unseen speakers.
            if item['language'] in hp.languages:
                if line_tokens[1] not in unique_speakers_set:
                    unique_speakers_set.add(line_tokens[1])
                    self.unique_speakers.append(line_tokens[1])
                self.items.append(item)

    # Basic text clean-up: punctuation, case sensitivity and multiple whitespaces.
    for idx in range(len(self.items)):
        item_text = self.items[idx]['text']
        item_phon = self.items[idx]['phonemes']
        if not hp.use_punctuation:
            item_text = text.remove_punctuation(item_text)
            item_phon = text.remove_punctuation(item_phon)
        if not hp.case_sensitive:
            item_text = text.to_lower(item_text)
        if hp.remove_multiple_wspaces:
            item_text = text.remove_odd_whitespaces(item_text)
            item_phon = text.remove_odd_whitespaces(item_phon)
        self.items[idx]['text'] = item_text
        self.items[idx]['phonemes'] = item_phon

    # Convert texts into sequences of character IDs, and speaker/language names into integer IDs.
    for idx in range(len(self.items)):
        self.items[idx]['phonemes'] = text.to_sequence(self.items[idx]['phonemes'], use_phonemes=True)
        self.items[idx]['text'] = text.to_sequence(self.items[idx]['text'], use_phonemes=False)
        self.items[idx]['speaker'] = self.unique_speakers.index(self.items[idx]['speaker'])
        self.items[idx]['language'] = hp.languages.index(self.items[idx]['language'])
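# A minimal usage sketch. Assumptions: the class name TextToSpeechDataset is
# hypothetical (only __init__ is shown above), and hp.languages is assumed to
# be a list of language codes such as ['en', 'fr'].

# One meta-file line in the documented format:
# id|speaker|language|audio|mel|linear|text|phonemes
# e.g. "0001|spk1|en|wavs/0001.wav|mels/0001.npy|lins/0001.npy|Hello world.|həˈloʊ wɜːld."

dataset = TextToSpeechDataset(
    meta_file='data/train.txt',        # file of pipe-separated lines as above
    dataset_root_dir='data',           # stored as self.root_dir
    known_unique_speakers=['spk1'])    # optional pre-seeded speakers keep IDs stable

item = dataset.items[0]
print(item['text'])      # list of character IDs (see text.to_sequence)
print(item['speaker'])   # integer index into dataset.unique_speakers
print(item['language'])  # integer index into hp.languages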
import torch

# 'hp', 'text' and 'audio' are modules provided by the surrounding project.

def synthesize(model, input_data, force_cpu=False):
    item = input_data.split('|')
    clean_text = item[1]

    if not hp.use_punctuation:
        clean_text = text.remove_punctuation(clean_text)
    if not hp.case_sensitive:
        clean_text = text.to_lower(clean_text)
    if hp.remove_multiple_wspaces:
        clean_text = text.remove_odd_whitespaces(clean_text)

    t = torch.LongTensor(text.to_sequence(clean_text, use_phonemes=hp.use_phonemes))

    if hp.multi_language:
        # item[3] encodes the per-segment language weights,
        # e.g. "l1-(len1),l2*0.75:l3*0.25-(len2),l1".
        l_tokens = item[3].split(',')
        t_length = len(clean_text) + 1
        # l collects one language-weight vector of size hp.language_number per token,
        # i.e. language_number * token_num values in total.
        l = []
        for token in l_tokens:
            # l_d: weighted languages and an optional segment length,
            # e.g. ['l2*0.75:l3*0.25', '(len2)'].
            l_d = token.split('-')
            # One weight slot per supported language, initially all zero.
            language = [0] * hp.language_number
            for l_cw in l_d[0].split(':'):
                # l_cw is a single weighted language such as 'l2*0.75';
                # l_cw_s splits it into ['l2', '0.75'].
                l_cw_s = l_cw.split('*')
                language[hp.languages.index(
                    l_cw_s[0])] = 1 if len(l_cw_s) == 1 else float(l_cw_s[1])
            # language now holds the mixture weights, e.g. [0, 0.75, 0.25, ..., 0].
            # The segment length is given explicitly, or defaults to all remaining tokens.
            language_length = (int(l_d[1]) if len(l_d) == 2 else t_length)
            # Repeat the weight vector once per token covered by this segment.
            l += [language] * language_length
            t_length -= language_length
        l = torch.FloatTensor([l])
    else:
        l = None

    # s: a tensor holding a single speaker ID.
    s = torch.LongTensor([hp.unique_speakers.index(item[2])]) if hp.multi_speaker else None

    if torch.cuda.is_available() and not force_cpu:
        t = t.cuda(non_blocking=True)
        if l is not None:
            l = l.cuda(non_blocking=True)
        if s is not None:
            s = s.cuda(non_blocking=True)

    # speaker: single-element speaker-ID tensor; language: language_number weights per token.
    s = model.inference(t, speaker=s, language=l).cpu().detach().numpy()
    s = audio.denormalize_spectrogram(s, not hp.predict_linear)
    return s
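# A usage sketch. The pipe-separated layout of input_data is inferred from the
# indices read above (item[1]=text, item[2]=speaker, item[3]=language map); the
# speaker name and language codes below are illustrative assumptions only, and
# 'model' is a trained model instance loaded elsewhere.

input_data = "demo|Hello world bonjour|spk1|en-11,en*0.5:fr*0.5"
#   "en-11":          the first 11 tokens are pure English,
#   "en*0.5:fr*0.5":  the remaining tokens are a 50/50 English/French mixture
#                     (no explicit length, so it covers everything left).

spectrogram = synthesize(model, input_data, force_cpu=True)
# Returns a denormalized spectrogram as a NumPy array (mel or linear,
# depending on hp.predict_linear), ready for a vocoder.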