Example #1
    def __init__(self, meta_file, dataset_root_dir, known_unique_speakers=[]):
        random.seed(1234)
        self.root_dir = dataset_root_dir

        # read meta-file: id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text
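        # e.g. one line, with hypothetical values:
        # 0001|spk01|en|data/wav/0001.wav|data/mel/0001.npy|data/lin/0001.npy|Hello world.|hɛloʊ wɜːld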

        self.unique_speakers = known_unique_speakers.copy()
        unique_speakers_set = set(self.unique_speakers)
        self.items = []
        with open(meta_file, 'r', encoding='utf-8') as f:
            for line in f:
                line_tokens = line.rstrip('\r\n').split('|')  # don't rely on a trailing newline being present
                item = {
                    'id': line_tokens[0],
                    'speaker': line_tokens[1],
                    'language': line_tokens[2],
                    'audio': line_tokens[3],
                    'spectrogram': line_tokens[4],
                    'linear_spectrogram': line_tokens[5],
                    'text': line_tokens[6],
                    'phonemes': line_tokens[7]
                }
                if item['language'] in hp.languages:
                    if line_tokens[1] not in unique_speakers_set:
                        unique_speakers_set.add(line_tokens[1])
                        self.unique_speakers.append(line_tokens[1])
                    self.items.append(item)

        # basic text cleaning: punctuation, letter case and repeated whitespace
        for idx in range(len(self.items)):
            item_text = self.items[idx]['text']
            item_phon = self.items[idx]['phonemes'] 
            if not hp.use_punctuation: 
                item_text = text.remove_punctuation(item_text)
                item_phon = text.remove_punctuation(item_phon)
            if not hp.case_sensitive: 
                item_text = text.to_lower(item_text)
            if hp.remove_multiple_wspaces: 
                item_text = text.remove_odd_whitespaces(item_text)
                item_phon = text.remove_odd_whitespaces(item_phon)
            self.items[idx]['text'] = item_text
            self.items[idx]['phonemes'] = item_phon
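            # e.g. with use_punctuation=False, case_sensitive=False and
            # remove_multiple_wspaces=True, "Hello,   World!" ends up as
            # "hello world" (hypothetical sample; the behaviour lives in the text module)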

        # convert text into a sequence of character IDs, convert language and speaker names to IDs
        for idx in range(len(self.items)):
            self.items[idx]['phonemes'] = text.to_sequence(self.items[idx]['phonemes'], use_phonemes=True)
            self.items[idx]['text'] = text.to_sequence(self.items[idx]['text'], use_phonemes=False)
            self.items[idx]['speaker'] = self.unique_speakers.index(self.items[idx]['speaker'])
            self.items[idx]['language'] = hp.languages.index(self.items[idx]['language'])
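
After __init__ finishes, every entry of self.items holds numeric IDs instead of raw strings. A minimal sketch of one preprocessed item, with hypothetical values (the actual ID sequences come from text.to_sequence):

item = {
    'id': '0001',
    'speaker': 0,                         # index into self.unique_speakers
    'language': 0,                        # index into hp.languages
    'audio': 'data/wav/0001.wav',
    'spectrogram': 'data/mel/0001.npy',
    'linear_spectrogram': 'data/lin/0001.npy',
    'text': [12, 5, 20, 20, 24],          # character IDs
    'phonemes': [34, 8, 19, 27],          # phoneme IDs
}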
Example #2

def synthesize(model, input_data, force_cpu=False):

    item = input_data.split('|')
    clean_text = item[1]

    if not hp.use_punctuation:
        clean_text = text.remove_punctuation(clean_text)
    if not hp.case_sensitive:
        clean_text = text.to_lower(clean_text)
    if hp.remove_multiple_wspaces:
        clean_text = text.remove_odd_whitespaces(clean_text)

    t = torch.LongTensor(
        text.to_sequence(clean_text, use_phonemes=hp.use_phonemes))

    if hp.multi_language:
        # item[3] assigns language(s) to spans of the text,
        # e.g. l1-(len1),l2*0.75:l3*0.25-(len2),l1
        # l_tokens: the comma-separated per-span specs
        l_tokens = item[3].split(',')
        t_length = len(clean_text) + 1
        # l collects one language-weight vector per token
        # (language_number values each, so language_number * token_num values in total)
        l = []
        for token in l_tokens:
            # l_d: [l2*0.75:l3*0.25,(len2)]
            l_d = token.split('-')
            # language: [0,0,...,0]
            language = [0] * hp.language_number
            for l_cw in l_d[0].split(':'):
                # l_cw: l2*0.75 / l3*0.25
                # l_cw_s: list, [l2,0.75]
                l_cw_s = l_cw.split('*')
                language[hp.languages.index(
                    l_cw_s[0])] = 1 if len(l_cw_s) == 1 else float(l_cw_s[1])

            # language: [0,0.75,0.25,...,0]
            # language_length: int, the (len2) above; how many tokens this
            # spec covers, defaulting to all remaining tokens
            language_length = (int(l_d[1]) if len(l_d) == 2 else t_length)
            # l: one language-weight vector per covered token, e.g. [0, 0.75, 0.25, ..., 0]
            l += [language] * language_length
            t_length -= language_length
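        # Worked example (assumed input): with hp.languages == ['en', 'de', 'cs']
        # and item[3] == 'en*0.75:de*0.25-3,cs', the loop above yields
        # l == [[0.75, 0.25, 0]] * 3 + [[0, 0, 1]] * remaining_tokens,
        # i.e. one weight vector per input token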
        l = torch.FloatTensor([l])
    else:
        l = None

    # s: a single-element tensor holding the speaker ID
    s = torch.LongTensor([hp.unique_speakers.index(item[2])]) if hp.multi_speaker else None

    if torch.cuda.is_available() and not force_cpu:
        t = t.cuda(non_blocking=True)
        if l is not None: l = l.cuda(non_blocking=True)
        if s is not None: s = s.cuda(non_blocking=True)

    # s: single-element speaker-ID tensor (the name is reused below for the spectrogram)
    # l: token_num language-weight vectors of length language_number
    s = model.inference(t, speaker=s, language=l).cpu().detach().numpy()
    s = audio.denormalize_spectrogram(s, not hp.predict_linear)

    return s
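
A hypothetical call (model, the hp hyper-parameters and hp.unique_speakers are assumed to be loaded already); the input string follows the id|text|speaker|languages layout parsed above, with 'en-6,de' meaning English weights for the first 6 tokens and German for the rest:

spec = synthesize(model, "utt1|Hello world|spk01|en-6,de", force_cpu=True)
print(spec.shape)  # a 2-D NumPy spectrogram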
Example #3
def synthesize(model, input_data, force_cpu=False):

    item = input_data.split('|')
    clean_text = item[1]

    if not hp.use_punctuation:
        clean_text = text.remove_punctuation(clean_text)
    if not hp.case_sensitive:
        clean_text = text.to_lower(clean_text)
    if hp.remove_multiple_wspaces:
        clean_text = text.remove_odd_whitespaces(clean_text)

    t = torch.LongTensor(
        text.to_sequence(clean_text, use_phonemes=hp.use_phonemes))

    if hp.multi_language:
        l_tokens = item[3].split(',')
        t_length = len(clean_text) + 1
        l = []
        for token in l_tokens:
            l_d = token.split('-')

            language = [0] * hp.language_number
            for l_cw in l_d[0].split(':'):
                l_cw_s = l_cw.split('*')
                language[hp.languages.index(
                    l_cw_s[0])] = 1 if len(l_cw_s) == 1 else float(l_cw_s[1])

            language_length = (int(l_d[1]) if len(l_d) == 2 else t_length)
            l += [language] * language_length
            t_length -= language_length
        l = torch.FloatTensor([l])
    else:
        l = None

    s = torch.LongTensor([hp.unique_speakers.index(item[2])]) if hp.multi_speaker else None

    if torch.cuda.is_available() and not force_cpu:
        t = t.cuda(non_blocking=True)
        if l is not None: l = l.cuda(non_blocking=True)
        if s is not None: s = s.cuda(non_blocking=True)

    s = model.inference(t, speaker=s, language=l).cpu().detach().numpy()
    s = audio.denormalize_spectrogram(s, not hp.predict_linear)

    return s
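
To actually listen to the output, the spectrogram still has to be inverted to a waveform. A sketch under stated assumptions: inverse_spectrogram is assumed to exist in the repo's audio module alongside denormalize_spectrogram, and hp.sample_rate is an assumed hyper-parameter.

from scipy.io import wavfile

spec = synthesize(model, "utt1|Hello world|spk01|en", force_cpu=True)
waveform = audio.inverse_spectrogram(spec, not hp.predict_linear)  # assumed helper
wavfile.write('output.wav', hp.sample_rate, waveform)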