Example #1
def _load_and_reduce_spectrograms_and_durations(fpath, duration):
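    '''Load reduced mel/mag spectrograms for fpath and build a matching
    duration matrix: per-symbol durations are expanded into a binary alignment,
    end-padded so the frame axis is divisible by hp.r, and subsampled from the
    same random start position as the spectrograms. (The helper functions and
    the module-level hp are assumed to be defined elsewhere in this module.)
    '''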
    fname, mel, mag, random_start_position = _load_and_reduce_spectrograms(fpath)
    duration_matrix = durations_to_hard_attention_matrix(duration)
    duration_matrix = end_pad_for_reduction_shape_sync(duration_matrix, hp)
    duration_matrix = duration_matrix[random_start_position::hp.r, :]
    return fname, mel, mag, duration_matrix, random_start_position
Example #2
import ast
import codecs
import logging
import os
import re
import sys
from os.path import basename

import numpy as np
import pandas as pd
from tqdm import tqdm

## Project-local helpers referenced below (load_vocab, text_normalize,
## phones_normalize, durations_to_hard_attention_matrix,
## end_pad_for_reduction_shape_sync) are assumed to be importable from the
## surrounding module.

def load_data(hp, mode="train", audio_extension='.wav'):
    '''Loads transcript metadata (and, in train/validation, audio feature lengths).

    Args:
        mode: "train" / "validation" / "synthesis" / "demo".
        audio_extension: extension appended to each transcript basename to form fpaths.
    '''
    assert mode in ('train', 'synthesis', 'validation', 'demo')
    logging.info('Start loading data in mode: %s' % (mode))
    get_speaker_codes = (hp.multispeaker != [])  ## False if hp.multispeaker is an empty list
    dataset_df_path = os.path.join(hp.featuredir, 'dataset_' + mode + '.csv')

    # The CSV cache below is currently disabled (hence `if 0:`). The original
    # condition was `os.path.exists(dataset_df_path) and mode != 'demo'`;
    # in demo mode the "dataset" changes one line at a time, so the same
    # cached df must never be reused.
    if 0:
        dataset_df = pd.read_csv(dataset_df_path)

        dataset = {}

        # pd.eval() cannot parse the bytes representation written in train
        # mode, so fall back to ast.literal_eval when it raises AttributeError.
        try:
            dataset['texts'] = np.array(
                [pd.eval(e) for e in dataset_df['texts'].tolist()])
        except AttributeError:
            dataset['texts'] = np.array(
                [ast.literal_eval(e) for e in dataset_df['texts'].tolist()])
            # NB: this fallback seems to cause an error when training:
            # tensorflow.python.framework.errors_impl.InvalidArgumentError:
            # Input to DecodeRaw has length 105 that is not a multiple of 4,
            # the size of int32

        ## At synthesis, fpaths are only a way to get basenames -- the wav files probably do not exist.
        dataset['fpaths'] = dataset_df['fpaths'].tolist()
        ## Only used in training, where length information is lost due to the string format. TODO: good motivation for this format?
        dataset['text_lengths'] = dataset_df['text_lengths'].tolist()
        dataset['audio_lengths'] = dataset_df['audio_lengths'].tolist()  ## might be []
        dataset['label_lengths'] = dataset_df['label_lengths'].tolist()  ## might be []

        if get_speaker_codes:
            dataset['speakers'] = dataset_df['speakers'].tolist()
        if hp.use_external_durations:
            dataset['durations'] = dataset_df['durations'].tolist()

    else:
        if mode in ['synthesis', 'demo']:
            get_speaker_codes = False  ## never read speaker from transcript for synthesis -- take user-specified speaker instead

        # Load vocabulary
        char2idx, idx2char = load_vocab(hp)

        if mode in ["train", "validation"]:
            transcript = hp.transcript
        elif mode == 'synthesis':
            transcript = hp.test_transcript
        else:
            transcript = './demo/transcript.csv'

        if hp.multispeaker:
            speaker2ix = dict(zip(hp.speaker_list,
                                  range(len(hp.speaker_list))))

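        ## Accumulators, one entry per retained transcript line: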
        fpaths, text_lengths, texts, speakers, durations = [], [], [], [], []
        audio_lengths, label_lengths = [], []
        with codecs.open(transcript, 'r', 'utf-8') as f:
            lines = f.readlines()
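        ## Transcript lines are pipe-separated; from the field accesses below,
        ## the assumed layout is:
        ##   fname|unnorm_text|norm_text[|phones[|speaker[|durations]]]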

        too_long_count_frames = 0
        too_long_count_text = 0
        no_data_count = 0

        nframes = 0  ## falsy default; overwritten per utterance when coarse audio features exist
        for line in tqdm(lines, desc='load_data'):
            line = line.strip('\n\r |')
            if line == '':
                continue
            fields = line.strip().split("|")

            assert len(fields) >= 1, fields
            if len(fields) > 1:
                assert len(fields) >= 3, fields

            fname = fields[0]
            if len(fields) > 1:
                unnorm_text, norm_text = fields[1:3]
            else:
                norm_text = None  # audio-only entry: no text fields present

            if hp.validpatt:
                if mode == "train":
                    if hp.validpatt in fname:
                        continue
                elif mode == "validation":
                    if hp.validpatt not in fname:
                        continue

            if len(fields) >= 4:
                phones = fields[3]

            if norm_text is None:
                letters_or_phones = []  ## dummy empty 'text' where we are using audio only
            elif hp.input_type == 'phones':
                if 'speaker_dependent_phones' in hp.multispeaker:
                    ## The speaker must be known before phone normalisation;
                    ## the original code read `speaker` here before it was
                    ## assigned (it is only parsed from the transcript below).
                    assert len(fields) >= 5, fields
                    speaker_code = fields[4]
                else:
                    speaker_code = ''
                phones = phones_normalize(
                    phones, char2idx, speaker_code=speaker_code
                )  # in the phones case, all EOS markers are assumed included
                letters_or_phones = [char2idx[char] for char in phones]
            elif hp.input_type == 'letters':
                text = text_normalize(norm_text, hp) + "E"  # E: EOS
                letters_or_phones = [char2idx[char] for char in text]

            text_length = len(letters_or_phones)

            if text_length > hp.max_N:
                logging.debug(
                    'number of letters/phones for %s is %s, exceeds max_N %s: skip it'
                    % (fname, text_length, hp.max_N))
                too_long_count_text += 1
                continue

            if mode in ["train", "validation"] and os.path.exists(
                    hp.coarse_audio_dir):
                mel = os.path.join(hp.coarse_audio_dir, fname + ".npy")
                if not os.path.exists(mel):
                    logging.debug('no file %s' % (mel))
                    no_data_count += 1
                    continue
                nframes = np.load(mel).shape[0]
                if nframes > hp.max_T:
                    logging.debug(
                        'number of frames for %s is %s, exceeds max_T %s: skip it'
                        % (fname, nframes, hp.max_T))
                    too_long_count_frames += 1
                    continue
                audio_lengths.append(nframes)

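            ## Symbol IDs are stored as int32 arrays; train mode later
            ## serialises them to raw bytes for tf decode_raw.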
            texts.append(np.array(letters_or_phones, np.int32))

            fpath = os.path.join(hp.waveforms, fname + audio_extension)
            fpaths.append(fpath)
            text_lengths.append(text_length)

            ## parse the speaker code (5th transcript field) for the speaker embedding lookup
            if get_speaker_codes:
                assert len(fields) >= 5, fields
                speaker = fields[4]
                speaker_ix = speaker2ix[speaker]
                speakers.append(np.array(speaker_ix, np.int32))

            if hp.merlin_label_dir:  ## only get shape here -- get the data later
                try:
                    label_length, label_dim = np.load("{}/{}".format(
                        hp.merlin_label_dir,
                        basename(fpath) + ".npy")).shape
                except TypeError:  ## fpath may arrive as bytes in some call paths
                    label_length, label_dim = np.load("{}/{}".format(
                        hp.merlin_label_dir,
                        basename(fpath.decode('utf-8')) + ".npy")).shape
                label_lengths.append(label_length)
                assert label_dim == hp.merlin_lab_dim

            if hp.use_external_durations:
                assert len(fields) >= 6, fields
                duration_data = fields[5]
                duration_data = [
                    int(value)
                    for value in re.split(r'\s+', duration_data.strip(' '))
                ]
                duration_data = np.array(duration_data, np.int32)
                if hp.merlin_label_dir:
                    ## merlin labels contain no skipped items:
                    duration_data = duration_data[duration_data > 0]
                    assert len(duration_data) == label_length, (
                        len(duration_data), label_length, fpath)
                else:
                    assert len(duration_data) == text_length, (
                        len(duration_data), text_length, fpath)
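                ## Sanity check: durations are in raw-frame units; each coarse
                ## (reduced) frame covers hp.r raw frames.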
                if nframes:
                    assert duration_data.sum() == nframes * hp.r, (
                        duration_data.sum(), nframes * hp.r)
                durations.append(duration_data)



        if mode == "validation":
            if len(texts) == 0:
                logging.error(
                    'No validation sentences collected: maybe the validpatt %s matches no training data file names?'
                    % (hp.validpatt))
                sys.exit(1)

        logging.info('Loaded data for %s sentences' % (len(texts)))
        logging.info('Sentences skipped with missing features: %s' %
                     (no_data_count))
        logging.info('Sentences skipped with > max_T (%s) frames: %s' %
                     (hp.max_T, too_long_count_frames))
        logging.info(
            'Additional sentences skipped with > max_N (%s) letters/phones: %s'
            % (hp.max_N, too_long_count_text))

        if mode == 'train' and hp.n_utts > 0:
            n_utts = hp.n_utts
            assert n_utts <= len(fpaths)
            logging.info('Take first %s (n_utts) sentences for training' %
                         (n_utts))
            fpaths = fpaths[:n_utts]
            text_lengths = text_lengths[:n_utts]
            texts = texts[:n_utts]
            if get_speaker_codes:
                speakers = speakers[:n_utts]
            if audio_lengths:
                audio_lengths = audio_lengths[:n_utts]
            if label_lengths:
                label_lengths = label_lengths[:n_utts]

        if mode == 'train':
            ## Return a raw-bytes representation which will be parsed with tf's
            ## decode_raw (ndarray.tostring is deprecated; tobytes is identical):
            texts = [text.tobytes() for text in texts]
            if get_speaker_codes:
                speakers = [speaker.tobytes() for speaker in speakers]
            if hp.use_external_durations:
                durations = [d.tobytes() for d in durations]

        if mode in ['validation', 'synthesis', 'demo']:
            ## Prepare a batch of 'stacked texts' (matrix with number of rows==synthesis batch size, and each row an array of integers)
            stacked_texts = np.zeros((len(texts), hp.max_N), np.int32)
            for i, text in enumerate(texts):
                stacked_texts[i, :len(text)] = text
            texts = stacked_texts

            if hp.use_external_durations:
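                ## Expand per-symbol durations into a binary (frames x symbols)
                ## alignment, pad the frame axis to a multiple of hp.r, and keep
                ## every hp.r-th row to match the reduced-resolution mel frames.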
                stacked_durations = np.zeros((len(texts), hp.max_T, hp.max_N),
                                             np.int32)
                for i, dur in enumerate(durations):
                    duration_matrix = durations_to_hard_attention_matrix(dur)
                    duration_matrix = end_pad_for_reduction_shape_sync(
                        duration_matrix, hp)
                    duration_matrix = duration_matrix[0::hp.r, :]
                    m, n = duration_matrix.shape
                    stacked_durations[i, :m, :n] = duration_matrix
                durations = stacked_durations

        dataset = {}
        dataset['texts'] = texts
        ## At synthesis, fpaths are only a way to get basenames -- the wav files probably do not exist.
        dataset['fpaths'] = fpaths
        ## Only used in training, where length information is lost due to the string format. TODO: good motivation for this format?
        dataset['text_lengths'] = text_lengths
        dataset['audio_lengths'] = audio_lengths  ## might be []
        dataset['label_lengths'] = label_lengths  ## might be []

        dataset_df = dataset.copy()

        try:
            dataset_df['texts'] = dataset_df['texts'].tolist()
        except AttributeError:
            pass  ## already a list

        try:
            if len(dataset_df['audio_lengths']) == 0:
                dataset_df['audio_lengths'] = [0] * len(dataset_df['texts'])
            if len(dataset_df['label_lengths']) == 0:
                dataset_df['label_lengths'] = [0] * len(dataset_df['texts'])
            if not os.path.exists(hp.featuredir):
                os.makedirs(hp.featuredir)
            pd.DataFrame.from_records(dataset_df).to_csv(dataset_df_path)
        except Exception:
            logging.exception('Could not write dataset csv to %s' %
                              (dataset_df_path))

        if get_speaker_codes:
            dataset['speakers'] = speakers
        if hp.use_external_durations:
            dataset['durations'] = durations

    logging.info('Finished loading data in mode: %s' % (mode))
    return dataset
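
A minimal usage sketch (hypothetical: `build_hp` stands in for however this
codebase constructs its hyperparameter object; any object exposing the
attributes read above, such as transcript, featuredir, waveforms, max_N,
max_T, r, input_type, multispeaker and use_external_durations, should work):

def demo_usage():
    ## Hypothetical driver showing the expected call pattern.
    hp = build_hp('config/example.cfg')  ## hypothetical config loader
    dataset = load_data(hp, mode='validation')
    ## In validation/synthesis/demo mode, texts is a (batch, hp.max_N) int32 matrix:
    print(dataset['texts'].shape)
    print(len(dataset['fpaths']), 'file paths')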