def read_audio(data_type, audio_paths, spk2gender, tool, config, normalize,
               save_path, global_mean_male=None, global_std_male=None,
               global_mean_female=None, global_std_female=None,
               dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        data_type (string): train_si84 or train_si284 or test_dev93 or test_eval92
        audio_paths (list): paths to audio files
        spk2gender (dict):
            key => speaker
            value => gender
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => normalization will not be conducted
            global => normalize input features by global mean & stddev over
                      the training set per gender
            speaker => normalize input features by mean & stddev per speaker
            utterance => normalize input features by mean & stddev per utterance
        save_path (string): path to save npy files
        global_mean_male (np.ndarray, optional): global mean of male over the
            training set
        global_std_male (np.ndarray, optional): global standard deviation of
            male over the training set
        global_mean_female (np.ndarray, optional): global mean of female over
            the training set
        global_std_female (np.ndarray, optional): global standard deviation of
            female over the training set
        dtype (optional): the type of data, default is np.float32
    """
    if 'train' not in data_type:
        if (global_mean_male is None or global_std_male is None or
                global_mean_female is None or global_std_female is None):
            raise ValueError(
                'Set the mean & stddev computed over the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise ValueError(
            'tool must be "htk" or "python_speech_features" or "librosa".')

    audio_paths_male, audio_paths_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}

    # Loop 1: Compute global (and per-speaker) mean & stddev over the training set
    if 'train' in data_type and normalize != 'no':
        print('=====> Reading audio files...')
        for i, audio_path in enumerate(tqdm(audio_paths)):
            speaker = audio_path.split('/')[-2]
            utt_idx = basename(audio_path).split('.')[0]
            gender = spk2gender[speaker]

            if tool == 'htk':
                feat_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                feat_utt = w2f_psf(audio_path,
                                   feature_type=config['feature_type'],
                                   feature_dim=config['channels'],
                                   use_energy=config['energy'],
                                   use_delta1=config['delta'],
                                   use_delta2=config['deltadelta'],
                                   window=config['window'],
                                   slide=config['slide'])
            elif tool == 'librosa':
                feat_utt = w2f_librosa(audio_path,
                                       feature_type=config['feature_type'],
                                       feature_dim=config['channels'],
                                       use_energy=config['energy'],
                                       use_delta1=config['delta'],
                                       use_delta2=config['deltadelta'],
                                       window=config['window'],
                                       slide=config['slide'])

            frame_num, feat_dim = feat_utt.shape
            feat_utt_sum = np.sum(feat_utt, axis=0)

            if i == 0:
                # Initialize global statistics
                global_mean_male = np.zeros((feat_dim,), dtype=dtype)
                global_mean_female = np.zeros((feat_dim,), dtype=dtype)
                global_std_male = np.zeros((feat_dim,), dtype=dtype)
                global_std_female = np.zeros((feat_dim,), dtype=dtype)
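                # (these accumulate running sums; the means and stddevs are
                #  finalized after the loops below)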

            # For computing global mean
            if gender == 'm':
                audio_paths_male.append(audio_path)
                global_mean_male += feat_utt_sum
                total_frame_num_male += frame_num
            elif gender == 'f':
                audio_paths_female.append(audio_path)
                global_mean_female += feat_utt_sum
                total_frame_num_female += frame_num
            else:
                raise ValueError('gender must be "m" or "f".')

            # For computing speaker mean & stddev
            if normalize == 'speaker':
                # Initialize speaker statistics
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    speaker_mean_dict[speaker] = np.zeros(
                        (feat_dim,), dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros(
                        (feat_dim,), dtype=dtype)
                total_frame_num_dict[speaker] += frame_num
                speaker_mean_dict[speaker] += feat_utt_sum

        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        # Compute speaker mean
        if normalize == 'speaker':
            for speaker in speaker_mean_dict.keys():
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

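        # Second sub-pass: accumulate squared deviations from the means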
        for audio_path in tqdm(audio_paths):
            speaker = audio_path.split('/')[-2]
            utt_idx = basename(audio_path).split('.')[0]
            gender = spk2gender[speaker]

            if tool == 'htk':
                feat_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                feat_utt = w2f_psf(audio_path,
                                   feature_type=config['feature_type'],
                                   feature_dim=config['channels'],
                                   use_energy=config['energy'],
                                   use_delta1=config['delta'],
                                   use_delta2=config['deltadelta'],
                                   window=config['window'],
                                   slide=config['slide'])
            elif tool == 'librosa':
                feat_utt = w2f_librosa(audio_path,
                                       feature_type=config['feature_type'],
                                       feature_dim=config['channels'],
                                       use_energy=config['energy'],
                                       use_delta1=config['delta'],
                                       use_delta2=config['deltadelta'],
                                       window=config['window'],
                                       slide=config['slide'])

            # For computing global stddev
            if gender == 'm':
                global_std_male += np.sum(
                    np.abs(feat_utt - global_mean_male) ** 2, axis=0)
            elif gender == 'f':
                global_std_female += np.sum(
                    np.abs(feat_utt - global_mean_female) ** 2, axis=0)
            else:
                raise ValueError('gender must be "m" or "f".')

            # For computing speaker stddev
            if normalize == 'speaker':
                speaker_std_dict[speaker] += np.sum(
                    np.abs(feat_utt - speaker_mean_dict[speaker]) ** 2, axis=0)

        # Compute speaker stddev
        if normalize == 'speaker':
            for speaker in speaker_std_dict.keys():
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] / (total_frame_num_dict[speaker] - 1))

        # Compute global stddev per gender
        global_std_male = np.sqrt(
            global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(
            global_std_female / (total_frame_num_female - 1))
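        # NOTE: dividing by (N - 1) gives the unbiased (Bessel-corrected) estimate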

        # Save global mean & stddev per gender
        np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
        np.save(join(save_path, 'global_mean_female.npy'),
                global_mean_female)
        np.save(join(save_path, 'global_std_male.npy'), global_std_male)
        np.save(join(save_path, 'global_std_female.npy'), global_std_female)
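        # (these files can be reloaded and passed back in via the global_*
        #  arguments when processing the dev/eval sets)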

    # Loop 2: Normalization and saving
    print('=====> Normalization...')
    frame_num_dict = {}
    # sampPeriod, parmKind = None, None
    for audio_path in tqdm(audio_paths):
        speaker = audio_path.split('/')[-2]
        utt_idx = basename(audio_path).split('.')[0]
        gender = spk2gender[speaker]

        if tool == 'htk':
            feat_utt, sampPeriod, parmKind = read(audio_path)
        elif tool == 'python_speech_features':
            feat_utt = w2f_psf(audio_path,
                               feature_type=config['feature_type'],
                               feature_dim=config['channels'],
                               use_energy=config['energy'],
                               use_delta1=config['delta'],
                               use_delta2=config['deltadelta'],
                               window=config['window'],
                               slide=config['slide'])
        elif tool == 'librosa':
            feat_utt = w2f_librosa(audio_path,
                                   feature_type=config['feature_type'],
                                   feature_dim=config['channels'],
                                   use_energy=config['energy'],
                                   use_delta1=config['delta'],
                                   use_delta2=config['deltadelta'],
                                   window=config['window'],
                                   slide=config['slide'])

        if normalize == 'no':
            pass
        elif normalize == 'global' or 'train' not in data_type:
            # Normalize by mean & stddev over the training set per gender
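            # (dev/eval sets always fall into this branch, so they are normalized
            #  with the training-set statistics unless normalize == 'no')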
            if gender == 'm':
                feat_utt -= global_mean_male
                feat_utt /= global_std_male
            elif gender == 'f':
                feat_utt -= global_mean_female
                feat_utt /= global_std_female
            else:
                raise ValueError('gender must be "m" or "f".')
        elif normalize == 'speaker':
            # Normalize by mean & stddev per speaker
            feat_utt = (
                feat_utt - speaker_mean_dict[speaker]) / speaker_std_dict[speaker]
        elif normalize == 'utterance':
            # Normalize by mean & stddev per utterance
            utt_mean = np.mean(feat_utt, axis=0, dtype=dtype)
            utt_std = np.std(feat_utt, axis=0, dtype=dtype)
            feat_utt = (feat_utt - utt_mean) / utt_std

        frame_num_dict[utt_idx] = feat_utt.shape[0]

        # Save input features
        np.save(mkdir_join(save_path, speaker, utt_idx + '.npy'), feat_utt)

    # Save the frame number dictionary
    with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
        pickle.dump(frame_num_dict, f)
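

# A minimal usage sketch of read_audio (not part of the original code): the
# config keys mirror those accessed above, while the values, `wav_paths`, and
# `spk2gender` are illustrative and assumed to come from earlier preprocessing.
#
#   config = {'feature_type': 'fbank', 'channels': 40, 'energy': True,
#             'delta': True, 'deltadelta': True,
#             'window': 0.025, 'slide': 0.01}
#   read_audio(data_type='train_si84', audio_paths=wav_paths,
#              spk2gender=spk2gender, tool='librosa', config=config,
#              normalize='speaker', save_path='./feature/train_si84')

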
def segment(audio_path,
            speaker,
            utt_dict,
            is_training,
            sil_duration=0.,
            tool='htk',
            config=None,
            mean=None,
            dtype=np.float32):
    """Segment each HTK or WAV file into utterances. Normalization will not be
       conducted here.
    Args:
        audio_path (string): path to a HTK or WAV file
        speaker (string): speaker name
        utt_dict (dict): dictionary of utterance information
            key (string) => utterance index
            value (list) => [start_frame, end_frame]
        is_training (bool): if True, compute the mean over the file (or the
            stddev when `mean` is given); otherwise both are returned as None
        sil_duration (float): duration of silence at both ends. Default is 0.
        tool (string): htk or python_speech_features or librosa
        config (dict): a configuration for feature extraction
        mean (np.ndarray): a mean vector over the file
        dtype (optional): the type of data, default is np.float32
    Returns:
        feat_dict (dict):
            key (string) => utt_idx
            value (np.ndarray) => a feature array of size
                `(frame_num, feature_dim)`
        feat_utt_sum (np.ndarray): the sum of feature vectors over the file
        mean (np.ndarray): a mean vector over the file
        stddev (np.ndarray): a stddev vector over the file
        total_frame_num_file (int): the total number of frames of the target
            speaker's utterances
    """
    if tool != 'htk' and config is None:
        raise ValueError('Set config dict.')

    # Read the HTK or WAV file
    if tool == 'htk':
        feat, _, _ = read_htk(audio_path)
    elif tool == 'python_speech_features':
        feat = w2f_psf(audio_path,
                       feature_type=config['feature_type'],
                       feature_dim=config['channels'],
                       use_energy=config['energy'],
                       use_delta1=config['delta'],
                       use_delta2=config['deltadelta'],
                       window=config['window'],
                       slide=config['slide'])
    elif tool == 'librosa':
        feat = w2f_librosa(audio_path,
                           feature_type=config['feature_type'],
                           feature_dim=config['channels'],
                           use_energy=config['energy'],
                           use_delta1=config['delta'],
                           use_delta2=config['deltadelta'],
                           window=config['window'],
                           slide=config['slide'])

    assert isinstance(utt_dict, OrderedDict)
    # NOTE: utt_dict must be an instance of OrderedDict

    # Divide into each utterance
    feat_dim = feat.shape[1]
    feat_dict = {}
    total_frame_num_file = 0
    end_frame_pre = 0
    utt_num = len(utt_dict.keys())
    feat_utt_sum = np.zeros((feat_dim, ), dtype=dtype)
    stddev = np.zeros((feat_dim, ), dtype=dtype)
    # keys = sorted(list(utt_dict.keys()))
    keys = list(utt_dict.keys())
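    # Each utterance is extended by up to `sil_duration` frames of surrounding
    # silence: when the gap to a neighbouring utterance is shorter than
    # 2 * sil_duration the gap is split evenly, and extensions are clipped at
    # the file boundaries.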
    for i, utt_idx in enumerate(keys):
        utt_info = utt_dict[utt_idx]
        start_frame, end_frame = utt_info[0], utt_info[1]

        # Check timestamp
        if start_frame > end_frame:
            print(utt_dict)
            print('Warning: time stamp is reversed.')
            print('speaker index: %s' % speaker)
            print('utterance index: %s' % utt_idx)
            print('start_frame: %.3f' % start_frame)
            print('end_frame: %.3f' % end_frame)
            raise ValueError('start_frame must not exceed end_frame.')

        # Check the first utterance
        if i == 0:
            if start_frame >= sil_duration:
                start_frame_extend = start_frame - sil_duration
            else:
                start_frame_extend = 0

            if len(utt_dict) != 1:
                start_frame_next = utt_dict[keys[i + 1]][0]
                if end_frame > start_frame_next:
                    print('Warning: utterances are overlapping.')
                    print('speaker index: %s' % speaker)
                    print('utterance index: %s' % utt_idx)
                    print('end_frame: %.3f' % end_frame)
                    print('start_frame_next: %.3f' % start_frame_next)

                if start_frame_next - end_frame >= sil_duration * 2:
                    end_frame_extend = end_frame + sil_duration
                else:
                    end_frame_extend = end_frame + \
                        int((start_frame_next - end_frame) / 2)
            else:
                end_frame_extend = end_frame + sil_duration
                # end_frame_extend = end_frame

        # Check the last utterance
        elif i == utt_num - 1:
            if start_frame - end_frame_pre >= sil_duration * 2:
                start_frame_extend = start_frame - sil_duration
            else:
                start_frame_extend = start_frame - \
                    int((start_frame - end_frame_pre) / 2)

            if feat.shape[0] - end_frame >= sil_duration:
                end_frame_extend = end_frame + sil_duration
            else:
                end_frame_extend = feat.shape[0]  # last frame

        # Check other utterances
        else:
            if start_frame - end_frame_pre >= sil_duration * 2:
                start_frame_extend = start_frame - sil_duration
            else:
                start_frame_extend = start_frame - \
                    int((start_frame - end_frame_pre) / 2)

            start_frame_next = utt_dict[keys[i + 1]][0]
            if end_frame > start_frame_next:
                print('Warning: utterances are overlapping.')
                print('speaker index: %s' % speaker)
                print('utterance index: %s' % utt_idx)
                print('end_frame: %.3f' % end_frame)
                print('start_frame_next: %.3f' % start_frame_next)

            if start_frame_next - end_frame >= sil_duration * 2:
                end_frame_extend = end_frame + sil_duration
            else:
                end_frame_extend = end_frame + \
                    int((start_frame_next - end_frame) / 2)

        feat_utt = feat[start_frame_extend:end_frame_extend]
        feat_utt_sum += np.sum(feat_utt, axis=0)
        total_frame_num_file += (end_frame_extend - start_frame_extend)
        feat_dict[str(utt_idx)] = feat_utt

        # For computing stddev over the file
        if mean is not None:
            stddev += np.sum(np.abs(feat_utt - mean)**2, axis=0)

        # Update
        end_frame_pre = end_frame

    if is_training:
        if mean is not None:
            # Compute stddev over the file
            stddev = np.sqrt(stddev / (total_frame_num_file - 1))
        else:
            # Compute mean over the file
            mean = feat_utt_sum / total_frame_num_file
            stddev = None
    else:
        mean, stddev = None, None

    return feat_dict, feat_utt_sum, mean, stddev, total_frame_num_file
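

# A sketch of the intended two-pass use of segment() (illustrative, not from
# the original code): the first call computes the per-file mean, which is then
# passed back in to obtain the per-file stddev. The sil_duration value is an
# arbitrary frame count chosen for illustration.
#
#   feat_dict, feat_sum, mean, _, n_frames = segment(
#       audio_path, speaker, utt_dict, is_training=True,
#       sil_duration=40, tool='htk')
#   _, _, _, stddev, _ = segment(
#       audio_path, speaker, utt_dict, is_training=True,
#       sil_duration=40, tool='htk', mean=mean)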