Example #1
    def process(self, features):
        """Computes voice activity detection (VAD) on the input `features`

        Parameters
        ----------
        features : :class:`~shennong.features.Features`, shape = [n,m]
            The speech features on which to look for voiced
            frames. The first coefficient must be a log-energy (or
            equivalent). Works well with
            :class:`~shennong.processor.mfcc.MfccProcessor` and
            :class:`~shennong.processor.plp.PlpProcessor`.

        Returns
        -------
        vad : :class:`~shennong.features.Features`, shape = [n,1]
            The output vad features are of dtype uint8 and contain 1
            for voiced frames or 0 for unvoiced frames.

        """
        data = kaldi.matrix.SubVector(
            kaldi.ivector.compute_vad_energy(
                self._options, kaldi.matrix.SubMatrix(features.data))).numpy()

        return Features(np.atleast_2d(data.astype(np.uint8)).T,
                        features.times,
                        properties=self.get_properties(features))
Example #2
    def trim(self, vad):
        """Returns a new instance of FeaturesCollection where each features
        has been trimmed with the corresponding VAD.

        Parameters
        ----------
        vad : dict of boolean ndarrays
            A dictionary of boolean arrays indicating which frames to keep.

        Returns
        -------
        features : FeaturesCollection
            A new FeaturesCollection trimmed with the input VAD

        Raises
        ------
        ValueError
            If the VAD keys differ from the collection keys, if the VAD
            arrays are not boolean arrays, or if their length does not
            match the number of frames.
        """
        if vad.keys() != self.keys():
            raise ValueError('VAD keys differ from the collection keys.')

        for key in vad.keys():
            if vad[key].dtype != np.dtype('bool'):
                raise ValueError('Vad arrays must be arrays of bool.')
            if vad[key].shape[0] != self[key].nframes:
                raise ValueError(
                    'Vad arrays length must be equal to the number of frames.')

        return FeaturesCollection({
            k: Features(
                self[k].data[vad[k]],
                self[k].times[vad[k]],
                properties=self[k].properties) for k in self.keys()})
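A minimal sketch of building a compatible VAD dictionary and trimming with it, assuming `features` is an existing FeaturesCollection whose first column is a log-energy:

import numpy as np

# keep frames whose log-energy exceeds the utterance mean (a crude gate)
vad = {name: feats.data[:, 0] > feats.data[:, 0].mean()
       for name, feats in features.items()}
trimmed = features.trim(vad)  # same keys, only the selected frames kept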
Example #3
    def _process(self, cls, signal, vtln_warp):
        """Inner process method common to all Kaldi Mel processors"""
        # ensure the signal is correct
        if signal.nchannels != 1:
            raise ValueError(
                'signal must have one channel, but it has {}'.format(
                    signal.nchannels))

        if self.sample_rate != signal.sample_rate:
            raise ValueError('processor and signal mismatch in sample rates: '
                             '{} != {}'.format(self.sample_rate,
                                               signal.sample_rate))

        # forward the options to Kaldi: the assignment here is done by
        # copy, not by reference, so if the user does 'p = Processor();
        # p.dither = 0' the new value is pushed to Kaldi here
        self._options.frame_opts = self._frame_options
        self._options.mel_opts = self._mel_options

        # force 16 bits integers
        signal = signal.astype(np.int16).data
        data = kaldi.matrix.SubMatrix(
            cls(self._options).compute(kaldi.matrix.SubVector(signal),
                                       vtln_warp)).numpy()

        return Features(data,
                        self.times(data.shape[0]),
                        properties=self.get_properties(vtln_warp=vtln_warp))
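A hypothetical call through a public subclass, here MfccProcessor as named in Example #1 (the `dither` forwarding is what the comment above describes; paths and signature are assumptions):

from shennong.audio import Audio
from shennong.processor.mfcc import MfccProcessor

audio = Audio.load('speech.wav')
processor = MfccProcessor(sample_rate=audio.sample_rate)
processor.dither = 0  # pushed back into self._options inside _process
mfcc = processor.process(audio, vtln_warp=1.0)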
Example #4
 def process_one(self, wav):
     # 25.6 ms frames with a 10 ms shift, to stay aligned with the ground truth
     audio = amfm_decompy.basic_tools.SignalObj(wav)
     pitch = amfm_decompy.pYAAPT.yaapt(
         audio, frame_length=25.6, frame_space=10)
     return Features(
         np.atleast_2d(pitch.samp_values).T,
         pitch.frames_pos / audio.fs)
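The same pYAAPT call, standalone (requires the AMFM_decompy package; the wav path is a placeholder):

import amfm_decompy.basic_tools
import amfm_decompy.pYAAPT

audio = amfm_decompy.basic_tools.SignalObj('speech.wav')
pitch = amfm_decompy.pYAAPT.yaapt(audio, frame_length=25.6, frame_space=10)
print(pitch.samp_values.shape, pitch.frames_pos / audio.fs)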
Example #5
    def _load(self):
        self._log.info('loading %s', self.filename)

        data = np.load(open(self.filename, 'rb'),
                       allow_pickle=True)['features'].tolist()

        features = self._features_collection()
        for k, v in data.items():
            features[k] = Features._from_dict(v, validate=False)
        return features
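A matching writer sketch. The `_to_dict` counterpart to the `_from_dict` call above, and the single 'features' key, are assumptions about the serializer; `collection` stands for a FeaturesCollection:

import numpy as np

np.savez_compressed(
    'features.npz',
    features={name: feats._to_dict() for name, feats in collection.items()})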
Example #6
    def _load(self):
        # loading properties
        filename = self._fileroot + '.properties.json'
        self._log.info('loading %s', filename)
        if not os.path.isfile(filename):
            raise IOError('file not found: {}'.format(filename))

        properties = json_tricks.loads(open(filename, 'r').read())

        # loading times
        ark = self._fileroot + '.times.ark'
        self._log.info('loading %s', ark)
        if not os.path.isfile(ark):
            raise IOError('file not found: {}'.format(ark))

        rspecifier = 'ark:' + ark
        with kaldi.util.table.SequentialDoubleMatrixReader(
                rspecifier) as reader:
            times = {k: v.numpy() for k, v in reader}

        # post-process times: reshape 2d->1d when stored as a single-row matrix
        for key, value in times.items():
            if value.shape[0] == 1:
                times[key] = value.reshape((value.shape[1]))

        # loading features
        ark = self._fileroot + '.ark'
        self._log.info('loading %s', ark)

        rspecifier = 'ark:' + ark
        with kaldi.util.table.SequentialDoubleMatrixReader(
                rspecifier) as reader:
            data = {k: v.numpy() for k, v in reader}

        if properties.keys() != data.keys():
            raise ValueError(
                'invalid features: items differ in data and properties')

        if times.keys() != data.keys():
            raise ValueError(
                'invalid features: items differ in data and times')

        return self._features_collection(
            **{
                k: Features(data[k].astype(properties[k]['__dtype_data__']),
                            times[k].astype(properties[k]['__dtype_times__']),
                            properties={
                                name: prop
                                for name, prop in properties[k].items()
                                if '__dtype_' not in name
                            },
                            validate=False)
                for k in data.keys()
            })
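The ark reads above can be reproduced standalone with pykaldi's sequential table reader:

import kaldi.util.table

with kaldi.util.table.SequentialDoubleMatrixReader('ark:feats.ark') as reader:
    for name, matrix in reader:
        print(name, matrix.numpy().shape)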
Example #7
    def process(self, signal):
        """Compute spectrogram with the specified options

        Parameters
        ----------
        signal : Audio, shape = [nsamples, 1]
            The input audio signal to compute the features on, must be
            mono

        Returns
        -------
        features : `Features`, shape = [nframes, `ndims`]
            The computed features, output will have as many rows as there
            are frames (depends on the specified options `frame_shift`
            and `frame_length`).

        Raises
        ------
        ValueError
            If the input `signal` has more than one channel (i.e. is
            not mono). If `sample_rate` != `signal.sample_rate`.

        """
        # Although Kaldi (and so pykaldi) exposes a vtln_warp parameter for
        # spectrograms, it is only present for compatibility and has no effect.
        # See https://github.com/kaldi-asr/kaldi/blob
        # /598ad3a400a70b934485f577354b19ee04dd8636/src/feat/feature-spectrogram.h#L97.
        # So this parameter is not exposed in shennong and we forward a
        # "neutral" (1.0) VTLN warp to pykaldi.

        # ensure the signal is correct
        if signal.nchannels != 1:
            raise ValueError(
                'signal must have one channel, but it has {}'.format(
                    signal.nchannels))

        if self.sample_rate != signal.sample_rate:
            raise ValueError('processor and signal mismatch in sample rates: '
                             '{} != {}'.format(self.sample_rate,
                                               signal.sample_rate))

        # forward the options to Kaldi: the assignment here is done by
        # copy, not by reference, so if the user does 'p = Processor();
        # p.dither = 0' the new value is pushed to Kaldi here
        self._options.frame_opts = self._frame_options

        # force 16 bits integers
        signal = signal.astype(np.int16).data
        data = kaldi.matrix.SubMatrix(
            kaldi.feat.spectrogram.Spectrogram(self._options).compute(
                kaldi.matrix.SubVector(signal), 1.0)).numpy()

        return Features(data,
                        self.times(data.shape[0]),
                        properties=self.get_properties())
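A hypothetical usage sketch (the `shennong.processor.spectrogram` module path and the file name are assumptions):

from shennong.audio import Audio
from shennong.processor.spectrogram import SpectrogramProcessor

audio = Audio.load('speech.wav')
spect = SpectrogramProcessor(sample_rate=audio.sample_rate).process(audio)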
Example #8
    def process_one(self, wav):
        audio = Audio.load(wav)
        raw = parselmouth.Sound(
            audio.data, sampling_frequency=audio.sample_rate).to_pitch()
        times = self.frames.times(audio.nsamples)

        # linear interpolation of the Praat pitch, to align its timestamps
        # with the ground truth and the other models
        pitch = np.atleast_2d(np.nan_to_num(np.asarray(
            [raw.get_value_at_time(t) for t in times.mean(axis=1)]))).T
        return Features(pitch, times)
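The Praat calls above, reproduced standalone with parselmouth (the wav path is a placeholder):

import parselmouth

sound = parselmouth.Sound('speech.wav')
pitch = sound.to_pitch()
print(pitch.get_value_at_time(0.5))  # pitch in Hz at t=0.5 s, nan if unvoiced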
Example #9
    def process(self, signal, vtln_warp=1.0):
        """Compute Rasta-PLP features with the specified options

        Do an optional feature-level vocal tract length normalization
        (VTLN) when `vtln_warp` != 1.0.

        Parameters
        ----------
        signal : Audio, shape = [nsamples, 1]
            The input audio signal to compute the features on, must be
            mono
        vtln_warp : float, optional
            The VTLN warping factor to be applied when computing
            features. Defaults to 1.0, meaning no warping is done.

        Returns
        -------
        features : `Features`, shape = [nframes, `ndims`]
            The computed features, output will have as many rows as there
            are frames (depends on the specified options `frame_shift`
            and `frame_length`).

        Raises
        ------
        ValueError
            If the input `signal` has more than one channel (i.e. is
            not mono). If `sample_rate` != `signal.sample_rate`.

        """
        # ensure the signal is correct
        if signal.nchannels != 1:
            raise ValueError(
                'signal must have one channel, but it has {}'.format(
                    signal.nchannels))

        if self.sample_rate != signal.sample_rate:
            raise ValueError('processor and signal mismatch in sample rates: '
                             '{} != {}'.format(self.sample_rate,
                                               signal.sample_rate))

        # extract the PLP features
        self._reset_buffers()
        data = self._compute(signal, vtln_warp)

        return Features(data,
                        self.times(data.shape[0]),
                        properties=self.get_properties(vtln_warp=vtln_warp),
                        validate=False)
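A hypothetical usage sketch (the `shennong.processor.rastaplp` module path is an assumption):

from shennong.audio import Audio
from shennong.processor.rastaplp import RastaPlpProcessor

audio = Audio.load('speech.wav')
plp = RastaPlpProcessor(sample_rate=audio.sample_rate).process(audio)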
Example #10
    def _load(self):
        self._log.info('loading %s', self.filename)

        data = h5features.Reader(self.filename, groupname='features').read()

        features = self._features_collection()
        for n in range(len(data.items())):
            features[data.items()[n]] = Features(
                data.features()[n],
                data.labels()[n],
                properties=(data.properties()[n]
                            if data.has_properties() else {}),
                validate=False)

        return features
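Reading the same file directly with h5features, mirroring the loader above:

import h5features

data = h5features.Reader('features.h5f', groupname='features').read()
print(data.items())              # the utterance names
print(data.features()[0].shape)  # (nframes, ndims) of the first utterance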
Example #11
    def _load(self):
        self._log.info('loading %s', self.filename)

        data = self._check_keys(
            scipy.io.loadmat(self.filename,
                             appendmat=False,
                             squeeze_me=True,
                             mat_dtype=True,
                             struct_as_record=False))

        features = self._features_collection()
        for k, v in data.items():
            if k not in ('__header__', '__version__', '__globals__'):
                if 'properties' in v:
                    features[k] = Features(
                        v['data'],
                        v['times'],
                        self._make_list(self._check_keys(v['properties'])),
                        validate=False)
                else:
                    features[k] = Features(v['data'],
                                           v['times'],
                                           validate=False)
        return features
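A matching writer sketch; the struct layout ('data', 'times' and an optional 'properties' field) is inferred from the loader above, and `collection` stands for a FeaturesCollection:

import scipy.io

scipy.io.savemat(
    'features.mat',
    {name: {'data': f.data, 'times': f.times, 'properties': f.properties}
     for name, f in collection.items()},
    appendmat=False)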
Example #12
    def process(self, raw_pitch):
        """Post process a raw pitch data as specified by the options

        Parameters
        ----------
        raw_pitch : Features, shape = [n, 2]
            The pitch as extracted by the `KaldiPitchProcessor.process` method

        Returns
        -------
        pitch : Features, shape = [n, 1, 2, 3 or 4]
            The post-processed pitch usable as speech features. The
            output columns are 'pov_feature', 'normalized_log_pitch',
            'delta_pitch' and 'raw_log_pitch', in that order, if their
            respective options are set to True.

        Raises
        ------
        ValueError
            If `raw_pitch` does not have exactly two columns. If all
            the following options are False: 'add_pov_feature',
            'add_normalized_log_pitch', 'add_delta_pitch' and
            'add_raw_log_pitch' (at least one of them must be True).

        """
        # check at least one required option is True
        if not (self.add_pov_feature or self.add_normalized_log_pitch
                or self.add_delta_pitch or self.add_raw_log_pitch):
            raise ValueError(
                'at least one of the following options must be True: '
                'add_pov_feature, add_normalized_log_pitch, '
                'add_delta_pitch, add_raw_log_pitch')

        if raw_pitch.shape[1] != 2:
            raise ValueError(
                'data shape must be (_, 2), but it is (_, {})'.format(
                    raw_pitch.shape[1]))

        data = kaldi.matrix.SubMatrix(
            kaldi.feat.pitch.process_pitch(
                self._options,
                kaldi.matrix.SubMatrix(raw_pitch.data))).numpy()

        return Features(data,
                        raw_pitch.times,
                        properties=self.get_properties(raw_pitch))
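A hypothetical end-to-end sketch; the `shennong.processor.pitch_kaldi` module path is an assumption based on the `KaldiPitchProcessor` name in the docstring:

from shennong.audio import Audio
from shennong.processor.pitch_kaldi import (
    KaldiPitchProcessor, KaldiPitchPostProcessor)

audio = Audio.load('speech.wav')
raw = KaldiPitchProcessor(sample_rate=audio.sample_rate).process(audio)
pitch = KaldiPitchPostProcessor().process(raw)  # up to 4 output columns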
Example #13
    def process(self, signal):
        """Extracts the (NCCF, pitch) from a given speech `signal`

        Parameters
        ----------
        signal : Audio
            The speech signal on which to estimate the pitch. The
            signal's sample rate must match the sample rate specified
            in the `PitchProcessor` options.

        Returns
        -------
        raw_pitch_features : Features, shape = [nframes, 2]
            The output array has as many rows as there are frames
            (depends on the specified options `frame_shift` and
            `frame_length`), and two columns corresponding to (NCCF,
            pitch).

        Raises
        ------
        ValueError
            If the input `signal` has more than one channel (i.e. is
            not mono). If `sample_rate` != `signal.sample_rate`.

        """
        if signal.nchannels != 1:
            raise ValueError(
                'audio signal must have one channel, but it has {}'.format(
                    signal.nchannels))

        if self.sample_rate != signal.sample_rate:
            raise ValueError('processor and signal mismatch in sample rates: '
                             '{} != {}'.format(self.sample_rate,
                                               signal.sample_rate))

        # force 16 bits integers
        signal = signal.astype(np.int16).data
        data = kaldi.matrix.SubMatrix(
            kaldi.feat.pitch.compute_kaldi_pitch(
                self._options, kaldi.matrix.SubVector(signal))).numpy()

        return Features(data,
                        self.times(data.shape[0]),
                        properties=self.get_properties())
Example #14
def prepare_ground_truth(data_directory):
    """Retrieves pitch ground truth for evaluations"""
    output_file = data_directory / 'pitch' / 'ground_truth.h5f'
    output_file.parent.mkdir(parents=True, exist_ok=True)
    if output_file.is_file():
        return

    print('retrieving pitch ground truth...')
    truth = FeaturesCollection()
    for pitch in data_directory.glob('raw/KEELE/**/pitch.npy'):
        data = np.load(pitch)
        truth[pitch.parent.stem[:2]] = Features(
            np.atleast_2d(data['pitch']).T,
            # from https://lost-contact.mit.edu/afs/nada.kth.se/dept/tmh/
            # corpora/KeelePitchDB/Speech/keele_pitch_database.htm we have
            # pitch computed in 10 ms steps over 25.6 ms windows. Here we
            # shift times from frame onset to frame center.
            data['time'] + 0.0128)
    truth.save(output_file)
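A quick check of the timestamp shift above: 0.0128 s is half of the 25.6 ms analysis window, moving each timestamp from frame onset to frame center:

assert 0.0256 / 2 == 0.0128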
Example #15
    def process(self, features):
        """Applies sliding-window cepstral mean and/or variance normalization
        on `features` with the specified options

        Parameters
        ----------
        features : :class:`~shennong.features.Features`
            The input features.

        Returns
        -------
        slid_window_cmvn_feats : :class:`~shennong.features.Features`
            The normalized features.
        """
        data = kaldi.matrix.Matrix(*features.data.shape)
        kaldi.feat.functions.sliding_window_cmn(
            self._options, kaldi.matrix.SubMatrix(features.data), data)

        return Features(data.numpy(), features.times,
                        self.get_properties(features))
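A hypothetical usage sketch; the module path and the `cmn_window` option are assumptions (the option mirrors Kaldi's sliding-window CMVN settings), while the class name also appears in Example #23 below. `mfcc` stands for any Features instance:

from shennong.postprocessor.cmvn import SlidingWindowCmvnPostProcessor

normalized = SlidingWindowCmvnPostProcessor(cmn_window=600).process(mfcc)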
Example #16
    def _load(self):
        self._log.info('loading directory "%s"', self.filename)

        # list all the csv and json files
        csv_files = list_files_with_extension(self.filename,
                                              '.csv',
                                              recursive=False)
        json_files = list_files_with_extension(self.filename,
                                               '.json',
                                               recursive=False)

        features = self._features_collection()

        # load the features one by one
        for csv in csv_files:
            self._log.debug('loading %s', csv)

            data_dtype, times_dtype, ndims = self._parse_header(csv)

            # read times and features
            data = np.loadtxt(csv)
            times = data[:, :data.shape[1] - ndims].astype(times_dtype)
            if times.shape[1] == 1:
                times = times.flatten()
            data = data[:, data.shape[1] - ndims:].astype(data_dtype)

            # read properties
            properties = {}
            json = csv.replace('.csv', '.json')
            if json in json_files:
                self._log.debug('loading %s', json)
                properties = dict(json_tricks.loads(open(json, 'r').read()))

            # build the features
            name = os.path.basename(csv).replace('.csv', '')
            features[name] = Features(data,
                                      times,
                                      properties=properties,
                                      validate=False)

        return features
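A standalone re-read of one file pair, mirroring the loader; dtypes and the one-column times vector are hard-coded here, whereas the real ones come from the CSV header parsed by `_parse_header`:

import numpy as np
import json_tricks

data = np.loadtxt('utt1.csv')  # header lines starting with '#' are skipped
times = data[:, 0]             # assuming a one-column times vector
feats = data[:, 1:]
properties = json_tricks.loads(open('utt1.json', 'r').read())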
Example #17
    def process(self, features):
        """Compute deltas on `features` with the specified options

        Parameters
        ----------
        features : Features, shape = [nframes, ncols]
            The input features on which to compute the deltas

        Returns
        -------
        deltas : Features, shape = [nframes, ncols * (`order` + 1)]
            The computed deltas, with as many orders as specified. The
            output features are the concatenation of the input
            `features` and its time derivatives at each order.

        """
        data = kaldi.matrix.SubMatrix(
            kaldi.feat.functions.compute_deltas(
                self._options, kaldi.matrix.SubMatrix(features.data))).numpy()

        return Features(data, features.times, self.get_properties(features))
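A hypothetical usage sketch (the `shennong.postprocessor.delta` module path is an assumption; `mfcc` stands for any Features instance):

from shennong.postprocessor.delta import DeltaPostProcessor

deltas = DeltaPostProcessor(order=2).process(mfcc)  # ncols * 3 output columns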
Example #18
    def process(self, signal):
        """Computes energy on the input `signal`

        Parameters
        ----------
        signal : :class:`~shennong.audio.Audio`, shape = [nsamples, 1]
            The input audio signal to compute the energy on, must be mono.

        Returns
        -------
        energy : :class:`~shennong.features.Features`
            The computed - and compressed - energy

        Raises
        ------
        ValueError
            If the input `signal` has more than one channel (i.e. is
            not mono). If `sample_rate` != `signal.sample_rate`.

        """
        # ensure the signal is correct
        if signal.nchannels != 1:
            raise ValueError(
                'signal must have one channel, but it has {}'.format(
                    signal.nchannels))

        if self.sample_rate != signal.sample_rate:
            raise ValueError('processor and signal mismatch in sample rates: '
                             '{} != {}'.format(self.sample_rate,
                                               signal.sample_rate))

        if self.raw_energy:
            old_conf = self.get_params()
            self.preemph_coeff = 0
            self.window_type = 'rectangular'

        # number of frames in the framed signal
        nframes = kaldi.feat.window.num_frames(signal.nsamples,
                                               self._frame_options,
                                               flush=True)

        # a kaldi view of the numpy signal
        signal = kaldi.matrix.SubVector(signal.data)

        # windowing function to compute frames
        window = kaldi.feat.window.FeatureWindowFunction.from_options(
            self._frame_options)

        # compression function to compress energy
        compression = self._compression_fun[self._compression]

        # pre-allocate the resulting energy
        energy = np.zeros((nframes, 1))

        # pre-allocate a buffer for the frames, extract the frames and
        # compute the energy on them
        out_frame = kaldi.matrix.Vector(self._frame_options.window_size())
        for frame in range(nframes):
            kaldi.feat.window.extract_window(0, signal, frame,
                                             self._frame_options, window,
                                             out_frame)

            # square the signal, force float64 to avoid overflow
            square = np.square(out_frame.numpy(), dtype=np.float64)

            # avoid doing log on 0 (should be avoided already by
            # dithering, but who knows...)
            energy[frame] = compression(
                max(square.sum(),
                    np.finfo(np.float64).tiny))

        if self.raw_energy:
            self.set_params(**old_conf)

        return Features(energy, self.times(nframes), self.get_properties())
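The compression lookup used above maps each frame's raw energy through a user-selected function; a sketch of what `_compression_fun` might contain (the exact mapping is defined elsewhere in the class, so this is an assumption):

import numpy as np

_compression_fun = {'off': lambda x: x, 'log': np.log, 'sqrt': np.sqrt}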
Example #19
    def process(self, signal):
        """Computes bottleneck features on an audio `signal`

        Use a pre-trained neural network to extract bottleneck
        features. Features have a frame shift of 10 ms and frame
        length of 25 ms.

        Parameters
        ----------
        signal : Audio, shape = [nsamples, 1]
            The input audio signal to compute the features on, must be
            mono. The signal is up/down-sampled at 8 kHz during
            processing.

        Returns
        -------
        features : Features, shape = [nframes, 80]
            The computed bottleneck features have as many rows as there
            are frames (depending on the `signal` duration, expect
            about 100 frames per second), each frame having 80
            dimensions.

        Raises
        ------
        RuntimeError
            If no speech is detected on the `signal` during the voice
            activity detection preprocessing step.

        """
        # force resampling to 8 kHz and 16 bits integers
        need_resample = (signal.sample_rate != 8000
                         or signal.dtype is not np.dtype(np.int16))

        if need_resample:
            self.log.debug('resampling audio from %dHz@%db to %dHz@%db',
                           signal.sample_rate, signal.dtype.itemsize * 8, 8000,
                           16)
            signal = signal.resample(8000).astype(np.int16)

        signal = signal.data

        # define parameters to extract mel filterbanks. Those
        # parameters cannot be tuned because the networks are trained
        # with them... frame_noverlap is the number of samples to
        # overlap in each frame, so the frame_shift is 200 - 120 = 80
        frame_length = 200
        frame_noverlap = 120
        frame_shift = frame_length - frame_noverlap

        # voice activity detection TODO implement user-provided VAD
        # (vad input format could be an instance of Alignment, or
        # simply an array of bool).
        vad = _compute_vad(signal,
                           self.log,
                           win_length=frame_length,
                           win_overlap=frame_noverlap)

        # ensure we have some voiced frames in the signal
        voiced_frames = sum(vad)
        if not voiced_frames:
            raise RuntimeError(
                'no voice detected in signal, failed to extract features')
        self.log.debug('%d frames of speech detected (on %d total frames)',
                       voiced_frames, len(vad))

        # from audio signal to mel filterbank
        signal = _add_dither(signal, self.dither)
        window = np.hamming(frame_length)
        fbank_mx = _mel_fbank_mx(window.size,
                                 8000,
                                 numchans=24,
                                 lofreq=64.0,
                                 hifreq=3800.0)
        fea = _fbank_htk(signal, window, frame_noverlap, fbank_mx)

        # center the mel features from voiced frames mean
        fea -= np.mean(fea[vad], axis=0)

        # add a global context to the mel features
        left_ctx = right_ctx = 15
        fea = np.r_[np.repeat(fea[[0]], left_ctx, axis=0), fea,
                    np.repeat(fea[[-1]], right_ctx, axis=0)]

        # compute the network output from mel features
        left_ctx_bn1 = right_ctx_bn1 = self._get_weights()['context']
        nn_input = _preprocess_nn_input(fea, left_ctx_bn1, right_ctx_bn1)
        nn_output = np.vstack(
            _create_nn_extract_st_BN(nn_input, self._get_weights(), 2)[0])

        # compute the timestamps for each output frame
        times = (1.0 / 8000) * np.vstack(
            (np.arange(nn_output.shape[0]) * frame_shift,
             np.arange(nn_output.shape[0]) * frame_shift + frame_length)).T

        # return the final bottleneck features
        return Features(nn_output, times, self.get_properties())
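A sanity check of the frame timing above: at 8 kHz, 200-sample windows shifted by 200 - 120 = 80 samples give the 25 ms frames every 10 ms announced in the docstring, i.e. about 100 frames per second:

assert 200 / 8000 == 0.025
assert (200 - 120) / 8000 == 0.010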
Example #20
    def process(self, features, norm_vars=True, skip_dims=None, reverse=False):
        """Applies the accumulated CMVN statistics to the given ``features``

        Parameters
        ----------
        features : :class:`~shennong.features.features.Features`
            The input features on which to apply the CMVN statistics.

        norm_vars : bool, optional
            If False, do not apply variance normalization (only mean),
            default to True.

        skip_dims : list of positive integers, optional
            Dimensions for which to skip normalization. Default is to
            not skip any dimension.

        reverse : bool, optional
            Whether to apply CMVN in a reverse sense, so as to
            transform zero-mean, unit-variance features into features
            with the desired mean and variance.

        Returns
        -------
        cmvn_features : :class:`~shennong.features.features.Features`
            The normalized features

        Raises
        ------
        ValueError
            If no stats have been accumulated

        """
        # make sure we have accumulated some stats
        if self.count < 1.0:
            raise ValueError('insufficient accumulation of stats for CMVN, '
                             'must be >= 1.0 but is {}'.format(self.count))

        # skip dims in pykaldi is a destructive operation (alteration
        # of self.stats), so we work by copy here, to avoid modifying
        # statistics.
        if not skip_dims:
            cmvn = self._cmvn
        else:
            # make sure all skipped dims are valid dims
            dmin, dmax = min(skip_dims), max(skip_dims)
            if dmin < 0 or dmax >= features.ndims:
                raise ValueError(
                    'skipped dimensions must be in [0, {}[ but are in [{}, {}['
                    .format(features.ndims, dmin, dmax))

            # work by copy to not alter self.stats
            cmvn = kaldi.transform.cmvn.Cmvn(dim=self.dim)
            cmvn.stats = kaldi.matrix.DoubleMatrix(self.stats)
            cmvn.skip_dims(skip_dims)

        data = kaldi.matrix.SubMatrix(features.data)
        cmvn.apply(data, norm_vars=norm_vars, reverse=reverse)

        return Features(data.numpy(),
                        features.times,
                        properties=self.get_properties(features))
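A hypothetical accumulate-then-apply sketch; the class name, module path and constructor signature are assumptions based on the docstring above, and `mfcc` stands for any Features instance:

from shennong.postprocessor.cmvn import CmvnPostProcessor

cmvn = CmvnPostProcessor(dim=mfcc.ndims)
cmvn.accumulate(mfcc)
normalized = cmvn.process(mfcc, skip_dims=[0])  # leave the first column as-is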
Example #21
    def process(self, audio):
        """Extracts the (POV, pitch) from a given speech ``audio`` using CREPE.

        Parameters
        ----------
        audio : Audio
            The speech signal on which to estimate the pitch. Will be
            transparently resampled at 16kHz if needed.

        Returns
        -------
        raw_pitch_features : Features, shape = [nframes, 2]
            The output array has two columns corresponding to (POV,
            pitch). The output from the `crepe` module is reshaped to
            match the specified options `frame_shift` and `frame_length`.

        Raises
        ------
        ValueError
            If the input `signal` has more than one channel (i.e. is
            not mono).

        """
        if audio.nchannels != 1:
            raise ValueError(
                f'audio must have one channel but has {audio.nchannels}')

        if audio.sample_rate != self.sample_rate:
            self.log.debug('resampling audio to 16 kHz')
            audio = audio.resample(self.sample_rate)

        # raw activation matrix, shape=(T, 360)
        activation = self._get_activation(audio.data)

        # confidence is the confidence of voice activity, in [0, 1], shape=(T,)
        confidence = activation.max(axis=1)

        if self.viterbi:
            cents = _to_viterbi_cents(activation)
        else:
            cents = _to_local_average_cents(activation)

        # frequency is the predicted pitch value in Hz, shape=(T,)
        frequency = 10 * 2**(cents / 1200)
        frequency[np.isnan(frequency)] = 0

        # number of output frames after resampling to the requested frame rate
        hop_length = np.round(self.sample_rate * self.frame_shift).astype(int)
        nsamples = 1 + int(
            (audio.shape[0] - self.frame_length * self.sample_rate) /
            hop_length)

        # scipy method issues warnings we want to inhibit
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=FutureWarning)
            data = scipy.signal.resample(
                np.array([confidence, frequency]).T, nsamples)

        # hack needed because resampling can push confidence out of [0, 1]
        data[data[:, 0] < 1e-2, 0] = 0
        data[data[:, 0] > 1, 0] = 1

        return Features(data,
                        self.times(data.shape[0]),
                        properties=self.get_properties())
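A worked instance of the resampling arithmetic above, assuming a 2-second signal at 16 kHz with frame_shift=0.01 and frame_length=0.025:

import numpy as np

sample_rate, frame_shift, frame_length = 16000, 0.01, 0.025
hop_length = np.round(sample_rate * frame_shift).astype(int)  # 160 samples
nsamples = 1 + int((32000 - frame_length * sample_rate) / hop_length)
print(hop_length, nsamples)  # 160 198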
Example #22
    def process(self, crepe_pitch):
        """Post process a raw pitch data as specified by the options

        Parameters
        ----------
        crepe_pitch : Features, shape = [n, 2]
            The pitch as extracted by the `CrepePitchProcessor.process`
            method

        Returns
        -------
        pitch : Features, shape = [n, 1, 2, 3 or 4]
            The post-processed pitch usable as speech features. The
            output columns are 'pov_feature', 'normalized_log_pitch',
            'delta_pitch' and 'raw_log_pitch', in that order, if their
            respective options are set to True.

        Raises
        ------
        ValueError
            If after interpolation some pitch values are not positive.
            If `crepe_pitch` does not have exactly two columns. If all
            the following options are False: 'add_pov_feature',
            'add_normalized_log_pitch', 'add_delta_pitch' and
            'add_raw_log_pitch' (at least one of them must be True).

        """
        # check at least one required option is True
        if not (self.add_pov_feature or self.add_normalized_log_pitch
                or self.add_delta_pitch or self.add_raw_log_pitch):
            raise ValueError(
                'at least one of the following options must be True: '
                'add_pov_feature, add_normalized_log_pitch, '
                'add_delta_pitch, add_raw_log_pitch')

        if crepe_pitch.shape[1] != 2:
            raise ValueError(
                'data shape must be (_, 2), but it is (_, {})'.format(
                    crepe_pitch.shape[1]))

        # Interpolate pitch values for unvoiced frames
        to_remove = predict_voicing(crepe_pitch.data[:, 0]) == 0
        if np.all(to_remove):
            raise ValueError('No voiced frames')

        data = crepe_pitch.data[:, 1].copy()
        indexes_to_keep = np.where(~to_remove)[0]
        first, last = indexes_to_keep[0], indexes_to_keep[-1]
        first_value, last_value = data[first], data[last]

        interp = scipy.interpolate.interp1d(indexes_to_keep,
                                            data[indexes_to_keep],
                                            fill_value='extrapolate')
        data[to_remove] = interp(np.where(to_remove)[0])
        data[:first] = first_value
        data[last:] = last_value

        if not np.all(data > 0):
            raise ValueError('Not all pitch values are positive: issue with '
                             'extracted pitch or interpolation')

        # Converts POV into NCCF
        nccf = []
        for sample in crepe_pitch.data[:, 0]:
            if sample in [0, 1]:
                nccf.append(sample)
            else:
                nccf.append(
                    scipy.optimize.bisect(
                        functools.partial(lambda x, y: _nccf_to_pov(x) - y,
                                          y=sample), 0, 1))

        return super(CrepePitchPostProcessor, self).process(
            Features(
                np.vstack((nccf, data)).T, crepe_pitch.times,
                crepe_pitch.properties))
Example #23
    def process(self, utterances, ubm=None, group_by='utterance', njobs=1):
        """Compute the VTLN warp factors for the given utterances.

        If the ``by_speaker`` option is set to True before the call to
        :func:`process()`, the warps are computed on a per-speaker basis
        (i.e. each utterance of the same speaker has an identical warp). If
        ``by_speaker`` is False, the warps are computed on a per-utterance
        basis.

        Parameters
        ----------
        utterances : :class:`~shennong.utterances.Utterances`
            The list of utterances to train the VTLN on.
        ubm : DiagUbmProcessor, optional
            If provided, uses this UBM instead of computing a new one.
        group_by : str, optional
            Must be 'utterance' or 'speaker'.
        njobs : int, optional
            Number of threads to use for computation, default to 1.

        Returns
        -------
        warps : dict[str, float]
            Warps computed for each speaker or utterance, according to
            ``group_by``. If grouped by speaker, the same warp is shared
            by all utterances of that speaker.

        """
        if group_by not in ('utterance', 'speaker'):
            raise ValueError(
                f'group_by must be "utterance" or "speaker", '
                f'it is: {group_by}')
        if group_by == 'speaker' and not self.by_speaker:
            raise ValueError(
                'Asking to group warps by speaker but they are computed '
                'per utterance, please set VtlnProcessor.by_speaker to True')
        if self.by_speaker and not utterances.has_speakers():
            raise ValueError(
                'Requested speaker based VTLN, but speaker'
                ' information is missing')

        utt2speak = None
        if self.by_speaker:
            utt2speak = {utt.name: utt.speaker for utt in utterances}

        # Min / max warp
        if self.min_warp > self.max_warp:
            raise ValueError(
                f'Min warp > max warp: {self.min_warp} > {self.max_warp}')

        # UBM-GMM
        if ubm is None:
            ubm = DiagUbmProcessor(**self.ubm)
            ubm.log.setLevel(self.log.getEffectiveLevel())
            ubm.process(utterances, njobs=njobs)
        else:
            if ubm.gmm is None:
                raise ValueError('Given UBM-GMM has not been trained')
            self.ubm = ubm.get_params()

        self.log.info('Initializing base LVTLN transforms')
        dim = ubm.gmm.dim()
        num_classes = int(1.5 + (self.max_warp-self.min_warp) / self.warp_step)
        default_class = int(0.5 + (1-self.min_warp)/self.warp_step)
        self.lvtln = kaldi.transform.lvtln.LinearVtln.new(
            dim, num_classes, default_class)

        cmvn_config = self.features.pop('sliding_window_cmvn', None)

        raw_mfcc = pipeline.extract_features(
            self.features, utterances, njobs=njobs, log=null_logger())

        # Compute VAD decision
        self.log.debug('... computing VAD decision')
        vad = {}
        for utt, mfcc in raw_mfcc.items():
            this_vad = VadPostProcessor(**ubm.vad).process(mfcc)
            vad[utt] = this_vad.data.reshape(
                (this_vad.shape[0],)).astype(bool)

        # Apply cmvn sliding
        orig_features = FeaturesCollection()
        if cmvn_config is not None:
            proc = SlidingWindowCmvnPostProcessor(**cmvn_config)
            for utt, mfcc in raw_mfcc.items():
                orig_features[utt] = proc.process(mfcc)
        else:
            orig_features = raw_mfcc

        # Select voiced frames
        orig_features = orig_features.trim(vad)
        orig_features = FeaturesCollection(  # Subsample
            {utt: feats.copy(subsample=self.subsample)
             for utt, feats in orig_features.items()})

        # Computing base transforms
        featsub_unwarped = pipeline.extract_features(
            self.features, utterances,
            njobs=njobs, log=null_logger()).trim(vad)
        featsub_unwarped = FeaturesCollection(
            {utt: feats.copy(subsample=self.subsample)
             for utt, feats in featsub_unwarped.items()})

        for c in range(num_classes):
            this_warp = self.min_warp + c*self.warp_step
            self.log.info(
                'Computing base transform (warp=%s) %s/%s',
                this_warp, c+1, num_classes)

            featsub_warped = pipeline.extract_features_warp(
                self.features, utterances, this_warp,
                null_logger(), njobs=njobs).trim(vad)
            featsub_warped = FeaturesCollection(
                {utt: feats.copy(subsample=self.subsample)
                 for utt, feats in featsub_warped.items()})
            self.compute_mapping_transform(
                featsub_unwarped, featsub_warped, c, this_warp)

        del featsub_warped, featsub_unwarped, vad

        if cmvn_config is not None:
            self.features['sliding_window_cmvn'] = cmvn_config

        self.log.debug('Computing Gaussian selection info')
        ubm.gaussian_selection(orig_features)

        self.log.info(
            'Computing LVTLN transforms (%s iterations)', self.num_iters)
        posteriors = ubm.gaussian_selection_to_post(orig_features)
        self.transforms, self.warps = self.estimate(
            ubm, orig_features, posteriors, utt2speak)

        for i in range(self.num_iters):
            self.log.debug('Updating model on pass %s/%s', i+1, self.num_iters)

            # Transform the features
            features = FeaturesCollection()
            for utt, feats in orig_features.items():
                ind = utt if utt2speak is None else utt2speak[utt]
                linear_part = self.transforms[ind][:, : feats.ndims]
                offset = self.transforms[ind][:, feats.ndims]
                data = np.dot(feats.data, linear_part.numpy().T) + \
                    offset.numpy()
                features[utt] = Features(data, feats.times, feats.properties)

            # Update the model
            gmm_accs = ubm.accumulate(features, njobs=njobs)
            ubm.estimate(gmm_accs)

            # Now update the LVTLN transforms (and warps)
            # self.log.debug('Re-estimating LVTLN transforms on pass %s', i+1)
            posteriors = ubm.gaussian_selection_to_post(features)
            self.transforms, self.warps = self.estimate(
                ubm, orig_features, posteriors, utt2speak)

        if self.by_speaker:
            self.transforms = {
                utt: self.transforms[spk]
                for utt, spk in utt2speak.items()}
            self.warps = {
                utt: self.warps[spk]
                for utt, spk in utt2speak.items()}

        self.log.info('Done training LVTLN model')
        if group_by == 'utterance':
            return self.warps
        # group_by == 'speaker'
        return {
            spk: self.warps[utts[0].name]
            for spk, utts in utterances.by_speaker().items()}
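A hypothetical training sketch; the `VtlnProcessor` and `Utterances` module paths and the utterance tuple format are assumptions:

from shennong.processor.vtln import VtlnProcessor
from shennong.utterances import Utterances

utterances = Utterances([
    ('utt1', 'spk1_a.wav', 'spk1'),
    ('utt2', 'spk1_b.wav', 'spk1')])
warps = VtlnProcessor(by_speaker=True).process(utterances, group_by='speaker')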