Example #1
def main(_):
    # Read WAV file.
    samples, sample_rate_hz = dsp.read_wav_file(FLAGS.input, dtype=np.float32)
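    # Average the channels to reduce multichannel audio to mono.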
    samples = samples.mean(axis=1)

    # Run frontend to get CARL frames. The classifier expects input sample rate
    # CLASSIFIER_INPUT_HZ, block_size=128, pcen_cross_channel_diffusivity=60, and
    # otherwise the default frontend settings.
    carl = frontend.CarlFrontend(input_sample_rate_hz=CLASSIFIER_INPUT_HZ,
                                 block_size=128,
                                 pcen_cross_channel_diffusivity=60.0)
    if sample_rate_hz != CLASSIFIER_INPUT_HZ:
        resampler = dsp.Resampler(sample_rate_hz, CLASSIFIER_INPUT_HZ)
        samples = resampler.process_samples(samples)
    frames = phone_util.run_frontend(carl, samples)
    # The frame rate is 125 Hz (hop size of 8 ms).
    frame_rate = CLASSIFIER_INPUT_HZ / carl.block_size

    timeseries = {}
    for window in sliding_window(frames, classify_phoneme.NUM_FRAMES):
        # Run classifier inference on the current window.
        scores = classify_phoneme.classify_phoneme_scores(window)
        append_to_dict(timeseries, scores)

    fig_combined, fig_phoneme = plot_output(frames, frame_rate, timeseries,
                                            os.path.basename(FLAGS.input))

    if FLAGS.output:  # Save plot as an image file.
        stem, ext = os.path.splitext(FLAGS.output)
        plot.save_figure(stem + '-combined' + ext, fig_combined)
        plot.save_figure(stem + '-phoneme' + ext, fig_phoneme)
    else:  # Show plot interactively.
        plt.show()
    return 0
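The helpers `sliding_window` and `append_to_dict` used above are defined elsewhere in the example's module. A minimal sketch of their assumed behavior (not the project's actual implementations):

# Assumption: yield every run of `num_frames` consecutive frames.
def sliding_window(frames, num_frames):
    for i in range(len(frames) - num_frames + 1):
        yield frames[i:i + num_frames]

# Assumption: accumulate each score into a per-key list of the timeseries.
def append_to_dict(timeseries, scores):
    for key, value in scores.items():
        timeseries.setdefault(key, []).append(value)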
Example #2
    def test_rational_approximation_options(self):
        """Test that rational approximation options work as expected."""
        # Request a resampling factor of pi with default options.
        resampler = dsp.Resampler(np.pi, 1.0)
        self.assertEqual(resampler.rational_factor,
                         fractions.Fraction(355, 113))

        # Truncate continued fraction expansion at 3 terms.
        resampler = dsp.Resampler(np.pi,
                                  1.0,
                                  rational_approximation_max_terms=3)
        self.assertEqual(resampler.rational_factor,
                         fractions.Fraction(333,
                                            106))  # 3rd convergent [3; 7, 15].

        # Truncate when continued fraction residual is less than 0.1.
        resampler = dsp.Resampler(
            np.pi, 1.0, rational_approximation_convergence_tolerance=0.1)
        self.assertEqual(resampler.rational_factor,
                         fractions.Fraction(22, 7))  # 2nd convergent, [3; 7].
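The expected fractions are continued-fraction convergents of pi, whose expansion begins [3; 7, 15, 1, 292, ...]: two terms give 22/7, three give 333/106, and four give 355/113. A standalone sketch (not part of the library) that reproduces these values:

import fractions
import math

def convergents(x, max_terms):
    """Yield continued-fraction convergents of x (illustration only)."""
    h_nm1, h_nm2 = 1, 0  # Numerator recurrence state h_{n-1}, h_{n-2}.
    k_nm1, k_nm2 = 0, 1  # Denominator recurrence state k_{n-1}, k_{n-2}.
    for _ in range(max_terms):
        a = int(math.floor(x))
        h = a * h_nm1 + h_nm2
        k = a * k_nm1 + k_nm2
        yield fractions.Fraction(h, k)
        h_nm2, h_nm1 = h_nm1, h
        k_nm2, k_nm1 = k_nm1, k
        if x == a:
            break
        x = 1.0 / (x - a)

print(list(convergents(math.pi, 4)))
# [Fraction(3, 1), Fraction(22, 7), Fraction(333, 106), Fraction(355, 113)]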
Example #3
    def test_compare_with_reference_resampler(self):
        """Compare Resampler to _reference_resampling() implementation."""
        np.random.seed(0)

        for filter_radius_factor in (4.0, 5.0, 17.0):
            num_channels_list = (1, 2,
                                 3) if filter_radius_factor == 5.0 else (1, )
            for num_channels in num_channels_list:
                input_samples = -0.5 + np.random.rand(50, num_channels)
                for input_sample_rate_hz in RATES:
                    for output_sample_rate_hz in RATES:
                        options = {
                            'input_sample_rate_hz': input_sample_rate_hz,
                            'output_sample_rate_hz': output_sample_rate_hz,
                            'filter_radius_factor': filter_radius_factor
                        }
                        message = make_message(options)
                        resampler = dsp.Resampler(**options,
                                                  num_channels=num_channels)
                        self.assertEqual(resampler.num_channels,
                                         num_channels,
                                         msg=message)

                        output = resampler.process_samples(input_samples)

                        kernel = dsp.ResamplerKernel(**options)
                        self.assertAlmostEqual(float(
                            resampler.rational_factor),
                                               kernel.factor,
                                               delta=5e-4,
                                               msg=message)
                        self.assertEqual(resampler.flush_frames,
                                         2 * np.ceil(kernel.radius),
                                         msg=message)

                        expected = self._reference_resampling(
                            kernel, resampler.rational_factor, input_samples)
                        self.assertAlmostEqual(len(output),
                                               len(expected),
                                               delta=2,
                                               msg=message)

                        min_size = min(len(output), len(expected))
                        np.testing.assert_allclose(output[:min_size],
                                                   expected[:min_size],
                                                   atol=5e-7,
                                                   err_msg=message)
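The `_reference_resampling()` helper compared against above is not shown here. Conceptually, a reference resampler evaluates an interpolation kernel directly at each output time; the rough sketch below assumes a hypothetical pointwise kernel function `kernel_fn(t)` (t in input-sample units) with support `radius`, which is not the actual dsp.ResamplerKernel API:

import numpy as np

def naive_resample(input_samples, factor, kernel_fn, radius):
    """Brute-force kernel-interpolation resampling of 1-D input (sketch only)."""
    num_output = int(len(input_samples) / factor)
    output = np.zeros(num_output, dtype=input_samples.dtype)
    for m in range(num_output):
        center = m * factor  # Output sample m corresponds to input time m * factor.
        n_first = max(0, int(np.ceil(center - radius)))
        n_last = min(len(input_samples) - 1, int(np.floor(center + radius)))
        for n in range(n_first, n_last + 1):
            output[m] += input_samples[n] * kernel_fn(center - n)
    return output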
Example #4
    def test_resample_chirp(self):
        """Test Resampler on a chirp signal for various sample rates."""
        duration_s = 0.025

        for input_sample_rate_hz in RATES:
            max_frequency_hz = 0.45 * input_sample_rate_hz
            chirp_slope = max_frequency_hz / duration_s

            input_size = int(duration_s * input_sample_rate_hz)
            t = np.arange(input_size) / input_sample_rate_hz
            input_samples = np.sin(np.pi * chirp_slope * t**2).astype(
                np.float32)

            for output_sample_rate_hz in RATES:
                options = {
                    'input_sample_rate_hz': input_sample_rate_hz,
                    'output_sample_rate_hz': output_sample_rate_hz
                }
                message = make_message(options)
                resampler = dsp.Resampler(**options)
                # Run resampler on the chirp.
                output_samples = resampler.process_samples(input_samples)

                kernel = dsp.ResamplerKernel(**options)
                cutoff_hz = (kernel.radians_per_sample * input_sample_rate_hz /
                             (2 * np.pi))
                t = np.arange(len(output_samples)) / output_sample_rate_hz
                # Compute the chirp's instantaneous frequency at t.
                chirp_frequency_hz = chirp_slope * t

                # Expect the output to match the chirp below cutoff_hz and to
                # be near zero above it.
                expected = (
                    (chirp_frequency_hz < cutoff_hz) *
                    np.sin(np.pi * chirp_slope * t**2).astype(np.float32))
                # Skip samples in the transition between passband and stopband.
                mask = np.abs(chirp_frequency_hz -
                              cutoff_hz) >= 0.3 * cutoff_hz

                np.testing.assert_allclose(output_samples[mask],
                                           expected[mask],
                                           atol=0.04,
                                           err_msg=message)
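The line `chirp_frequency_hz = chirp_slope * t` follows from the chirp's phase: the phase is pi * chirp_slope * t**2, so the instantaneous frequency is its time derivative divided by 2*pi, i.e. chirp_slope * t. A quick standalone check of that step (the slope value is arbitrary):

import numpy as np

slope = 1000.0  # Hz per second, arbitrary value for the check.
t = np.linspace(0.0, 0.025, 10000)
phase = np.pi * slope * t**2
freq_hz = np.gradient(phase, t) / (2 * np.pi)  # Numerical d(phase)/dt / (2*pi).
assert np.allclose(freq_hz[1:-1], slope * t[1:-1], rtol=1e-3)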
Example #5
    def test_streaming_random_block_size(self):
        """Test Resampler streaming works by passing blocks of random sizes."""
        np.random.seed(0)
        input_samples = np.random.randn(500).astype(np.float32)
        max_block_size = 20

        for input_sample_rate_hz in RATES:
            for output_sample_rate_hz in RATES:
                options = {
                    'input_sample_rate_hz': input_sample_rate_hz,
                    'output_sample_rate_hz': output_sample_rate_hz
                }
                message = make_message(options)
                resampler = dsp.Resampler(**options)

                # Do "streaming" resampling, passing successive blocks of input.
                streaming = []
                n = 0
                while n < len(input_samples):
                    input_block_size = int(np.random.rand() * max_block_size)
                    input_block = input_samples[n:n + input_block_size]
                    n += input_block_size
                    # Resample the block.
                    output_block = resampler.process_samples(input_block)
                    streaming.append(output_block)

                streaming = np.hstack(streaming)

                resampler.reset()
                # Do "nonstreaming" resampling, processing the whole input at once.
                nonstreaming = resampler.process_samples(input_samples)

                # Streaming vs. nonstreaming outputs should match.
                np.testing.assert_allclose(streaming,
                                           nonstreaming,
                                           atol=1e-6,
                                           err_msg=message)
Example #6
    def test_resample_sine_wave(self):
        """Test Resampler on a sine wave for various sample rates."""
        frequency = 1100.7

        for input_sample_rate_hz in RATES:
            radians_per_sample = 2 * np.pi * frequency / input_sample_rate_hz
            input_samples = np.sin(radians_per_sample * np.arange(100))

            for output_sample_rate_hz in RATES:
                options = {
                    'input_sample_rate_hz': input_sample_rate_hz,
                    'output_sample_rate_hz': output_sample_rate_hz
                }
                message = make_message(options)
                resampler = dsp.Resampler(**options)
                # Run resampler on sine wave samples.
                output_samples = resampler.process_samples(input_samples)

                kernel = dsp.ResamplerKernel(input_sample_rate_hz,
                                             output_sample_rate_hz)
                expected_size = (len(input_samples) -
                                 kernel.radius) / kernel.factor
                self.assertAlmostEqual(len(output_samples),
                                       expected_size,
                                       delta=1.0,
                                       msg=message)

                radians_per_sample = 2 * np.pi * frequency / output_sample_rate_hz
                expected = np.sin(radians_per_sample *
                                  np.arange(len(output_samples)))
                # We ignore the first few output samples because they depend on input
                # samples at negative times, which are extrapolated as zeros.
                num_to_ignore = 1 + int(kernel.radius / kernel.factor)
                np.testing.assert_allclose(output_samples[num_to_ignore:],
                                           expected[num_to_ignore:],
                                           atol=0.005,
                                           err_msg=message)
Example #7
def process_one_wav_file(wav_file: str) -> Dict[str, List[np.ndarray]]:
  """Processes one WAV file to create observed frames.

  Processes one TIMIT WAV file with the frontend, and uses the associated label
  file to group observed frames by phone. Segments shorter than
  FLAGS.min_phone_length_s or with labels in PHONES_TO_EXCLUDE_FROM_DATASET are
  skipped.

  Audio channels are averaged (if there are multiple channels) to reduce to mono
  before processing.

  Args:
    wav_file: String, WAV file path.
  Returns:
    Examples dict with values of shape (num_examples, num_frames, num_channels).
    `examples[phone][i]` is the input for the ith example with label `phone`.
  """
  samples_orig, sample_rate_hz = dsp.read_wav_file(wav_file, dtype=np.float32)
  samples_orig = samples_orig.mean(axis=1)

  phone_times = phone_util.get_phone_times(
      phone_util.get_phone_label_filename(wav_file))
  frontend = carl_frontend.CarlFrontend(**get_frontend_params_from_flags())
  examples = collections.defaultdict(list)
  translation = 0

  for draw_index in range(FLAGS.num_draws):
    samples = np.copy(samples_orig)

    # Resample from sample_rate_hz to AUDIO_SAMPLE_RATE_HZ, perturbed up to
    # +/-max_resample_percent to change pitch and compress/dilate time.
    # TODO: For more data augmentation, consider changing pitch and
    # time stretching independently.
    dilation_factor = AUDIO_SAMPLE_RATE_HZ / sample_rate_hz
    if draw_index > 0:
      max_log_dilation = np.log(1.0 + FLAGS.max_resample_percent / 100.0)
      dilation_factor *= np.exp(
          np.random.uniform(-max_log_dilation, max_log_dilation))

    if abs(dilation_factor - 1.0) >= 1e-4:
      resampler = dsp.Resampler(1.0, dilation_factor, max_denominator=2000)
      samples = resampler.process_samples(samples)

    if draw_index > 0:
      # Prepend a random fraction of a block of silence. This randomizes the
      # input phase with respect to the frontend's decimation by block_size.
      translation = np.random.randint(FLAGS.block_size)
      samples = np.append(np.zeros(translation), samples)
      # Add white Gaussian noise.
      samples = np.random.normal(
          samples, FLAGS.noise_stddev).astype(np.float32)
      # Scale the samples to simulate the recording at a different distance.
      samples /= np.exp(np.random.uniform(
          np.log(FLAGS.min_simulated_distance),
          np.log(FLAGS.max_simulated_distance)))

    observed = phone_util.run_frontend(frontend, samples)

    for start, end, phone in phone_times:
      start = int(round(dilation_factor * start)) + translation
      end = min(int(round(dilation_factor * end)), len(samples)) + translation
      phone_length_s = float(end - start) / sample_rate_hz

      # Skip short (quickly-spoken) phone segments. They are likely influenced
      # by preceding/following phones, making classification less clear.
      if phone_length_s < FLAGS.min_phone_length_s:
        continue  # Skip short phone.

      phone = COALESCE_SIMILAR_PHONES.get(phone, phone)

      if phone in PHONES_TO_EXCLUDE_FROM_DATASET:
        continue

      # There may be confusing transitions (or possible labeling inaccuracy)
      # near the segment endpoints, so trim a fraction from each end.
      length = end - start
      start += int(round(length * FLAGS.phone_trim_left))
      end -= int(round(length * FLAGS.phone_trim_right))

      # Convert sample indices from audio sample rate to frame rate.
      start //= FLAGS.block_size
      end //= FLAGS.block_size

      left_context = FLAGS.num_frames_left_context
      # Extract a window every `hop` frames and append to examples.
      examples[phone].append(sliding_window(
          observed[max(0, start - left_context):end],
          window_size=left_context + 1,
          hop=FLAGS.downsample_factor // frontend.block_size))

  return examples
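A possible way to drive this function over a corpus (illustrative; `wav_files` and the surrounding pipeline are assumptions, not part of the source):

# Aggregate per-phone examples across many TIMIT WAV files.
all_examples = collections.defaultdict(list)
for wav_file in wav_files:  # Assumed iterable of WAV file paths.
  for phone, example_list in process_one_wav_file(wav_file).items():
    all_examples[phone].extend(example_list)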