Example #1
def process(self, name, X):
  # ====== not enough data points for sequencing ====== #
  if self.end == 'cut' and \
      any(x.shape[0] < self.frame_length for x in X):
    return None
  if self.end == 'ignore' and \
      any(x.shape[0] > self.frame_length for x in X):
    return None
  end = self.end
  if end == 'ignore':
    end = 'pad'
  # ====== preprocessing data-idx ====== #
  data_idx = axis_normalize(axis=self.data_idx, ndim=len(X),
                            return_tuple=True)
  # ====== segment X ====== #
  X_new = []
  for idx, x in enumerate(X):
    ## for data
    if idx in data_idx:
      if end == 'mix':
        x = segment_axis(a=x,
                         frame_length=self.frame_length,
                         step_length=self.step_length, axis=0,
                         end='cut' if x.shape[0] >= self.frame_length else 'pad',
                         pad_value=self.pad_value, pad_mode=self.pad_mode)
      else:
        x = segment_axis(a=x,
                         frame_length=self.frame_length,
                         step_length=self.step_length, axis=0,
                         end=end, pad_value=self.pad_value,
                         pad_mode=self.pad_mode)
    ## append for every index
    X_new.append(x)
  return name, X_new
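Here `segment_axis` slices an array into (possibly overlapping) frames along axis 0. Below is a minimal NumPy sketch of those semantics, assuming the same `end='cut'` / `end='pad'` behavior as the calls above; `segment_axis_sketch` is a hypothetical stand-in, not the library function.

import numpy as np

def segment_axis_sketch(a, frame_length, step_length, end='cut', pad_value=0):
  # slice `a` into frames of `frame_length` rows, moving `step_length` rows
  # per frame; end='pad' extends the tail with `pad_value`, end='cut' drops it
  # (assumes a.shape[0] >= frame_length when end='cut')
  n = a.shape[0]
  if end == 'pad':
    n_frames = max(int(np.ceil((n - frame_length) / step_length)) + 1, 1)
    total = (n_frames - 1) * step_length + frame_length
    if total > n:
      pad = np.full((total - n,) + a.shape[1:], pad_value, dtype=a.dtype)
      a, n = np.concatenate([a, pad], axis=0), total
  n_frames = (n - frame_length) // step_length + 1
  return np.stack([a[i * step_length:i * step_length + frame_length]
                   for i in range(n_frames)], axis=0)

x = np.arange(10)
print(segment_axis_sketch(x, frame_length=4, step_length=2).shape)   # (4, 4)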
Example #2
def process(self, name, X):
  # not enough data points for stacking
  if X[0].shape[0] < self.frame_length:
    return None
  data_idx, label_idx = _get_data_label_idx(
      self.data_idx, self.label_idx, len(X))
  # ====== stacking ====== #
  X_new = []
  for idx, x in enumerate(X):
    if idx in data_idx:
      if x.ndim == 1:
        x = np.expand_dims(x, axis=-1)
      x = stack_frames(x, frame_length=self.frame_length,
                       step_length=self.shift,
                       keep_length=self.keep_length)
    elif idx in label_idx:
      if not self.keep_length:
        x = segment_axis(x, frame_length=self.frame_length,
                         step_length=self.shift, axis=0,
                         end='cut')
        x = _apply_label_mode(x, self.label_mode)
      else:
        raise NotImplementedError  # TODO
    X_new.append(x)
  return name, X_new
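By contrast with segmenting, `stack_frames` appears to concatenate each window of `frame_length` consecutive feature vectors into one long vector (a context window). A rough sketch of that idea, assuming a (time, features) input; the library's exact `keep_length` handling may differ, and `stack_frames_sketch` is an illustrative name.

import numpy as np

def stack_frames_sketch(x, frame_length, step_length):
  # concatenate every `frame_length` consecutive rows of a (time, feat)
  # array into one flat vector, advancing `step_length` rows per output row
  n_frames = (x.shape[0] - frame_length) // step_length + 1
  return np.stack([x[i * step_length:i * step_length + frame_length].ravel()
                   for i in range(n_frames)], axis=0)

x = np.random.randn(100, 13)                 # e.g. 100 frames of MFCCs
print(stack_frames_sketch(x, 21, 1).shape)   # (80, 273): 21 * 13 = 273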
Example #3
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """
  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : Test indices
  test_dat : Data array
  all_speakers : list of all speaker in training set
  """
    # Load dataset
    frame_length = int(utt_length / FRAME_SHIFT)
    ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
    X = ds[feat]
    train_indices = {name: ds['indices'][name] for name in TRAIN_DATA.keys()}
    test_indices = {
        name: start_end
        for name, start_end in ds['indices'].items() if name not in TRAIN_DATA
    }
    train_indices, valid_indices = train_valid_test_split(
        x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Valid files:", ctext(len(valid_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]
    train_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=train_indices),
                            batch_mode='batch',
                            ncpu=7,
                            buffer_size=12)
    valid_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=valid_indices),
                            batch_mode='batch',
                            ncpu=2,
                            buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_dat = os.path.join(PATH_EXP,
                             'test_%s_%d.dat' % (feat, int(utt_length)))
    cache_ids = os.path.join(PATH_EXP,
                             'test_%s_%d.ids' % (feat, int(utt_length)))
    # validate cache files
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as f:
            ids = pickle.load(f)
        if len(ids) != len(test_indices):
            os.remove(cache_ids)
            if os.path.exists(cache_dat):
                os.remove(cache_dat)
    elif os.path.exists(cache_dat):
        os.remove(cache_dat)
    # caching
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat,
                         dtype='float16',
                         shape=(0, frame_length, X.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        s = 0
        for name, (start, end) in test_indices.items():
            y = X[start:end]
            y = segment_axis(y,
                             axis=0,
                             frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post')
            dat.append(y)
            # update indices
            ids[name] = (s, s + len(y))
            s += len(y)
            # update progress
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)
    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)
    # ====== save some samples ====== #
    sample_path = os.path.join(PATH_EXP,
                               'test_%s_%d.pdf' % (feat, int(utt_length)))
    V.plot_figure(nrow=9, ncol=6)
    for i, (name, (start, end)) in enumerate(
            sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                          k=12,
                          seed=87654321)):
        x = dat[start:end][:].astype('float32')
        ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                                ax=(12, 1, i + 1),
                                title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
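The caching block above reduces to: segment each test utterance into non-overlapping chunks, append them to one flat array, and record each utterance's (start, end) rows. Here is a self-contained sketch of that bookkeeping, with `np.save` standing in for `F.MmapData` and `cache_test_data` as an illustrative name:

import pickle
import numpy as np

def cache_test_data(test_indices, X, path_dat, path_ids, frame_length):
    # test_indices: {utterance_name: (start_row, end_row)} into X
    chunks, ids, s = [], {}, 0
    for name, (start, end) in test_indices.items():
        y = X[start:end]
        # pad to a multiple of frame_length, then reshape into
        # non-overlapping (n_frames, frame_length, n_features) chunks
        n_frames = int(np.ceil(len(y) / frame_length))
        pad = n_frames * frame_length - len(y)
        y = np.pad(y, [(0, pad), (0, 0)], constant_values=0)
        y = y.reshape(n_frames, frame_length, -1)
        chunks.append(y.astype('float16'))
        ids[name] = (s, s + len(y))   # rows of this utterance in the cache
        s += len(y)
    np.save(path_dat, np.concatenate(chunks, axis=0))
    with open(path_ids, 'wb') as f:
        pickle.dump(ids, f)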
Example #4
def prepare_dnn_data(recipe, feat, utt_length, seed=52181208):
  """
  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : Test indices
  test_dat : Data array
  all_speakers : list of all speaker in training set
  """
  # Load dataset
  frame_length = int(utt_length / FRAME_SHIFT)
  ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                 read_only=True)
  X = ds[feat]
  train_indices = {name: ds['indices'][name]
                   for name in TRAIN_DATA.keys()}
  test_indices = {name: start_end
                  for name, start_end in ds['indices'].items()
                  if name not in TRAIN_DATA}
  train_indices, valid_indices = train_valid_test_split(
      x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
  all_speakers = sorted(set(TRAIN_DATA.values()))
  n_speakers = max(all_speakers) + 1
  print("#Train files:", ctext(len(train_indices), 'cyan'))
  print("#Valid files:", ctext(len(valid_indices), 'cyan'))
  print("#Test files:", ctext(len(test_indices), 'cyan'))
  print("#Speakers:", ctext(n_speakers, 'cyan'))
  recipes = [
      F.recipes.Sequencing(frame_length=frame_length, step_length=frame_length,
                           end='pad', pad_value=0, pad_mode='post',
                           data_idx=0),
      F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
      F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
  ]
  train_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=train_indices),
      batch_mode='batch', ncpu=7, buffer_size=12)
  valid_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=valid_indices),
      batch_mode='batch', ncpu=2, buffer_size=4)
  train_feeder.set_recipes(recipes)
  valid_feeder.set_recipes(recipes)
  print(train_feeder)
  # ====== cache the test data ====== #
  cache_dat = os.path.join(PATH_EXP, 'test_%s_%d.dat' % (feat, int(utt_length)))
  cache_ids = os.path.join(PATH_EXP, 'test_%s_%d.ids' % (feat, int(utt_length)))
  # validate cache files
  if os.path.exists(cache_ids):
    with open(cache_ids, 'rb') as f:
      ids = pickle.load(f)
    if len(ids) != len(test_indices):
      os.remove(cache_ids)
      if os.path.exists(cache_dat):
        os.remove(cache_dat)
  elif os.path.exists(cache_dat):
    os.remove(cache_dat)
  # caching
  if not os.path.exists(cache_dat):
    dat = F.MmapData(cache_dat, dtype='float16',
                     shape=(0, frame_length, X.shape[1]))
    ids = {}
    prog = Progbar(target=len(test_indices))
    s = 0
    for name, (start, end) in test_indices.items():
      y = X[start:end]
      y = segment_axis(y, axis=0,
                       frame_length=frame_length, step_length=frame_length,
                       end='pad', pad_value=0, pad_mode='post')
      dat.append(y)
      # update indices
      ids[name] = (s, s + len(y))
      s += len(y)
      # update progress
      prog.add(1)
    dat.flush()
    dat.close()
    with open(cache_ids, 'wb') as f:
      pickle.dump(ids, f)
  # ====== re-load ====== #
  dat = F.MmapData(cache_dat, read_only=True)
  with open(cache_ids, 'rb') as f:
    ids = pickle.load(f)
  # ====== save some samples ====== #
  sample_path = os.path.join(PATH_EXP,
                             'test_%s_%d.pdf' % (feat, int(utt_length)))
  V.plot_figure(nrow=9, ncol=6)
  for i, (name, (start, end)) in enumerate(
      sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]), k=12, seed=52181208)):
    x = dat[start:end][:].astype('float32')
    ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                            ax=(12, 1, i + 1), title='')
    ax.set_title(name)
  V.plot_save(sample_path)
  return (train_feeder, valid_feeder,
          ids, dat, all_speakers)
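The recipe chain turns each (name, [features]) item into [segmented features, one-hot label]: Sequencing reshapes the features, Name2Label maps the name to an integer speaker id, and LabelOneHot expands that id. The one-hot step by itself is tiny; a sketch assuming integer class ids (`one_hot_sketch` is an illustrative name, not the library API):

import numpy as np

def one_hot_sketch(labels, nb_classes):
  # labels: 1-D array of integer class ids -> (len(labels), nb_classes)
  out = np.zeros((len(labels), nb_classes), dtype='float32')
  out[np.arange(len(labels)), labels] = 1.
  return out

print(one_hot_sketch(np.array([0, 2, 1]), nb_classes=4))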
Example #5
def test_func(s, sr, maximum_duration=30, minimum_duration=None,
              frame_length=256, nb_mixtures=3, threshold=0.3,
              return_vad=False, return_voices=False, return_cut=False):
    """ Splitting an audio based on VAD indicator.
    * The audio is segmented into multiple with length given by `frame_length`
    * Log-energy is calculated for each frames
    * Gaussian mixtures with `nb_mixtures` is fitted, and output vad indicator
      for each frames.
    * A flat window (ones-window) of `frame_length` is convolved with the
      vad indices.
    * All frames within the percentile >= `threshold` is treated as voiced.
    * The splitting process is greedy, frames is grouped until reaching the
      maximum duration

    Parameters
    ----------
    s: 1-D numpy.ndarray
        loaded audio array
    sr: int
        sample rate
    maximum_duration: float (second)
        maximum duration of each segments in seconds
    minimum_duration: None, or float (second)
        all segments below this length will be merged into longer segments,
        if None, any segments with half of the `maximum_duration`
        are considered.
    frame_length: int
        number of frames for windowing
    nb_mixtures: int
        number of Gaussian mixture for energy-based VAD (the higher
        the more overfitting).
    threshold: float (0. to 1.)
        The higher the values, the more frames are considered as voiced,
        this value is the lower percentile of voiced frames.
    return_vad: bool
        if True, return VAD confident values
    return_voices: bool
        if True, return the voices frames indices
    return_cut: bool
        if True, return the cut points of the audio.

    Return
    ------
    segments: list of audio arrays
    vad (optional): list of 0, 1 for VAD indices
    voices (optional): list of thresholded VAD for more precise voices frames.
    cut (optional): list of indicator 0, 1 (1 for the cut point)

    Note
    ----
    this function does not guarantee the output audio length is always
    smaller than `maximum_duration`, the higher the threshold, the better
    chance you get everything smaller than `maximum_duration`
    """
    frame_length = int(frame_length)
    maximum_duration = maximum_duration * sr
    results = []
    # ====== check if audio long enough ====== #
    if len(s) < maximum_duration:
        if return_cut or return_vad or return_voices:
            raise ValueError("Cannot return `cut` points, `vad` or `voices` since"
                        "the original audio is shorter than `maximum_duration`, "
                        "hence, no need for splitting.")
        return [s]
    maximum_duration /= frame_length
    if minimum_duration is None:
        minimum_duration = maximum_duration // 2
    else:
        minimum_duration = minimum_duration * sr / frame_length
        minimum_duration = np.clip(minimum_duration, 0., 0.99 * maximum_duration)
    # ====== start splitting ====== #
    frames = signal.segment_axis(s, frame_length, frame_length,
                                 axis=0, end='pad', endvalue=0.)
    energy = signal.get_energy(frames, log=True)
    vad = signal.vad_energy(energy, distrib_nb=nb_mixtures, nb_train_it=33)[0]
    vad = signal.smooth(vad, win=frame_length, window='flat')
    # explicitly return VAD
    if return_vad:
        results.append(vad)
    # ====== get all possible silences ====== #
    # all voiced indices
    indices = np.where(vad >= np.percentile(vad, q=threshold * 100))[0].tolist()
    if len(vad) - 1 not in indices:
        indices.append(len(vad) - 1)
    # explicitly return voiced frames
    if return_voices:
        tmp = np.zeros(shape=(len(vad),))
        tmp[indices] = 1
        results.append(tmp)
    # ====== splitting the audio ====== #
    segments = []
    start = 0
    prev_end = 0
    # greedily add new frames until reaching the desired maximum length
    for end in indices:
        # over-reach the maximum length
        if end - start > maximum_duration:
            segments.append((start, prev_end))
            start = prev_end
        # exact maximum length
        elif end - start == maximum_duration:
            segments.append((start, end))
            start = end
        prev_end = end
    # if NO segments were found, just return the original audio
    if len(segments) == 0:
        return [s]
    # add ending index if necessary
    if indices[-1] != segments[-1][-1]:
        segments.append((start, indices[-1]))
    # refining: short segments are merged into bigger ones
    found_under_length = True
    while found_under_length and len(segments) > 1:
        found_under_length = False
        new_segments = [segments[0]]
        for s2, e2 in segments[1:]:
            s1, e1 = new_segments[-1]
            # merge if either neighbor is shorter than minimum_duration
            if (e1 - s1) < minimum_duration or (e2 - s2) < minimum_duration:
                new_segments[-1] = (s1, e2)
                found_under_length = True
            # otherwise keep the segment as-is
            else:
                new_segments.append((s2, e2))
        segments = new_segments
    # explicitly return cut points
    if return_cut:
        tmp = np.zeros(shape=(segments[-1][-1] + 1,))
        for i, j in segments:
            tmp[i] = 1
            tmp[j] = 1
        results.append(tmp)
    # ====== convert everything to raw signal indices ====== #
    segments = [[i * frame_length, j * frame_length]
                for i, j in segments]
    segments[-1][-1] = s.shape[0]
    # cut segments out of raw audio array
    segments = [s[i:j] for i, j in segments]
    results = [segments] + results
    return results[0] if len(results) == 1 else results
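To see the two passes in isolation (greedy grouping, then merging short neighbors), here is a toy run on plain frame indices; `split_and_merge` is a hypothetical condensation of the logic above, using a single left-to-right merge pass:

def split_and_merge(indices, max_len, min_len):
    # pass 1: greedily grow (start, end) until adding the next voiced
    # index would exceed max_len
    segments, start, prev_end = [], 0, 0
    for end in indices:
        if end - start > max_len:
            segments.append((start, prev_end))
            start = prev_end
        elif end - start == max_len:
            segments.append((start, end))
            start = end
        prev_end = end
    if not segments or segments[-1][-1] != indices[-1]:
        segments.append((start, indices[-1]))
    # pass 2: fold any segment shorter than min_len into its left neighbor
    merged = [segments[0]]
    for s2, e2 in segments[1:]:
        s1, e1 = merged[-1]
        if (e1 - s1) < min_len or (e2 - s2) < min_len:
            merged[-1] = (s1, e2)
        else:
            merged.append((s2, e2))
    return merged

# voiced frame indices with a long silence gap between 40 and 90
print(split_and_merge([10, 20, 30, 40, 90, 100], max_len=50, min_len=25))
# -> [(0, 40), (40, 100)]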
Example #6
def test_func(s,
              sr,
              maximum_duration=30,
              minimum_duration=None,
              frame_length=256,
              nb_mixtures=3,
              threshold=0.3,
              return_vad=False,
              return_voices=False,
              return_cut=False):
    """ Splitting an audio based on VAD indicator.
    * The audio is segmented into multiple with length given by `frame_length`
    * Log-energy is calculated for each frames
    * Gaussian mixtures with `nb_mixtures` is fitted, and output vad indicator
      for each frames.
    * A flat window (ones-window) of `frame_length` is convolved with the
      vad indices.
    * All frames within the percentile >= `threshold` is treated as voiced.
    * The splitting process is greedy, frames is grouped until reaching the
      maximum duration

    Parameters
    ----------
    s: 1-D numpy.ndarray
        loaded audio array
    sr: int
        sample rate
    maximum_duration: float (second)
        maximum duration of each segments in seconds
    minimum_duration: None, or float (second)
        all segments below this length will be merged into longer segments,
        if None, any segments with half of the `maximum_duration`
        are considered.
    frame_length: int
        number of frames for windowing
    nb_mixtures: int
        number of Gaussian mixture for energy-based VAD (the higher
        the more overfitting).
    threshold: float (0. to 1.)
        The higher the values, the more frames are considered as voiced,
        this value is the lower percentile of voiced frames.
    return_vad: bool
        if True, return VAD confident values
    return_voices: bool
        if True, return the voices frames indices
    return_cut: bool
        if True, return the cut points of the audio.

    Return
    ------
    segments: list of audio arrays
    vad (optional): list of 0, 1 for VAD indices
    voices (optional): list of thresholded VAD for more precise voices frames.
    cut (optional): list of indicator 0, 1 (1 for the cut point)

    Note
    ----
    this function does not guarantee the output audio length is always
    smaller than `maximum_duration`, the higher the threshold, the better
    chance you get everything smaller than `maximum_duration`
    """
    frame_length = int(frame_length)
    maximum_duration = maximum_duration * sr
    results = []
    # ====== check if audio long enough ====== #
    if len(s) < maximum_duration:
        if return_cut or return_vad or return_voices:
            raise ValueError(
                "Cannot return `cut` points, `vad` or `voices` since "
                "the original audio is shorter than `maximum_duration`; "
                "no need for splitting.")
        return [s]
    maximum_duration /= frame_length
    if minimum_duration is None:
        minimum_duration = maximum_duration // 2
    else:
        minimum_duration = minimum_duration * sr / frame_length
        minimum_duration = np.clip(minimum_duration, 0.,
                                   0.99 * maximum_duration)
    # ====== start splitting ====== #
    frames = signal.segment_axis(s,
                                 frame_length,
                                 frame_length,
                                 axis=0,
                                 end='pad',
                                 endvalue=0.)
    energy = signal.get_energy(frames, log=True)
    vad = signal.vad_energy(energy, distrib_nb=nb_mixtures, nb_train_it=33)[0]
    vad = signal.smooth(vad, win=frame_length, window='flat')
    # explicitly return VAD
    if return_vad:
        results.append(vad)
    # ====== get all possible silences ====== #
    # all voiced indices
    indices = np.where(
        vad >= np.percentile(vad, q=threshold * 100))[0].tolist()
    if len(vad) - 1 not in indices:
        indices.append(len(vad) - 1)
    # explicitly return voiced frames
    if return_voices:
        tmp = np.zeros(shape=(len(vad), ))
        tmp[indices] = 1
        results.append(tmp)
    # ====== splitting the audio ====== #
    segments = []
    start = 0
    prev_end = 0
    # greedily add new frames until reaching the desired maximum length
    for end in indices:
        # over-reach the maximum length
        if end - start > maximum_duration:
            segments.append((start, prev_end))
            start = prev_end
        # exact maximum length
        elif end - start == maximum_duration:
            segments.append((start, end))
            start = end
        prev_end = end
    # if NO segments were found, just return the original audio
    if len(segments) == 0:
        return [s]
    # add ending index if necessary
    if indices[-1] != segments[-1][-1]:
        segments.append((start, indices[-1]))
    # refining: short segments are merged into bigger ones
    found_under_length = True
    while found_under_length and len(segments) > 1:
        found_under_length = False
        new_segments = [segments[0]]
        for s2, e2 in segments[1:]:
            s1, e1 = new_segments[-1]
            # merge if either neighbor is shorter than minimum_duration
            if (e1 - s1) < minimum_duration or (e2 - s2) < minimum_duration:
                new_segments[-1] = (s1, e2)
                found_under_length = True
            # otherwise keep the segment as-is
            else:
                new_segments.append((s2, e2))
        segments = new_segments
    # explicitly return cut points
    if return_cut:
        tmp = np.zeros(shape=(segments[-1][-1] + 1, ))
        for i, j in segments:
            tmp[i] = 1
            tmp[j] = 1
        results.append(tmp)
    # ====== convert everything to raw signal indices ====== #
    segments = [[i * frame_length, j * frame_length] for i, j in segments]
    segments[-1][-1] = s.shape[0]
    # cut segments out of raw audio array
    segments = [s[i:j] for i, j in segments]
    results = [segments] + results
    return results[0] if len(results) == 1 else results
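A hypothetical call, with synthetic noise standing in for real audio so the snippet is self-contained (a GMM-based VAD only produces meaningful output on real speech, so treat this purely as a shape/ordering illustration):

import numpy as np

sr = 16000
s = np.random.randn(90 * sr).astype('float32')   # stand-in for 90 s of audio
out = test_func(s, sr, maximum_duration=30, threshold=0.3,
                return_vad=True, return_cut=True)
segments, vad, cut = out[0], out[1], out[2]      # results keep this order
print([len(seg) / sr for seg in segments])       # per-segment durations (s)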