Example #1
def validating_noise_data(in_path_raw):
    # preparing
    noise_dataset = ['musan', 'rirs']
    all_files = defaultdict(list)
    n_files = sum(
        len(sre_file_list[i]) for i in noise_dataset if i in sre_file_list)
    n_non_exist = 0
    n_exist = 0
    prog = Progbar(target=n_files,
                   print_summary=True,
                   name="Validating noise dataset")
    prog.set_summarizer(key='#Non-exist', fn=lambda x: x[-1])
    prog.set_summarizer(key='#Exist', fn=lambda x: x[-1])
    # check all dataset
    for ds_name in noise_dataset:
        if ds_name not in sre_file_list:
            continue
        if ds_name not in in_path_raw:
            continue
        base_path = in_path_raw[ds_name]
        base_ds = all_files[ds_name]
        # start validating
        for row in sre_file_list[ds_name]:
            # check file
            path, channel, name, noise_type, duration = row[:5]
            path = os.path.join(base_path, path)
            if os.path.exists(path):
                base_ds.append([path, channel, name, noise_type, duration])
                n_exist += 1
            else:
                n_non_exist += 1
            # update progress
            prog['ds'] = ds_name
            prog['#Exist'] = n_exist
            prog['#Non-exist'] = n_non_exist
            prog.add(1)
    # ====== return ====== #
    # Header:
    #  0       1      2         3         4
    # path, channel, name, noise_type, duration
    return {
        key: np.array(sorted(val, key=lambda x: x[0]))
        for key, val in all_files.items()
    }
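
A minimal usage sketch (hypothetical paths; it assumes the module-level `sre_file_list` mapping is already loaded):

# Hypothetical call: `in_path_raw` maps dataset names to local root folders;
# only keys that also appear in `sre_file_list` are validated.
noise_files = validating_noise_data(
    in_path_raw={'musan': '/data/musan', 'rirs': '/data/RIRS_NOISES'})
for ds_name, rows in noise_files.items():
    # each row is [path, channel, name, noise_type, duration]
    print(ds_name, rows.shape)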
Example #2
def validating_noise_data(in_path_raw):
  # preparing
  noise_dataset = ['musan', 'rirs']
  all_files = defaultdict(list)
  n_files = sum(len(sre_file_list[i])
                for i in noise_dataset
                if i in sre_file_list)
  n_non_exist = 0
  n_exist = 0
  prog = Progbar(target=n_files, print_summary=True,
                 name="Validating noise dataset")
  prog.set_summarizer(key='#Non-exist', fn=lambda x: x[-1])
  prog.set_summarizer(key='#Exist', fn=lambda x: x[-1])
  # check all dataset
  for ds_name in noise_dataset:
    if ds_name not in sre_file_list:
      continue
    if ds_name not in in_path_raw:
      continue
    base_path = in_path_raw[ds_name]
    base_ds = all_files[ds_name]
    # start validating
    for row in sre_file_list[ds_name]:
      # check file
      path, channel, name, noise_type, duration = row[:5]
      path = os.path.join(base_path, path)
      if os.path.exists(path):
        base_ds.append([path, channel, name, noise_type, duration])
        n_exist += 1
      else:
        n_non_exist += 1
      # update progress
      prog['ds'] = ds_name
      prog['#Exist'] = n_exist
      prog['#Non-exist'] = n_non_exist
      prog.add(1)
  # ====== return ====== #
  # Header:
  #  0       1      2         3         4
  # path, channel, name, noise_type, duration
  return {key: np.array(sorted(val, key=lambda x: x[0]))
          for key, val in all_files.items()}
Example #3
def filter_utterances(X,
                      indices,
                      spkid,
                      min_dur=None,
                      min_utt=None,
                      remove_min_length=True,
                      remove_min_uttspk=True,
                      n_speakers=None,
                      ncpu=None,
                      save_path=None,
                      title=''):
    """
  X : 2-D matrix
    input features

  indices : Mapping
    utterance_name -> (start, end) in `X`

  spkid : Mapping
    utterance_name -> speaker_id

  remove_min_length : bool (default: True)
    if True, remove all files shorter than MINIMUM_UTT_DURATION

  remove_min_uttspk : bool (default: True)
    if True, remove all speakers with fewer utterances than
    MINIMUM_UTT_PER_SPEAKERS

  n_speakers : {None, int} (default: None)
    if given, downsample the dataset to the given number of speakers

  save_path : {None, str} (default: None)
    if given, pickle all filtered files to disk

  """
    if min_dur is None:
        min_dur = MINIMUM_UTT_DURATION
    if min_utt is None:
        min_utt = MINIMUM_UTT_PER_SPEAKERS

    minimum_amount_of_frames = min_dur / Config.STEP_LENGTH
    save_data = {}

    prog = Progbar(target=len(indices),
                   print_report=True,
                   print_summary=True,
                   name='Filtering broken utterances: %s' % title)
    prog.set_summarizer('zero-length', fn=lambda x: x[-1])
    prog.set_summarizer('min-frames', fn=lambda x: x[-1])
    prog.set_summarizer('zero-var', fn=lambda x: x[-1])
    prog.set_summarizer('small-var', fn=lambda x: x[-1])
    prog.set_summarizer('overflow', fn=lambda x: x[-1])

    # ====== mpi function for checking ====== #
    @nb.jit(nopython=True, nogil=True)
    def _fast_mean_var_ax0(z):
        # computing mean and variance with this function can double the
        # speed, but it cannot detect overflow; it only accepts float32
        # or float64 input
        s1 = np.zeros(shape=(z.shape[1], ), dtype=z.dtype)
        s2 = np.zeros(shape=(z.shape[1], ), dtype=z.dtype)
        for i in range(z.shape[0]):
            s1 += z[i]
            s2 += np.power(z[i], 2)
        mean = s1 / z.shape[0]
        var = s2 / z.shape[0] - np.power(mean, 2)
        return mean, var

    def _mpi_func(jobs):
        for name, (start, end) in jobs:
            y = X[start:end]
            # flags
            is_zero_len = False
            is_zero_var = False
            is_small_var = False
            is_min_frames = False
            is_overflow = False
            # checking length
            if y.shape[0] == 0:
                is_zero_len = True
            elif y.shape[0] < minimum_amount_of_frames:
                is_min_frames = True
            # checking statistics
            else:
                with catch_warnings_error(RuntimeWarning):
                    try:
                        # mean = np.mean(y, axis=-1)
                        var = np.var(y, axis=-1)
                        # min_val = np.min(y, axis=-1)
                        # max_val = np.max(y, axis=-1)
                    # numerically unstable
                    except RuntimeWarning as w:
                        if 'overflow encountered' in str(w):
                            is_overflow = True
                        else:
                            print(name, ':', w)
                    # process with more numerical filtering
                    else:
                        if np.any(np.isclose(var, 0)):
                            is_zero_var = True
                        # very heuristic and aggressive here:
                        # filter out any utterance where more than ~16.67%
                        # of frames have low variance; this could remove
                        # 1/3 of the original data
                        if np.sum(var < 0.01) > (len(y) / 6):
                            is_small_var = True
            # return the flags
            yield (name, is_zero_len, is_min_frames, is_zero_var, is_small_var,
                   is_overflow)

    # ====== running the multiprocessing filter ====== #
    zero_len_files = {}
    min_frame_files = {}
    zero_var_files = {}
    small_var_files = {}
    overflow_files = {}
    for res in mpi.MPI(jobs=sorted(indices.items(), key=lambda x: x[1][0]),
                       func=_mpi_func,
                       ncpu=NCPU if ncpu is None else int(ncpu),
                       batch=250):
        name = res[0]
        if res[1]: zero_len_files[name] = 1
        if res[2]: min_frame_files[name] = 1
        if res[3]: zero_var_files[name] = 1
        if res[4]: small_var_files[name] = 1
        if res[5]: overflow_files[name] = 1
        # update progress
        prog['name'] = name[:48]
        prog['zero-length'] = len(zero_len_files)
        prog['min-frames'] = len(min_frame_files)
        prog['zero-var'] = len(zero_var_files)
        prog['small-var'] = len(small_var_files)
        prog['overflow'] = len(overflow_files)
        prog.add(1)
    # ====== remove broken files ====== #
    if not bool(remove_min_length):
        min_frame_files = {}
    new_indices = {
        name: (start, end)
        for name, (start, end) in indices.items() if name not in zero_len_files
        and name not in min_frame_files and name not in zero_var_files
        and name not in small_var_files and name not in overflow_files
    }
    print("Filtered #utterances: %s/%s (files)" %
          (ctext(len(indices) - len(new_indices),
                 'lightcyan'), ctext(len(indices), 'cyan')))
    indices = new_indices
    # ====== store save data ====== #
    save_data['zero_len'] = zero_len_files
    save_data['min_dur'] = min_frame_files
    save_data['zero_var'] = zero_var_files
    save_data['small_var'] = small_var_files
    save_data['overflow'] = overflow_files
    # ====== filter-out by number of utt-per-speaker ====== #
    if bool(remove_min_uttspk):
        spk2utt = defaultdict(list)
        for name in indices.keys():
            spk2utt[spkid[name]].append(name)

        n_utt_removed = 0
        n_spk_removed = 0
        removed_utt = []
        keep_utt = []
        for spk, utt in spk2utt.items():
            if len(utt) < min_utt:
                n_utt_removed += len(utt)
                n_spk_removed += 1
                removed_utt += utt
            else:
                keep_utt += utt

        removed_utt = set(removed_utt)
        keep_utt = set(keep_utt)
        save_data['min_utt'] = removed_utt

        print("Removed min-utt/spk:  %s/%s(utt)  %s/%s(spk)" %
              (ctext(n_utt_removed, 'lightcyan'), ctext(len(indices), 'cyan'),
               ctext(n_spk_removed, 'lightcyan'), ctext(len(spk2utt), 'cyan')))
        assert len(indices) == n_utt_removed + len(keep_utt), "Not possible!"

        indices = {
            name: (start, end)
            for name, (start, end) in indices.items() if name in keep_utt
        }
    # ====== sample by number of speakers ====== #
    if isinstance(n_speakers, Number) and n_speakers > 0:
        spk2utt = defaultdict(list)
        for name, (start, end) in indices.items():
            spk2utt[spkid[name]].append((name, (start, end)))

        n_org_spk = len(spk2utt)
        n_org_ids = len(indices)
        # only need down-sampling when fewer speakers are requested
        if n_speakers < n_org_spk:
            rand = np.random.RandomState(seed=Config.SUPER_SEED)
            tmp = list(spk2utt.keys())
            rand.shuffle(tmp)
            sampled_spk = tmp[:n_speakers]

            indices = []
            for spk in sampled_spk:
                indices += spk2utt[spk]
            indices = dict(indices)
        else:
            sampled_spk = spk2utt
        # print some log
        print("Selected: %s/%s(spk) which have %s/%s(utt)" %
              (ctext(len(sampled_spk), 'lightcyan'), ctext(n_org_spk, 'cyan'),
               ctext(len(indices), 'lightcyan'), ctext(n_org_ids, 'cyan')))
    # ====== return the new indices ====== #
    if save_path is not None:
        try:
            with open(save_path, 'wb') as save_file:
                pickle.dump(save_data, save_file)
        except Exception as e:
            print("Cannot save filtering data to path: '%s', error: '%s'" %
                  (save_path, str(e)))
    return indices
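
A sketch of a possible call, assuming `features` is the 2-D feature matrix, `utt2range` maps utterance names to (start, end) rows in it, and `utt2spk` maps utterance names to speaker ids; the thresholds and save path below are illustrative, not from the source:

# Keep utterances of at least 8 seconds and speakers with at least 8
# utterances, down-sample to 500 speakers, and dump the removal report.
filtered = filter_utterances(X=features,
                             indices=utt2range,
                             spkid=utt2spk,
                             min_dur=8,
                             min_utt=8,
                             n_speakers=500,
                             save_path='/tmp/filter_report.pkl',
                             title='train')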
Example #4
def validating_training_data(in_path_raw, training_dataset):
    file_list = {
        ds: sre_file_list[ds]
        for ds in training_dataset if ds in sre_file_list
    }
    # ====== meta info ====== #
    all_files = []
    non_exist_files = []
    extension_count = defaultdict(int)
    total_data = sum(v.shape[0] for k, v in file_list.items()
                     if k not in ('musan', 'rirs'))
    # ====== progress ====== #
    prog = Progbar(target=total_data,
                   print_summary=True,
                   print_report=True,
                   name="Preprocessing File List")
    prog.set_summarizer('#Files', fn=lambda x: x[-1])
    prog.set_summarizer('#Non-exist', fn=lambda x: x[-1])
    # ====== iterating ====== #
    for ds_name, data in sorted(file_list.items(), key=lambda x: x[0]):
        if ds_name in ('musan', 'rirs'):
            continue
        for row in data:
            path, channel, name, spkid = row[:4]
            assert channel in ('0', '1')
            # check path provided
            if ds_name in in_path_raw:
                path = os.path.join(in_path_raw[ds_name], path)
            # create new row
            start_time = '-'
            end_time = '-'
            if ds_name == 'mx6':
                start_time, end_time = row[-2:]
            new_row = [
                path, channel, name, ds_name + '_' + spkid, ds_name,
                start_time, end_time
            ]
            # check file exist
            if os.path.exists(path):
                all_files.append(new_row)
            else:
                non_exist_files.append(new_row)
            # extension
            ext = os.path.splitext(path)[-1]
            extension_count[ext + '-' + ds_name] += 1
            # update progress
            prog['Dataset'] = ds_name
            prog['#Files'] = len(all_files)
            prog['#Non-exist'] = len(non_exist_files)
            prog.add(1)
    # final results
    all_files = np.array(all_files)
    if len(all_files) == 0:
        return all_files, np.array(non_exist_files), extension_count
    # ====== check no duplicated name ====== #
    n_files = len(all_files)
    n_unique_files = len(np.unique(all_files[:, 2]))
    assert n_files == n_unique_files, \
    'Found duplicated name: %d != %d' % (n_files, n_unique_files)
    # ====== check no duplicated speaker ====== #
    n_spk = sum(
        len(np.unique(dat[:, 3])) for name, dat in file_list.items()
        if name not in ('musan', 'rirs'))
    n_unique_spk = len(np.unique(all_files[:, 3]))
    assert n_spk == n_unique_spk, \
    'Found duplicated speakers: %d != %d' % (n_spk, n_unique_spk)
    # ====== return ====== #
    # Header:
    #  0       1      2      3       4          5         6
    # path, channel, name, spkid, dataset, start_time, end_time
    return all_files, np.array(non_exist_files), extension_count
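
A hypothetical invocation with placeholder dataset names and roots; the dataset keys must exist in `sre_file_list` for anything to be validated:

# Returned rows follow the header:
# path, channel, name, spkid, dataset, start_time, end_time
files, missing, ext_count = validating_training_data(
    in_path_raw={'swb': '/data/swb', 'mx6': '/data/mx6'},
    training_dataset=['swb', 'mx6'])
print("Found %d files, %d missing" % (len(files), len(missing)))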
Example #5
def filter_utterances(X, indices, spkid,
                      min_dur=None, min_utt=None,
                      remove_min_length=True, remove_min_uttspk=True,
                      n_speakers=None, ncpu=None, save_path=None,
                      title=''):
  """
  X : 2-D matrix
    input features

  indices : Mapping
    utterance_name -> (start, end) in `X`

  spkid : Mapping
    utterance_name -> speaker_id

  remove_min_length : bool (default: True)
    if True, remove all files shorter than MINIMUM_UTT_DURATION

  remove_min_uttspk : bool (default: True)
    if True, remove all speakers with fewer utterances than
    MINIMUM_UTT_PER_SPEAKERS

  n_speakers : {None, int} (default: None)
    if given, downsample the dataset to the given number of speakers

  save_path : {None, str} (default: None)
    if given, pickle all filtered files to disk

  """
  if min_dur is None:
    min_dur = MINIMUM_UTT_DURATION
  if min_utt is None:
    min_utt = MINIMUM_UTT_PER_SPEAKERS

  minimum_amount_of_frames = min_dur / Config.STEP_LENGTH
  save_data = {}

  prog = Progbar(target=len(indices),
                 print_report=True, print_summary=True,
                 name='Filtering broken utterances: %s' % title)
  prog.set_summarizer('zero-length', fn=lambda x: x[-1])
  prog.set_summarizer('min-frames', fn=lambda x: x[-1])
  prog.set_summarizer('zero-var', fn=lambda x: x[-1])
  prog.set_summarizer('small-var', fn=lambda x: x[-1])
  prog.set_summarizer('overflow', fn=lambda x: x[-1])

  # ====== mpi function for checking ====== #
  @nb.jit(nopython=True, nogil=True)
  def _fast_mean_var_ax0(z):
    # computing mean and variance with this function can double the
    # speed, but it cannot detect overflow; it only accepts float32
    # or float64 input
    s1 = np.zeros(shape=(z.shape[1],), dtype=z.dtype)
    s2 = np.zeros(shape=(z.shape[1],), dtype=z.dtype)
    for i in range(z.shape[0]):
      s1 += z[i]
      s2 += np.power(z[i], 2)
    mean = s1 / z.shape[0]
    var = s2 / z.shape[0] - np.power(mean, 2)
    return mean, var

  def _mpi_func(jobs):
    for name, (start, end) in jobs:
      y = X[start:end]
      # flags
      is_zero_len = False
      is_zero_var = False
      is_small_var = False
      is_min_frames = False
      is_overflow = False
      # checking length
      if y.shape[0] == 0:
        is_zero_len = True
      elif y.shape[0] < minimum_amount_of_frames:
        is_min_frames = True
      # checking statistics
      else:
        with catch_warnings_error(RuntimeWarning):
          try:
            # mean = np.mean(y, axis=-1)
            var = np.var(y, axis=-1)
            # min_val = np.min(y, axis=-1)
            # max_val = np.max(y, axis=-1)
          # numerically unstable
          except RuntimeWarning as w:
            if 'overflow encountered' in str(w):
              is_overflow = True
            else:
              print(name, ':', w)
          # process with more numerical filtering
          else:
            if np.any(np.isclose(var, 0)):
              is_zero_var = True
            # very heuristic and aggressive here:
            # filter out any utterance where more than ~16.67% of frames
            # have low variance; this could remove 1/3 of the original data
            if np.sum(var < 0.01) > (len(y) / 6):
              is_small_var = True
      # return the flags
      yield (name, is_zero_len, is_min_frames,
             is_zero_var, is_small_var,
             is_overflow)
  # ====== running the multiprocessing filter ====== #
  zero_len_files = {}
  min_frame_files = {}
  zero_var_files = {}
  small_var_files = {}
  overflow_files = {}
  for res in mpi.MPI(jobs=sorted(indices.items(),
                                 key=lambda x: x[1][0]),
                     func=_mpi_func,
                     ncpu=NCPU if ncpu is None else int(ncpu),
                     batch=250):
    name = res[0]
    if res[1]: zero_len_files[name] = 1
    if res[2]: min_frame_files[name] = 1
    if res[3]: zero_var_files[name] = 1
    if res[4]: small_var_files[name] = 1
    if res[5]: overflow_files[name] = 1
    # update progress
    prog['name'] = name[:48]
    prog['zero-length'] = len(zero_len_files)
    prog['min-frames'] = len(min_frame_files)
    prog['zero-var'] = len(zero_var_files)
    prog['small-var'] = len(small_var_files)
    prog['overflow'] = len(overflow_files)
    prog.add(1)
  # ====== remove broken files ====== #
  if not bool(remove_min_length):
    min_frame_files = {}
  new_indices = {name: (start, end)
                 for name, (start, end) in indices.items()
                 if name not in zero_len_files and
                 name not in min_frame_files and
                 name not in zero_var_files and
                 name not in small_var_files and
                 name not in overflow_files}
  print("Filtered #utterances: %s/%s (files)" %
    (ctext(len(indices) - len(new_indices), 'lightcyan'),
     ctext(len(indices), 'cyan')))
  indices = new_indices
  # ====== store save data ====== #
  save_data['zero_len'] = zero_len_files
  save_data['min_dur'] = min_frame_files
  save_data['zero_var'] = zero_var_files
  save_data['small_var'] = small_var_files
  save_data['overflow'] = overflow_files
  # ====== filter-out by number of utt-per-speaker ====== #
  if bool(remove_min_uttspk):
    spk2utt = defaultdict(list)
    for name in indices.keys():
      spk2utt[spkid[name]].append(name)

    n_utt_removed = 0
    n_spk_removed = 0
    removed_utt = []
    keep_utt = []
    for spk, utt in spk2utt.items():
      if len(utt) < min_utt:
        n_utt_removed += len(utt)
        n_spk_removed += 1
        removed_utt += utt
      else:
        keep_utt += utt

    removed_utt = set(removed_utt)
    keep_utt = set(keep_utt)
    save_data['min_utt'] = removed_utt

    print("Removed min-utt/spk:  %s/%s(utt)  %s/%s(spk)" % (
        ctext(n_utt_removed, 'lightcyan'), ctext(len(indices), 'cyan'),
        ctext(n_spk_removed, 'lightcyan'), ctext(len(spk2utt), 'cyan')
    ))
    assert len(indices) == n_utt_removed + len(keep_utt), "Not possible!"

    indices = {name: (start, end)
               for name, (start, end) in indices.items()
               if name in keep_utt}
  # ====== sample by number of speakers ====== #
  if isinstance(n_speakers, Number) and n_speakers > 0:
    spk2utt = defaultdict(list)
    for name, (start, end) in indices.items():
      spk2utt[spkid[name]].append((name, (start, end)))

    n_org_spk = len(spk2utt)
    n_org_ids = len(indices)
    # only need down-sampling when fewer speakers are requested
    if n_speakers < n_org_spk:
      rand = np.random.RandomState(seed=Config.SUPER_SEED)
      tmp = list(spk2utt.keys())
      rand.shuffle(tmp)
      sampled_spk = tmp[:n_speakers]

      indices = []
      for spk in sampled_spk:
        indices += spk2utt[spk]
      indices = dict(indices)
    else:
      sampled_spk = spk2utt
    # print some log
    print("Selected: %s/%s(spk) which have %s/%s(utt)" % (
        ctext(len(sampled_spk), 'lightcyan'), ctext(n_org_spk, 'cyan'),
        ctext(len(indices), 'lightcyan'), ctext(n_org_ids, 'cyan')
    ))
  # ====== return the new indices ====== #
  if save_path is not None:
    try:
      with open(save_path, 'wb') as save_file:
        pickle.dump(save_data, save_file)
    except Exception as e:
      print("Cannot save filtering data to path: '%s', error: '%s'" %
        (save_path, str(e)))
  return indices
Example #6
def validating_training_data(in_path_raw, training_dataset):
  file_list = {ds: sre_file_list[ds]
               for ds in training_dataset
               if ds in sre_file_list}
  # ====== meta info ====== #
  all_files = []
  non_exist_files = []
  extension_count = defaultdict(int)
  total_data = sum(v.shape[0]
                   for k, v in file_list.items()
                   if k not in ('musan', 'rirs'))
  # ====== progress ====== #
  prog = Progbar(target=total_data,
                 print_summary=True, print_report=True,
                 name="Preprocessing File List")
  prog.set_summarizer('#Files', fn=lambda x: x[-1])
  prog.set_summarizer('#Non-exist', fn=lambda x: x[-1])
  # ====== iterating ====== #
  for ds_name, data in sorted(file_list.items(),
                              key=lambda x: x[0]):
    if ds_name in ('musan', 'rirs'):
      continue
    for row in data:
      path, channel, name, spkid = row[:4]
      assert channel in ('0', '1')
      # check path provided
      if ds_name in in_path_raw:
        path = os.path.join(in_path_raw[ds_name], path)
      # create new row
      start_time = '-'
      end_time = '-'
      if ds_name == 'mx6':
        start_time, end_time = row[-2:]
      new_row = [path, channel, name,
                 ds_name + '_' + spkid, ds_name,
                 start_time, end_time]
      # check file exist
      if os.path.exists(path):
        all_files.append(new_row)
      else:
        non_exist_files.append(new_row)
      # extension
      ext = os.path.splitext(path)[-1]
      extension_count[ext + '-' + ds_name] += 1
      # update progress
      prog['Dataset'] = ds_name
      prog['#Files'] = len(all_files)
      prog['#Non-exist'] = len(non_exist_files)
      prog.add(1)
  # final results
  all_files = np.array(all_files)
  if len(all_files) == 0:
    return all_files, np.array(non_exist_files), extension_count
  # ====== check no duplicated name ====== #
  n_files = len(all_files)
  n_unique_files = len(np.unique(all_files[:, 2]))
  assert n_files == n_unique_files, \
  'Found duplicated name: %d != %d' % (n_files, n_unique_files)
  # ====== check no duplicated speaker ====== #
  n_spk = sum(len(np.unique(dat[:, 3]))
              for name, dat in file_list.items()
              if name not in ('musan', 'rirs'))
  n_unique_spk = len(np.unique(all_files[:, 3]))
  assert n_spk == n_unique_spk, \
  'Found duplicated speakers: %d != %d' % (n_spk, n_unique_spk)
  # ====== return ====== #
  # Header:
  #  0       1      2      3       4          5         6
  # path, channel, name, spkid, dataset, start_time, end_time
  return all_files, np.array(non_exist_files), extension_count