Example #1
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """
  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : Test indices
  test_dat : Data array
  all_speakers : list of all speaker in training set
  """
    # Load dataset
    frame_length = int(utt_length / FRAME_SHIFT)
    ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
    X = ds[feat]
    train_indices = {name: ds['indices'][name] for name in TRAIN_DATA.keys()}
    test_indices = {
        name: start_end
        for name, start_end in ds['indices'].items() if name not in TRAIN_DATA
    }
    train_indices, valid_indices = train_valid_test_split(
        x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
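    # TRAIN_DATA is assumed to map file name -> zero-based integer speaker label,
    # so the largest label + 1 gives the number of classes for one-hot encoding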
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Valid files:", ctext(len(valid_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]
    train_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=train_indices),
                            batch_mode='batch',
                            ncpu=7,
                            buffer_size=12)
    valid_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=valid_indices),
                            batch_mode='batch',
                            ncpu=2,
                            buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_dat = os.path.join(PATH_EXP,
                             'test_%s_%d.dat' % (feat, int(utt_length)))
    cache_ids = os.path.join(PATH_EXP,
                             'test_%s_%d.ids' % (feat, int(utt_length)))
    # validate cache files
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as f:
            ids = pickle.load(f)
        if len(ids) != len(test_indices):
            os.remove(cache_ids)
            if os.path.exists(cache_dat):
                os.remove(cache_dat)
    elif os.path.exists(cache_dat):
        os.remove(cache_dat)
    # caching
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat,
                         dtype='float16',
                         shape=(0, frame_length, X.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        s = 0
        for name, (start, end) in test_indices.items():
            y = X[start:end]
            y = segment_axis(y,
                             axis=0,
                             frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post')
            dat.append(y)
            # update indices
            ids[name] = (s, s + len(y))
            s += len(y)
            # update progress
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)
    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)
    # ====== save some samples ====== #
    sample_path = os.path.join(PATH_EXP,
                               'test_%s_%d.pdf' % (feat, int(utt_length)))
    V.plot_figure(nrow=9, ncol=6)
    for i, (name, (start, end)) in enumerate(
            sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                          k=12,
                          seed=87654321)):
        x = dat[start:end][:].astype('float32')
        ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                                ax=(12, 1, i + 1),
                                title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
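A minimal usage sketch (an assumption, not part of the original source): 'mspec' is a hypothetical recipe/feature name, and the module-level constants (PATH_ACOUSTIC_FEAT, PATH_EXP, TRAIN_DATA, FRAME_SHIFT) are assumed to be configured elsewhere.
# hedged usage sketch -- names and values below are illustrative only
train, valid, test_ids, test_dat, speakers = prepare_dnn_data(
    recipe='mspec', feat='mspec', utt_length=3.0)
# in 'batch' mode the feeders are assumed to yield the recipe outputs (X, y):
# X of shape (batch, frame_length, n_features), y a one-hot speaker matrix
for X, y in train.set_batch(batch_size=64, seed=1234):
    break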
Example #2
def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
    """

  Returns (i-vector)
  ------------------
  ds[feat]
  train_files
  y_train
  test_files
  y_test
  labels

  Returns (x-vector)
  ------------------
  train : Feeder
    feeder for training data for iterating over pair of (X, y)
  valid : Feeder
    feeder for validating data for iterating over pair of (X, y)
  X_test_name : list of file names
    file names are append with '.%d' for cut segment ID
  X_test_true : list of integer
    label of each sample
  X_test_data : array
    list of test data same length as X_test_name
  labels : list of string
    list of labels for classification task

  Example
  -------
  (train, valid,
   X_test_name, X_test_true, X_test_data,
   labels) = prepare_data_dnn(feat=FEAT, label='gender')

  """
    label = str(label).lower()
    assert label in _support_label, "No support for label: %s" % label
    assert 0 < utt_length <= 1.
    # ====== load dataset ====== #
    if not os.path.exists(PATH_ACOUSTIC):
        raise RuntimeError(
            "Cannot find extracted acoustic features at path: '%s',"
            "run the code speech_features_extraction.py!" % PATH_ACOUSTIC)
    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    assert feat in ds, "Cannot find feature with name: %s" % feat
    indices = list(ds['indices'].items())
    K.get_rng().shuffle(indices)

    # ====== helper ====== #
    def is_train(x):
        return x.split('_')[0] == 'train'

    def extract_label(x):
        return x.split('_')[_support_label[label]]

    print("Task:", ctext(label, 'cyan'))
    fn_label, labels = unique_labels([i[0] for i in indices],
                                     key_func=extract_label,
                                     return_labels=True)
    print("Labels:", ctext(labels, 'cyan'))
    # ====== training and test data ====== #
    train_files = []  # (name, (start, end)) ...
    test_files = []
    for name, (start, end) in indices:
        if is_train(name):
            train_files.append((name, (start, end)))
        else:
            test_files.append((name, (start, end)))
    # name for each dataset, useful for later
    print("#Train:", ctext(len(train_files), 'cyan'))
    print("#Test:", ctext(len(test_files), 'cyan'))
    # ====== for i-vectors ====== #
    y_train = np.array([fn_label(i[0]) for i in train_files])
    y_test = np.array([fn_label(i[0]) for i in test_files])
    if bool(for_ivec):
        return ds[feat], train_files, y_train, test_files, y_test, labels
    # ====== length ====== #
    length = [(end - start) for _, (start, end) in indices]
    max_length = max(length)
    frame_length = int(max_length * utt_length)
    step_length = frame_length
    print("Max length  :", ctext(max_length, 'yellow'))
    print("Frame length:", ctext(frame_length, 'yellow'))
    print("Step length :", ctext(step_length, 'yellow'))
    # ====== split dataset ====== #
    # split by speaker ID
    train_files, valid_files = train_valid_test_split(
        x=train_files,
        train=0.8,
        cluster_func=None,
        idfunc=lambda x: x[0].split('_')[4],  # split by speaker ID
        inc_test=False)
    print("#File train:", ctext(len(train_files), 'cyan'))
    print("#File valid:", ctext(len(valid_files), 'cyan'))
    print("#File test :", ctext(len(test_files), 'cyan'))

    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=step_length,
                             end='pad',
                             pad_mode='post',
                             pad_value=0),
        F.recipes.Name2Label(converter_func=fn_label),
        F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
    ]
    feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                            ncpu=6,
                            batch_mode='batch')
    feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                            ncpu=4,
                            batch_mode='batch')
    feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                           ncpu=4,
                           batch_mode='file')
    feeder_train.set_recipes(recipes)
    feeder_valid.set_recipes(recipes)
    feeder_test.set_recipes(recipes)
    print(feeder_train)

    # ====== process X_test, y_test in advance for faster evaluation ====== #
    @cache_disk
    def _extract_test_data(feat, label, utt_length):
        prog = Progbar(target=len(feeder_test),
                       print_summary=True,
                       name="Preprocessing test set")
        X_test = defaultdict(list)
        for name, idx, X, y in feeder_test:
            # validate everything as expected
            assert fn_label(name) == np.argmax(y), name  # label is right
            # save to list
            X_test[name].append((idx, X))
            prog.add(X.shape[0])
        # ====== create 1 array for data and dictionary for indices ====== #
        X_test_name = []
        X_test_data = []
        for name, X in X_test.items():
            X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                               axis=0).astype('float16')
            X_test_name += [name + '.%d' % i for i in range(len(X))]
            X_test_data.append(X)
        X_test_name = np.array(X_test_name)
        X_test_data = np.concatenate(X_test_data, axis=0)
        return X_test_name, X_test_data

    # load the cached test segments and derive the true label for each segment
    X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
    X_test_true = np.array([fn_label(i.split('.')[0]) for i in X_test_name])
    return (feeder_train, feeder_valid,
            X_test_name, X_test_true, X_test_data, labels)
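The docstring example covers the x-vector path; below is a complementary sketch of the i-vector branch (an assumption, not part of the original source; FEAT is whatever feature name the surrounding script defines).
# hedged sketch: indices are (name, (start, end)) pairs into the raw feature matrix,
# and labels is assumed to map integer ids back to label strings
X, train_files, y_train, test_files, y_test, labels = prepare_data(
    feat=FEAT, label='gender', for_ivec=True)
name, (start, end) = train_files[0]
utt = X[start:end]  # frames of one training utterance
print(name, utt.shape, labels[int(y_train[0])])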
Example #3
print(' * Speakers:', ctext(report_info(4, valid), 'cyan'))
print(ctext("#File test:", 'yellow'), len(test), test[:2])
# ====== create recipe ====== #
recipes = [
    F.recipes.Slice(slices=slice(40), axis=-1, data_idx=0),
    F.recipes.Sequencing(frame_length=max_length,
                         step_length=1,
                         end='pad',
                         pad_mode='post',
                         pad_value=0,
                         data_idx=None),
    F.recipes.Name2Label(converter_func=f_digits),
    F.recipes.LabelOneHot(nb_classes=len(digits), data_idx=-1),
]
data = [ds[f] for f in FEAT]
train = F.Feeder(F.IndexedData(data=data, indices=train),
                 dtype='float32',
                 ncpu=6,
                 buffer_size=len(digits),
                 batch_mode='batch')
valid = F.Feeder(F.IndexedData(data=data, indices=valid),
                 dtype='float32',
                 ncpu=2,
                 buffer_size=len(digits),
                 batch_mode='batch')
test = F.Feeder(F.IndexedData(data=data, indices=test),
                dtype='float32',
                ncpu=1,
                buffer_size=1,
                batch_mode='file')
train.set_recipes(recipes)
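The excerpt stops after configuring the training feeder; a sketch of the likely continuation (an assumption, not part of the original excerpt) attaches the same recipes to the other feeders so that all three yield identically processed data.
# assumed continuation: valid/test need the same preprocessing recipes as train
valid.set_recipes(recipes)
test.set_recipes(recipes)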
Example #4
def prepare_dnn_data(save_dir,
                     feat_name=None,
                     utt_length=None,
                     seq_mode=None,
                     min_dur=None,
                     min_utt=None,
                     exclude=None,
                     train_proportion=None,
                     return_dataset=False):
    assert os.path.isdir(save_dir), \
        "Path to '%s' is not a directory" % save_dir
    if feat_name is None:
        feat_name = FEATURE_NAME
    if utt_length is None:
        utt_length = int(_args.utt)
    if seq_mode is None:
        seq_mode = str(_args.seq).strip().lower()
    if min_dur is None:
        min_dur = MINIMUM_UTT_DURATION
    if min_utt is None:
        min_utt = MINIMUM_UTT_PER_SPEAKERS
    if exclude is None:
        exclude = str(_args.exclude).strip()
    print("Minimum duration: %s(s)" % ctext(min_dur, 'cyan'))
    print("Minimum utt/spk : %s(utt)" % ctext(min_utt, 'cyan'))
    # ******************** prepare dataset ******************** #
    path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)
    assert os.path.exists(
        path), "Cannot find acoustic dataset at path: %s" % path
    ds = F.Dataset(path=path, read_only=True)
    rand = np.random.RandomState(seed=Config.SUPER_SEED)
    # ====== find the right feature ====== #
    assert feat_name in ds, "Cannot find feature with name: %s" % feat_name
    X = ds[feat_name]
    ids_name = 'indices_%s' % feat_name
    assert ids_name in ds, "Cannot find indices with name: %s" % ids_name
    # ====== basic path ====== #
    path_filtered_data = os.path.join(save_dir, 'filtered_files.pkl')
    path_train_files = os.path.join(save_dir, 'train_files.pkl')
    path_speaker_info = os.path.join(save_dir, 'speaker_info.pkl')
    # ******************** cannot find cached data ******************** #
    if any(not os.path.exists(p)
           for p in [path_filtered_data, path_train_files, path_speaker_info]):
        # ====== exclude some dataset ====== #
        if len(exclude) > 0:
            exclude_dataset = {i: 1 for i in exclude.split(',')}
            print("* Excluded dataset:", ctext(exclude_dataset, 'cyan'))
            indices = {
                name: (start, end)
                for name, (start, end) in ds[ids_name].items()
                if ds['dsname'][name] not in exclude_dataset
            }
            # special case: exclude all noise data (noise utterances have '/' in their name)
            if 'noise' in exclude_dataset:
                indices = {
                    name: (start, end)
                    for name, (start, end) in indices.items()
                    if '/' not in name
                }
        else:
            indices = {i: j for i, j in ds[ids_name].items()}
        # ====== down-sampling if necessary ====== #
        if _args.downsample > 1000:
            dataset2name = defaultdict(list)
            # ordering the indices so we sample the same set every time
            for name in sorted(indices.keys()):
                dataset2name[ds['dsname'][name]].append(name)
            n_total_files = len(indices)
            n_sample_files = int(_args.downsample)
            # get the percentage of each dataset
            dataset2per = {
                i: len(j) / n_total_files
                for i, j in dataset2name.items()
            }
            # sampling based on percentage
            _ = {}
            for dsname, flist in dataset2name.items():
                rand.shuffle(flist)
                n_dataset_files = int(dataset2per[dsname] * n_sample_files)
                _.update({i: indices[i] for i in flist[:n_dataset_files]})
            indices = _
        # ====== * filter out "bad" sample ====== #
        indices = filter_utterances(X=X,
                                    indices=indices,
                                    spkid=ds['spkid'],
                                    min_utt=min_utt,
                                    min_dur=min_dur,
                                    remove_min_length=True,
                                    remove_min_uttspk=True,
                                    n_speakers=None,
                                    ncpu=None,
                                    save_path=path_filtered_data)
        # ====== all training file name ====== #
        # modify here to train full dataset
        all_name = sorted(indices.keys())
        rand.shuffle(all_name)
        rand.shuffle(all_name)
        n_files = len(all_name)
        print("#Files:", ctext(n_files, 'cyan'))
        # ====== speaker mapping ====== #
        name2spk = {name: ds['spkid'][name] for name in all_name}
        all_speakers = sorted(set(name2spk.values()))
        spk2label = {spk: i for i, spk in enumerate(all_speakers)}
        name2label = {name: spk2label[spk] for name, spk in name2spk.items()}
        assert len(name2label) == len(all_name)
        print("#Speakers:", ctext(len(all_speakers), 'cyan'))
        # ====== stratify sampling based on speaker ====== #
        valid_name = []
        # group file names by speaker label
        label2name = defaultdict(list)
        for name, label in sorted(name2label.items(), key=lambda x: x[0]):
            label2name[label].append(name)
        # for each speaker with >= 3 utterances
        for label, name_list in sorted(label2name.items(), key=lambda x: x[0]):
            if len(name_list) < 3:
                continue
            n = max(1, int(0.05 * len(name_list)))  # 5% for validation
            valid_name += rand.choice(a=name_list, size=n,
                                      replace=False).tolist()
        # train list is the rest
        _ = set(valid_name)
        train_name = [i for i in all_name if i not in _]
        # ====== split training and validation ====== #
        train_indices = {name: indices[name] for name in train_name}
        valid_indices = {name: indices[name] for name in valid_name}
        # ====== save cached data ====== #
        with open(path_train_files, 'wb') as fout:
            pickle.dump({'train': train_indices, 'valid': valid_indices}, fout)
        with open(path_speaker_info, 'wb') as fout:
            pickle.dump(
                {
                    'all_speakers': all_speakers,
                    'name2label': name2label,
                    'spk2label': spk2label
                }, fout)
    # ******************** load cached data ******************** #
    else:
        with open(path_train_files, 'rb') as fin:
            obj = pickle.load(fin)
            train_indices = obj['train']
            valid_indices = obj['valid']
        with open(path_speaker_info, 'rb') as fin:
            obj = pickle.load(fin)
            all_speakers = obj['all_speakers']
            name2label = obj['name2label']
            spk2label = obj['spk2label']

    # ******************** print log ******************** #

    def summary_indices(ids):
        datasets = defaultdict(int)
        speakers = defaultdict(list)
        text = ''
        for name in sorted(ids.keys()):
            text += name + str(ids[name])
            dsname = ds['dsname'][name]
            datasets[dsname] += 1
            speakers[dsname].append(ds['spkid'][name])
        for dsname in sorted(datasets.keys()):
            print('  %-18s: %s(utt) %s(spk)' %
                  (dsname, ctext('%6d' % datasets[dsname], 'cyan'),
                   ctext(len(set(speakers[dsname])), 'cyan')))
        print('  MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))

    # ====== training files ====== #
    print(
        "#Train files:", ctext('%-8d' % len(train_indices), 'cyan'), "#spk:",
        ctext(len(set(name2label[name] for name in train_indices.keys())),
              'cyan'), "#noise:",
        ctext(len([name for name in train_indices.keys() if '/' in name]),
              'cyan'))
    summary_indices(ids=train_indices)
    # ====== valid files ====== #
    print(
        "#Valid files:", ctext('%-8d' % len(valid_indices), 'cyan'), "#spk:",
        ctext(len(set(name2label[name] for name in valid_indices.keys())),
              'cyan'), "#noise:",
        ctext(len([name for name in valid_indices.keys() if '/' in name]),
              'cyan'))
    summary_indices(ids=valid_indices)
    # ******************** create the recipe ******************** #
    assert all(name in name2label for name in train_indices.keys())
    assert all(name in name2label for name in valid_indices.keys())
    recipes = prepare_dnn_feeder_recipe(name2label=name2label,
                                        n_speakers=len(all_speakers),
                                        utt_length=utt_length,
                                        seq_mode=seq_mode)
    # ====== downsample the training set for analysis if required ====== #
    if train_proportion is not None:
        assert 0 < train_proportion < 1
        n_training = len(train_indices)
        train_indices = list(train_indices.items())
        rand.shuffle(train_indices)
        rand.shuffle(train_indices)
        train_indices = dict(train_indices[:int(n_training *
                                                train_proportion)])
    # ====== create feeder ====== #
    train_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=train_indices),
                            batch_mode='batch',
                            ncpu=NCPU,
                            buffer_size=256)

    valid_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=valid_indices),
                            batch_mode='batch',
                            ncpu=max(2, NCPU // 4),
                            buffer_size=64)

    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    print(valid_feeder)
    # ====== debugging ====== #
    if IS_DEBUGGING:
        import matplotlib
        matplotlib.use('Agg')
        prog = Progbar(target=len(valid_feeder),
                       print_summary=True,
                       name="Iterating validation set")
        samples = []
        n_visual = 250
        for name, idx, X, y in valid_feeder.set_batch(batch_size=100000,
                                                      batch_mode='file',
                                                      seed=None,
                                                      shuffle_level=0):
            assert idx == 0, "Utterances longer than %.2f(sec)" % (
                100000 * Config.STEP_LENGTH)
            prog['X'] = X.shape
            prog['y'] = y.shape
            prog.add(X.shape[0])
            # random sampling
            if rand.rand(1) < 0.5 and len(samples) < n_visual:
                for i in rand.randint(0, X.shape[0], size=4, dtype='int32'):
                    samples.append((name, X[i], np.argmax(y[i], axis=-1)))
        # plot the spectrogram
        n_visual = len(samples)
        V.plot_figure(nrow=n_visual, ncol=8)
        for i, (name, X, y) in enumerate(samples):
            is_noise = '/' in name
            assert name2label[
                name] == y, "Speaker label mismatch for file: %s" % name
            name = name.split('/')[0]
            dsname = ds['dsname'][name]
            spkid = ds['spkid'][name]
            ax = V.plot_spectrogram(X.T,
                                    ax=(n_visual, 1, i + 1),
                                    title='#%d' % (i + 1))
            ax.set_title(
                '[%s][%s]%s  %s' %
                ('noise' if is_noise else 'clean', dsname, name, spkid),
                fontsize=6)
        # low resolution is enough for a quick visual check
        V.plot_save('/tmp/tmp.pdf', dpi=12)
        exit()
    # ====== return ====== #
    if bool(return_dataset):
        return train_feeder, valid_feeder, all_speakers, ds
    return train_feeder, valid_feeder, all_speakers
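A minimal usage sketch (an assumption, not part of the original source): '/tmp/xvec' is a hypothetical, pre-created save directory, and the defaults fall back to the script's command-line configuration (_args) and module constants.
# hedged usage sketch: argument values are illustrative only
train, valid, speakers = prepare_dnn_data(save_dir='/tmp/xvec')
print("#speakers:", len(speakers))
for X, y in train.set_batch(batch_size=32):
    break  # X: acoustic frames, y: one-hot speaker labels (assumed recipe output)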
Example #5
 if os.path.exists(vector_outpath):
     with open(vector_outpath, 'rb') as f:
         vectors = pickle.load(f)
         if (len(vectors['name']) == len(vectors['y']) == len(
                 vectors['path']) == len(vectors['X']) <= n_files):
             all_vectors[dsname] = vectors
             print(' - Loaded vectors at:',
                   ctext(vector_outpath, 'yellow'))
             if len(vectors['name']) != n_files:
                 print(
                     '    [WARNING] Extracted scores only for: %s/%s (files)'
                     % (ctext(len(vectors['name']),
                              'lightcyan'), ctext(n_files, 'cyan')))
             continue  # skip the calculation
 # ====== create feeder ====== #
 feeder = F.Feeder(data_desc=F.IndexedData(data=ds_feat,
                                           indices=ds_indices),
                   batch_mode='file',
                   ncpu=8)
 feeder.set_recipes(recipe)
 # ====== init ====== #
 output_name = []
 output_meta = []
 output_path = []
 output_data = []
 # progress bar
 prog = Progbar(target=len(feeder),
                print_summary=True,
                name='Extract vectors: %s' % dsname)
 # ====== make prediction ====== #
 for batch_idx, (name, idx, X) in enumerate(
         feeder.set_batch(batch_size=100000, seed=None,