def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """Create the train/valid feeders and a cached, pre-segmented test set.

    The test utterances are cut into fixed-length segments once and stored
    under `PATH_EXP` (a `.dat` memory-mapped array plus a pickled `.ids`
    index); later calls reuse the cache when it still matches the test
    files of the dataset. A PDF with a few sample spectrograms is also
    written to `PATH_EXP`.

    Parameters
    ----------
    recipe : str
        name of the dataset directory inside `PATH_ACOUSTIC_FEAT`
    feat : str
        name of the feature array to load from the dataset
    utt_length : number
        utterance length, divided by `FRAME_SHIFT` to obtain the number of
        frames in one segment
    seed : int
        seed forwarded to `train_valid_test_split`

    Return
    ------
    train_feeder : Feeder for training
    valid_feeder : Feeder for validating
    test_ids : dict mapping test file name -> (start, end) rows in `test_dat`
    test_dat : read-only array of the segmented test data
    all_speakers : sorted list of all speaker in training set
    """
    # ====== load dataset ====== #
    frame_length = int(utt_length / FRAME_SHIFT)
    ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
    X = ds[feat]
    train_indices = {name: ds['indices'][name] for name in TRAIN_DATA.keys()}
    # everything that is not a known training file is used for testing
    test_indices = {
        name: start_end
        for name, start_end in ds['indices'].items()
        if name not in TRAIN_DATA
    }
    train_indices, valid_indices = train_valid_test_split(
        x=list(train_indices.items()),
        train=0.9,
        inc_test=False,
        seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    # NOTE(review): assumes speaker labels are non-negative integers, the
    # one-hot size is derived from the largest label -- confirm
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Valid files:", ctext(len(valid_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    # ====== create the feeders ====== #
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]
    train_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=train_indices),
        batch_mode='batch',
        ncpu=7,
        buffer_size=12)
    valid_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=valid_indices),
        batch_mode='batch',
        ncpu=2,
        buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_dat = os.path.join(PATH_EXP,
                             'test_%s_%d.dat' % (feat, int(utt_length)))
    cache_ids = os.path.join(PATH_EXP,
                             'test_%s_%d.ids' % (feat, int(utt_length)))
    # validate cache files: the cached index must describe exactly the
    # current test files. Comparing the names (not only the count) rejects a
    # cache created for another test set that happens to have the same size,
    # and an unreadable index (e.g. interrupted write) is rebuilt instead of
    # crashing every following run.
    cache_is_valid = False
    if os.path.exists(cache_ids):
        try:
            with open(cache_ids, 'rb') as f:
                ids = pickle.load(f)
        except (EOFError, pickle.UnpicklingError):
            cache_is_valid = False
        else:
            cache_is_valid = set(ids) == set(test_indices)
    if not cache_is_valid:
        # the index and the data are only meaningful together
        for stale_path in (cache_ids, cache_dat):
            if os.path.exists(stale_path):
                os.remove(stale_path)
    # caching
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat,
                         dtype='float16',
                         shape=(0, frame_length, X.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        s = 0  # first free row of the cache array
        for name, (start, end) in test_indices.items():
            y = X[start:end]
            y = segment_axis(y,
                             axis=0,
                             frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post')
            dat.append(y)
            # update indices
            ids[name] = (s, s + len(y))
            s += len(y)
            # update progress
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)
    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)
    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP,
                               'test_%s_%d.pdf' % (feat, int(utt_length)))
    V.plot_figure(nrow=9, ncol=6)
    # NOTE(review): the sampling seed is hard-coded and does not follow the
    # `seed` argument -- confirm this is intentional
    for i, (name, (start, end)) in enumerate(
            sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                          k=12,
                          seed=87654321)):
        x = dat[start:end][:].astype('float32')
        # show one randomly picked segment of the utterance
        ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                                ax=(12, 1, i + 1),
                                title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
    """Load the acoustic dataset and prepare it for i-vector or DNN training.

    Parameters
    ----------
    feat : str
        name of the feature stored in the dataset at `PATH_ACOUSTIC`
    label : str
        name of the classification task, must be a key of `_support_label`
        (it selects which '_'-separated field of the file name is the label)
    utt_length : float in (0, 1]
        segment length as a fraction of the longest utterance (in frames)
    for_ivec : bool
        if True, return the raw data for i-vector training and skip the
        feeder creation

    Returns (i-vector)
    ------------------
    ds[feat]
    train_files
    y_train
    test_files
    y_test
    labels

    Returns (x-vector)
    ------------------
    train : Feeder
        feeder for training data for iterating over pair of (X, y)
    valid : Feeder
        feeder for validating data for iterating over pair of (X, y)
    X_test_name : list of file names
        file names are append with '.%d' for cut segment ID
    X_test_true : list of integer
        label of each sample
    X_test_data : array
        list of test data same length as X_test_name
    labels : list of string
        list of labels for classification task

    Example
    -------
    (train, valid,
     X_test_name, X_test_true, X_test_data,
     labels) = prepare_data(feat=FEAT, label='gender')
    """
    label = str(label).lower()
    assert label in _support_label, "No support for label: %s" % label
    assert 0 < utt_length <= 1.
    # ====== load dataset ====== #
    if not os.path.exists(PATH_ACOUSTIC):
        raise RuntimeError(
            "Cannot find extracted acoustic features at path: '%s',"
            "run the code speech_features_extraction.py!" % PATH_ACOUSTIC)
    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    assert feat in ds, "Cannot find feature with name: %s" % feat
    indices = list(ds['indices'].items())
    K.get_rng().shuffle(indices)

    # ====== helper ====== #
    def is_train(x):
        # the first '_'-separated field of the file name marks the partition
        return x.split('_')[0] == 'train'

    def extract_label(x):
        return x.split('_')[_support_label[label]]

    print("Task:", ctext(label, 'cyan'))
    fn_label, labels = unique_labels([i[0] for i in indices],
                                     key_func=extract_label,
                                     return_labels=True)
    print("Labels:", ctext(labels, 'cyan'))
    # ====== training and test data ====== #
    train_files = []  # (name, (start, end)) ...
    test_files = []
    for name, (start, end) in indices:
        if is_train(name):
            train_files.append((name, (start, end)))
        else:
            test_files.append((name, (start, end)))
    print("#Train:", ctext(len(train_files), 'cyan'))
    print("#Test:", ctext(len(test_files), 'cyan'))
    # ====== for i-vectors ====== #
    y_train = np.array([fn_label(i[0]) for i in train_files])
    y_test = np.array([fn_label(i[0]) for i in test_files])
    if bool(for_ivec):
        return ds[feat], train_files, y_train, test_files, y_test, labels
    # ====== length ====== #
    length = [(end - start) for _, (start, end) in indices]
    max_length = max(length)
    frame_length = int(max_length * utt_length)
    step_length = frame_length  # non-overlapping segments
    print("Max length :", ctext(max_length, 'yellow'))
    print("Frame length:", ctext(frame_length, 'yellow'))
    print("Step length :", ctext(step_length, 'yellow'))
    # ====== split dataset ====== #
    # split by speaker ID
    train_files, valid_files = train_valid_test_split(
        x=train_files,
        train=0.8,
        cluster_func=None,
        idfunc=lambda x: x[0].split('_')[4],  # splited by speaker
        inc_test=False)
    print("#File train:", ctext(len(train_files), 'cyan'))
    print("#File valid:", ctext(len(valid_files), 'cyan'))
    print("#File test :", ctext(len(test_files), 'cyan'))
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=step_length,
                             end='pad',
                             pad_mode='post',
                             pad_value=0),
        F.recipes.Name2Label(converter_func=fn_label),
        F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
    ]
    feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                            ncpu=6,
                            batch_mode='batch')
    feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                            ncpu=4,
                            batch_mode='batch')
    feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                           ncpu=4,
                           batch_mode='file')
    feeder_train.set_recipes(recipes)
    feeder_valid.set_recipes(recipes)
    feeder_test.set_recipes(recipes)
    print(feeder_train)

    # ====== process X_test, y_test in advance for faster evaluation ====== #
    @cache_disk
    def _extract_test_data(feat, label, utt_length):
        # the arguments are not used in the body; presumably they only
        # identify the cached result for `cache_disk` -- TODO confirm
        prog = Progbar(target=len(feeder_test),
                       print_summary=True,
                       name="Preprocessing test set")
        X_test = defaultdict(list)
        for name, idx, X, y in feeder_test:
            # validate everything as expected
            assert fn_label(name) == np.argmax(y), name  # label is right
            # save to list
            X_test[name].append((idx, X))
            prog.add(X.shape[0])
        # ====== create 1 array for data and dictionary for indices ====== #
        X_test_name = []
        X_test_data = []
        for name, chunks in X_test.items():
            # restore the original order of the chunks of one file
            segments = np.concatenate(
                [x[1] for x in sorted(chunks, key=lambda i: i[0])],
                axis=0).astype('float16')
            X_test_name += [name + '.%d' % i for i in range(len(segments))]
            X_test_data.append(segments)
        X_test_name = np.array(X_test_name)
        X_test_data = np.concatenate(X_test_data, axis=0)
        return X_test_name, X_test_data

    # NOTE: the test data is returned as stored by `_extract_test_data`
    # (float16), no conversion back to float32 is performed here
    X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
    # strip only the trailing '.<segment id>' so a file name that itself
    # contains a dot is not truncated before the label is looked up
    X_test_true = np.array(
        [fn_label(i.rsplit('.', 1)[0]) for i in X_test_name])
    return feeder_train, feeder_valid, \
        X_test_name, X_test_true, X_test_data, labels
print(' * Speakers:', ctext(report_info(4, valid), 'cyan')) print(ctext("#File test:", 'yellow'), len(test), test[:2]) # ====== create recipe ====== # recipes = [ F.recipes.Slice(slices=slice(40), axis=-1, data_idx=0), F.recipes.Sequencing(frame_length=max_length, step_length=1, end='pad', pad_mode='post', pad_value=0, data_idx=None), F.recipes.Name2Label(converter_func=f_digits), F.recipes.LabelOneHot(nb_classes=len(digits), data_idx=-1), ] data = [ds[f] for f in FEAT] train = F.Feeder(F.IndexedData(data=data, indices=train), dtype='float32', ncpu=6, buffer_size=len(digits), batch_mode='batch') valid = F.Feeder(F.IndexedData(data=data, indices=valid), dtype='float32', ncpu=2, buffer_size=len(digits), batch_mode='batch') test = F.Feeder(F.IndexedData(data=data, indices=test), dtype='float32', ncpu=1, buffer_size=1, batch_mode='file') train.set_recipes(recipes)
def prepare_dnn_data(save_dir, feat_name=None, utt_length=None, seq_mode=None,
                     min_dur=None, min_utt=None, exclude=None,
                     train_proportion=None, return_dataset=False):
    """Create the training and validation feeders for the speaker network.

    The list of training/validation files and the speaker mapping are
    computed once and pickled into `save_dir`; when all cache files exist
    they are loaded instead of being recomputed.

    Parameters
    ----------
    save_dir : str
        existing directory holding the cached file lists and speaker info
    feat_name : str or None
        feature name, `FEATURE_NAME` if None
    utt_length : int or None
        utterance length forwarded to `prepare_dnn_feeder_recipe`,
        `int(_args.utt)` if None
    seq_mode : str or None
        sequencing mode forwarded to `prepare_dnn_feeder_recipe`,
        taken from `_args.seq` if None
    min_dur : number or None
        minimum duration passed to `filter_utterances`,
        `MINIMUM_UTT_DURATION` if None
    min_utt : number or None
        minimum utterances per speaker passed to `filter_utterances`,
        `MINIMUM_UTT_PER_SPEAKERS` if None
    exclude : str or None
        comma separated dataset names to leave out, taken from
        `_args.exclude` if None; 'noise' additionally drops every file
        with '/' in its name
    train_proportion : float in (0, 1) or None
        if given, keep only this fraction of the training files
    return_dataset : bool
        if True, the opened dataset is appended to the returned tuple

    Returns
    -------
    train_feeder, valid_feeder, all_speakers
    (plus `ds` as the last element when `return_dataset` is True)
    """
    assert os.path.isdir(save_dir), \
        "Path to '%s' is not a directory" % save_dir
    # ====== fill in the defaults ====== #
    feat_name = FEATURE_NAME if feat_name is None else feat_name
    utt_length = int(_args.utt) if utt_length is None else utt_length
    seq_mode = (str(_args.seq).strip().lower()
                if seq_mode is None else seq_mode)
    min_dur = MINIMUM_UTT_DURATION if min_dur is None else min_dur
    min_utt = MINIMUM_UTT_PER_SPEAKERS if min_utt is None else min_utt
    exclude = str(_args.exclude).strip() if exclude is None else exclude
    print("Minimum duration: %s(s)" % ctext(min_dur, 'cyan'))
    print("Minimum utt/spk : %s(utt)" % ctext(min_utt, 'cyan'))
    # ******************** prepare dataset ******************** #
    path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)
    assert os.path.exists(
        path), "Cannot find acoustic dataset at path: %s" % path
    ds = F.Dataset(path=path, read_only=True)
    # a single generator drives every random decision below, so the order
    # of the calls on it must not change
    rand = np.random.RandomState(seed=Config.SUPER_SEED)
    # ====== find the right feature ====== #
    assert feat_name in ds, "Cannot find feature with name: %s" % feat_name
    X = ds[feat_name]
    ids_name = 'indices_%s' % feat_name
    assert ids_name in ds, "Cannot find indices with name: %s" % ids_name
    # ====== basic path ====== #
    path_filtered_data = os.path.join(save_dir, 'filtered_files.pkl')
    path_train_files = os.path.join(save_dir, 'train_files.pkl')
    path_speaker_info = os.path.join(save_dir, 'speaker_info.pkl')
    cache_paths = [path_filtered_data, path_train_files, path_speaker_info]
    # ******************** cannot find cached data ******************** #
    if any(not os.path.exists(p) for p in cache_paths):
        # ====== exclude some dataset ====== #
        if len(exclude) > 0:
            exclude_dataset = {i: 1 for i in exclude.split(',')}
            print("* Excluded dataset:", ctext(exclude_dataset, 'cyan'))
            indices = {}
            for name, (start, end) in ds[ids_name].items():
                if ds['dsname'][name] not in exclude_dataset:
                    indices[name] = (start, end)
            # special case exclude all the noise data
            if 'noise' in exclude_dataset:
                indices = {
                    name: (start, end)
                    for name, (start, end) in indices.items()
                    if '/' not in name
                }
        else:
            indices = dict(ds[ids_name].items())
        # ====== down-sampling if necessary ====== #
        if _args.downsample > 1000:
            dataset2name = defaultdict(list)
            # ordering the indices so we sample the same set every time
            for name in sorted(indices.keys()):
                dataset2name[ds['dsname'][name]].append(name)
            n_total_files = len(indices)
            n_sample_files = int(_args.downsample)
            # sampling each dataset based on its percentage of all files
            sampled = {}
            for dsname, flist in dataset2name.items():
                percentage = len(flist) / n_total_files
                rand.shuffle(flist)
                n_dataset_files = int(percentage * n_sample_files)
                for name in flist[:n_dataset_files]:
                    sampled[name] = indices[name]
            indices = sampled
        # ====== * filter out "bad" sample ====== #
        indices = filter_utterances(X=X,
                                    indices=indices,
                                    spkid=ds['spkid'],
                                    min_utt=min_utt,
                                    min_dur=min_dur,
                                    remove_min_length=True,
                                    remove_min_uttspk=True,
                                    n_speakers=None,
                                    ncpu=None,
                                    save_path=path_filtered_data)
        # ====== all training file name ====== #
        # modify here to train full dataset
        all_name = sorted(indices.keys())
        for _ in range(2):
            rand.shuffle(all_name)
        print("#Files:", ctext(len(all_name), 'cyan'))
        # ====== speaker mapping ====== #
        name2spk = {name: ds['spkid'][name] for name in all_name}
        all_speakers = sorted(set(name2spk.values()))
        spk2label = {spk: i for i, spk in enumerate(all_speakers)}
        name2label = {name: spk2label[spk] for name, spk in name2spk.items()}
        assert len(name2label) == len(all_name)
        print("#Speakers:", ctext(len(all_speakers), 'cyan'))
        # ====== stratify sampling based on speaker ====== #
        # create speakers' cluster
        label2name = defaultdict(list)
        for name in sorted(name2label):
            label2name[name2label[name]].append(name)
        valid_name = []
        for label in sorted(label2name):
            name_list = label2name[label]
            # only speakers with >= 3 utterances contribute to validation
            if len(name_list) >= 3:
                n = max(1, int(0.05 * len(name_list)))  # 5% for validation
                picked = rand.choice(a=name_list, size=n, replace=False)
                valid_name += picked.tolist()
        # train list is the rest
        valid_set = set(valid_name)
        train_name = [name for name in all_name if name not in valid_set]
        # ====== split training and validation ====== #
        train_indices = {name: indices[name] for name in train_name}
        valid_indices = {name: indices[name] for name in valid_name}
        # ====== save cached data ====== #
        with open(path_train_files, 'wb') as f:
            pickle.dump({'train': train_indices, 'valid': valid_indices}, f)
        with open(path_speaker_info, 'wb') as f:
            pickle.dump(
                {
                    'all_speakers': all_speakers,
                    'name2label': name2label,
                    'spk2label': spk2label
                }, f)
    # ******************** load cached data ******************** #
    else:
        with open(path_train_files, 'rb') as f:
            cached = pickle.load(f)
        train_indices = cached['train']
        valid_indices = cached['valid']
        with open(path_speaker_info, 'rb') as f:
            cached = pickle.load(f)
        all_speakers = cached['all_speakers']
        name2label = cached['name2label']
        spk2label = cached['spk2label']

    # ******************** print log ******************** #
    def summary_indices(ids):
        # per dataset: the speaker of every utterance, in sorted name order
        speakers = defaultdict(list)
        ordered = sorted(ids.keys())
        for name in ordered:
            speakers[ds['dsname'][name]].append(ds['spkid'][name])
        for dsname in sorted(speakers.keys()):
            print(' %-18s: %s(utt) %s(spk)' %
                  (dsname, ctext('%6d' % len(speakers[dsname]), 'cyan'),
                   ctext(len(set(speakers[dsname])), 'cyan')))
        # the checksum covers both the names and their (start, end) indices
        text = ''.join(name + str(ids[name]) for name in ordered)
        print(' MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))

    def report(title, ids):
        n_spk = len(set(name2label[name] for name in ids.keys()))
        n_noise = sum(1 for name in ids.keys() if '/' in name)
        print(title, ctext('%-8d' % len(ids), 'cyan'),
              "#spk:", ctext(n_spk, 'cyan'),
              "#noise:", ctext(n_noise, 'cyan'))
        summary_indices(ids=ids)

    # ====== training files ====== #
    report("#Train files:", train_indices)
    # ====== valid files ====== #
    report("#Valid files:", valid_indices)
    # ******************** create the recipe ******************** #
    assert all(name in name2label for name in train_indices.keys())
    assert all(name in name2label for name in valid_indices.keys())
    recipes = prepare_dnn_feeder_recipe(name2label=name2label,
                                        n_speakers=len(all_speakers),
                                        utt_length=utt_length,
                                        seq_mode=seq_mode)
    # ====== downsample training set for analyzing if required ====== #
    if train_proportion is not None:
        assert 0 < train_proportion < 1
        n_training = len(train_indices)
        train_indices = list(train_indices.items())
        for _ in range(2):
            rand.shuffle(train_indices)
        n_kept = int(n_training * train_proportion)
        train_indices = dict(train_indices[:n_kept])
    # ====== create feeder ====== #
    train_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=train_indices),
        batch_mode='batch',
        ncpu=NCPU,
        buffer_size=256)
    valid_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=valid_indices),
        batch_mode='batch',
        ncpu=max(2, NCPU // 4),
        buffer_size=64)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    print(valid_feeder)
    # ====== debugging ====== #
    if IS_DEBUGGING:
        import matplotlib
        matplotlib.use('Agg')
        prog = Progbar(target=len(valid_feeder),
                       print_summary=True,
                       name="Iterating validation set")
        samples = []
        n_visual = 250
        batches = valid_feeder.set_batch(batch_size=100000,
                                         batch_mode='file',
                                         seed=None,
                                         shuffle_level=0)
        for name, idx, feats, onehot in batches:
            assert idx == 0, "Utterances longer than %.2f(sec)" % (
                100000 * Config.STEP_LENGTH)
            prog['X'] = feats.shape
            prog['y'] = onehot.shape
            prog.add(feats.shape[0])
            # random sampling
            if rand.rand(1) < 0.5 and len(samples) < n_visual:
                for i in rand.randint(0, feats.shape[0], size=4,
                                      dtype='int32'):
                    samples.append(
                        (name, feats[i], np.argmax(onehot[i], axis=-1)))
        # plot the spectrogram
        n_visual = len(samples)
        V.plot_figure(nrow=n_visual, ncol=8)
        for i, (name, spec, spk_label) in enumerate(samples):
            is_noise = '/' in name
            assert name2label[
                name] == spk_label, "Speaker label mismatch for file: %s" % name
            # the part before '/' is used to look up the dataset and speaker
            name = name.split('/')[0]
            dsname = ds['dsname'][name]
            spkid = ds['spkid'][name]
            ax = V.plot_spectrogram(spec.T,
                                    ax=(n_visual, 1, i + 1),
                                    title='#%d' % (i + 1))
            ax.set_title(
                '[%s][%s]%s %s' %
                ('noise' if is_noise else 'clean', dsname, name, spkid),
                fontsize=6)
        # don't need to be high resolutions
        V.plot_save('/tmp/tmp.pdf', dpi=12)
        exit()
    # ====== return ====== #
    if bool(return_dataset):
        return train_feeder, valid_feeder, all_speakers, ds
    return train_feeder, valid_feeder, all_speakers
if os.path.exists(vector_outpath): with open(vector_outpath, 'rb') as f: vectors = pickle.load(f) if (len(vectors['name']) == len(vectors['y']) == len( vectors['path']) == len(vectors['X']) <= n_files): all_vectors[dsname] = vectors print(' - Loaded vectors at:', ctext(vector_outpath, 'yellow')) if len(vectors['name']) != n_files: print( ' [WARNING] Extracted scores only for: %s/%s (files)' % (ctext(len(vectors['name']), 'lightcyan'), ctext(n_files, 'cyan'))) continue # skip the calculation # ====== create feeder ====== # feeder = F.Feeder(data_desc=F.IndexedData(data=ds_feat, indices=ds_indices), batch_mode='file', ncpu=8) feeder.set_recipes(recipe) # ====== init ====== # output_name = [] output_meta = [] output_path = [] output_data = [] # progress bar prog = Progbar(target=len(feeder), print_summary=True, name='Extract vectors: %s' % dsname) # ====== make prediction ====== # for batch_idx, (name, idx, X) in enumerate( feeder.set_batch(batch_size=100000, seed=None,