def get_meld_utterances(speech_dir, text_dir):
    """Collect MELD utterances with their emotion label and optional vectors.

    Utterance ids come from ``MELD_PATH/utt2spk``; the emotion is the part of
    the id that precedes the first ``-``.

    Args:
        speech_dir: Directory containing ``meld/xvector.scp``, or the string
            ``'none'`` to skip loading speech vectors.
        text_dir: Directory containing ``meld_embedding.pkl``, or the string
            ``'none'`` to skip loading text embeddings.

    Returns:
        A ``defaultdict(dict)`` mapping utterance id to a dict holding
        ``'emotion'`` and, when requested, ``'speech'`` and ``'text'``.

    Raises:
        Exception: If ``utt2spk`` lists an id twice, or a speech/text vector
            refers to an id that ``utt2spk`` does not list.
    """
    utterances = defaultdict(dict)

    with open('%s/utt2spk' % (MELD_PATH), 'r') as utt2spk:
        for entry in utt2spk.readlines():
            utt_id, _ = entry.split(' ')
            if utt_id in utterances:
                raise Exception("Duplicate utterance: %s" % utt_id)
            utterances[utt_id]['emotion'] = utt_id.split('-')[0]

    if speech_dir != 'none':
        with ReadHelper('scp:%s/meld/xvector.scp' % speech_dir) as xvectors:
            for utt_id, vector in xvectors:
                if utt_id not in utterances:
                    raise Exception(
                        "Speech vector for unknown utterance: %s" % utt_id)
                utterances[utt_id]['speech'] = vector

    if text_dir != 'none':
        with open('%s/meld_embedding.pkl' % text_dir, 'rb') as pkl:
            meld = pickle.load(pkl)
        for _, row in meld.iterrows():
            utt_id = row['ID']
            embedding = row['Text Embeddings']
            if utt_id not in utterances:
                raise Exception(
                    "Text vector for unknown utterance: %s" % utt_id)
            utterances[utt_id]['text'] = embedding

    return utterances
def par_core_extractXvectors(inFeatsScp, outXvecArk, outXvecScp, net, layerName):
    """Extract the activations of one layer for every utterance in a feats.scp.

    To be called using pytorch multiprocessing.

    Utterances are read from ``inFeatsScp`` one at a time, pushed through
    ``net`` on the GPU, and the output captured from ``layerName`` is written
    to ``outXvecArk`` / ``outXvecScp`` under the same key.

    Args:
        inFeatsScp: Path of the input feature scp file.
        outXvecArk: Path of the output ark file.
        outXvecScp: Path of the output scp file.
        net: Model; called as ``net(x=..., eps=0)``.
        layerName: Attribute path of the layer (relative to ``net``) whose
            forward output is stored as the x-vector.
    """
    activation = {}

    def get_activation(name):
        def hook(model, input, output):
            activation[name] = output.detach()
        return hook

    # NOTE(review): eval() on layerName executes arbitrary code; it is kept so
    # that every expression the original accepted still works. Only pass
    # trusted layer names.
    hook_handle = eval(
        'net.%s.register_forward_hook(get_activation(layerName))' % layerName)

    try:
        with kaldi_python_io.ArchiveWriter(outXvecArk, outXvecScp,
                                           matrix=False) as writer:
            # no_grad: only the detached activation is used, so building an
            # autograd graph for every utterance is wasted memory.
            with ReadHelper('scp:%s' % inFeatsScp) as reader, torch.no_grad():
                for key, mat in reader:
                    net(x=torch.Tensor(mat).permute(1, 0).unsqueeze(0).cuda(),
                        eps=0)
                    writer.write(
                        key,
                        np.squeeze(activation[layerName].cpu().numpy()))
    finally:
        # Without this, every call leaves another hook attached to the layer.
        hook_handle.remove()
def main():
    """Apply SpecAugment to every utterance of a Kaldi feature directory.

    Reads ``feats.scp`` from the first entry of ``args.spec_feat_dir`` and
    writes the augmented features to ``feats_spec.ark`` / ``feats_spec.scp``
    in the same directory. Each utterance is written as soon as it has been
    augmented, so memory use does not grow with the size of the data set.
    """
    args = parse()
    spec_time_warp = args.spec_time_warp
    spec_freq_mask_width = args.spec_freq_mask_width
    spec_time_mask_width = args.spec_time_mask_width
    spec_num_freq_masks = args.spec_num_freq_masks
    spec_num_time_masks = args.spec_num_time_masks
    spec_time_mask_bound_ratio = args.spec_time_mask_bound_ratio
    spec_replace_with_zero = args.spec_replace_with_zero

    featdir = args.spec_feat_dir[0]
    featscp = os.path.join(featdir, 'feats.scp')

    # One scp line per utterance; only the count is needed for the bar.
    with open(featscp) as f:
        num_utts = sum(1 for _ in f)

    wspecifier = ('ark,scp:' + featdir + '/feats_spec.ark,' + featdir +
                  '/feats_spec.scp')
    with tqdm(total=num_utts) as pbar, \
            ReadHelper('scp:' + featscp) as reader, \
            WriteHelper(wspecifier) as writer:
        for key, mat in reader:
            spec_feat = specaug(torch.from_numpy(mat), spec_time_warp,
                                spec_freq_mask_width, spec_time_mask_width,
                                spec_num_freq_masks, spec_num_time_masks,
                                spec_time_mask_bound_ratio,
                                spec_replace_with_zero)
            writer(key, spec_feat.numpy())
            pbar.update(1)
def ReadXvecs(rspec):
    """Load every entry of a Kaldi rspecifier into a dict.

    Args:
        rspec: Read specifier passed unchanged to ``ReadHelper``.

    Returns:
        Dict mapping utterance id to the vector read for it. When an id
        occurs more than once, the last occurrence wins.
    """
    # The context manager closes the reader; no explicit close() is needed.
    with ReadHelper(rspec) as reader:
        return {utid: xvec for utid, xvec in reader}
def main():
    """Convert the Kaldi fMLLR TIMIT archives into ``.npy`` files plus a CSV index.

    For every set in ``SETS`` the ten archives
    ``<TIMIT_PATH>/<set>/data/feats_fmllr_<set>.<1..10>.ark`` are read, each
    utterance is saved as float32 ``<OUTPUT_DIR>/<set>/<key>.npy`` (with ``_``
    in the set name replaced by ``-``), and ``<OUTPUT_DIR>/<set>.csv`` lists
    the files sorted by decreasing length.

    Exits with status 1 when ``KALDI_ROOT`` or ``TIMIT_PATH`` is not a
    directory, and with status 0 once all sets are written.
    """
    import sys

    if not os.path.isdir(KALDI_ROOT):
        print('CHANGE THIS TO YOUR OWN KALDI ROOT: ', KALDI_ROOT)
        sys.exit(1)
    if not os.path.isdir(TIMIT_PATH):
        print('Invalid path for the kaldi TIMIT dataset: ', TIMIT_PATH)
        print('Please run the kaldi scripts first! More information are described in the README file and Wiki page.')
        # Without this exit the script fell through and failed later while
        # opening the first archive.
        sys.exit(1)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # read data from the preprocessed kaldi directory
    for s in SETS:
        output = {}
        print('Preprocessing', s, 'data...')
        set_name = s.replace('_', '-')
        cur_dir = os.path.join(OUTPUT_DIR, set_name)
        os.makedirs(cur_dir, exist_ok=True)

        for i in range(10):
            ark_path = os.path.join(
                TIMIT_PATH, s, 'data',
                'feats_fmllr_' + s + '.' + str(i + 1) + '.ark')
            with ReadHelper('ark:' + ark_path) as reader:
                for key, array in tqdm(reader):
                    array = np.asarray(array).astype('float32')
                    np.save(os.path.join(cur_dir, key), array)
                    output[os.path.join(set_name, key + '.npy')] = len(array)

        # Longest utterances first.
        output = sorted(output.items(), key=operator.itemgetter(1),
                        reverse=True)
        df = pd.DataFrame(data={'file_path': [fp for fp, _ in output],
                                'length': [length for _, length in output],
                                'label': 'None'})
        df.to_csv(os.path.join(OUTPUT_DIR, set_name + '.csv'))

    print('[ARK-TO-TIMIT] - All done, saved at \'' + str(OUTPUT_DIR) + '\', exit.')
    sys.exit()
def __init__(self, vad_rspec, reg_exp):
    """Load per-fragment VAD values and join them into one array per session.

    Each utterance id is matched with ``reg_exp`` to obtain
    ``(session+index, speaker)``; the first group is matched with ``reg_exp``
    again to obtain ``(session, index)``. Fragments are collected under the
    key ``"<session>-<speaker>"`` and concatenated with ``np.hstack`` into
    ``self.data``.

    Args:
        vad_rspec: Read specifier passed to ``ReadHelper``.
        reg_exp: Compiled regular expression with at least two groups, used
            for both matching steps.

    Raises:
        AssertionError: If an id does not match ``reg_exp``, if the first
            fragment seen for a session does not have index 1, or if an index
            is smaller than the previous one.
    """
    data = dict()
    # Index of the previously read fragment; reset whenever a new session
    # starts. NOTE(review): a single counter is shared by all sessions, so
    # this presumably relies on each session's fragments being contiguous in
    # the input - confirm.
    prev = -1
    with ReadHelper(vad_rspec) as reader:
        for utid, prob in reader:
            result = reg_exp.match(utid)
            assert result is not None, 'Wrong utterance ID format: \"{}\"'.format(
                utid)
            sess_indx = result.group(1)
            spkr = result.group(2)

            result = reg_exp.match(sess_indx)
            assert result is not None, 'Wrong utterance ID format: \"{}\"'.format(
                sess_indx)
            sess = result.group(1)
            indx = int(result.group(2))

            sess = sess + '-' + spkr
            if sess not in data:
                assert indx == 1
                prev = -1
                data[sess] = list()
            assert indx >= prev
            data[sess].append(prob)
            prev = indx
    # The with-statement has already closed the reader.
    print(' loaded {} sessions'.format(len(data)))
    print(' combining fragments')
    self.data = dict()
    for sess, items in data.items():
        self.data[sess] = np.hstack(items)
def get_cremad_utterances(speech_dir, text_dir):
    """Collect CREMA-D utterances with emotion, speech and surrogate text vectors.

    Utterance ids come from ``CREMA_D_PATH/utt2spk``; the emotion is the part
    of the id before the first ``-``. Text embeddings are not looked up by id:
    each utterance is given an embedding drawn from the entries of
    ``dd_embedding.pkl`` whose id carries the same emotion prefix.

    Note:
        When ``text_dir`` is used, the module-level ``random`` generator is
        reseeded with every utterance id, which makes the draw repeatable per
        utterance and leaves the global generator state changed on return.

    Args:
        speech_dir: Directory containing ``cremad/xvector.scp``, or ``'none'``.
        text_dir: Directory containing ``dd_embedding.pkl``, or ``'none'``.

    Returns:
        A ``defaultdict(dict)`` mapping utterance id to a dict with
        ``'emotion'`` and, when requested, ``'speech'`` and ``'text'``.

    Raises:
        Exception: On a duplicate id in ``utt2spk`` or a speech vector whose
            id is not listed there.
    """
    utterances = defaultdict(dict)

    with open('%s/utt2spk' % (CREMA_D_PATH), 'r') as utt2spk:
        for entry in utt2spk.readlines():
            utt_id, _ = entry.split(' ')
            if utt_id in utterances:
                raise Exception("Duplicate utterance: %s" % utt_id)
            utterances[utt_id]['emotion'] = utt_id.split('-')[0]

    if speech_dir != 'none':
        with ReadHelper('scp:%s/cremad/xvector.scp' % speech_dir) as xvectors:
            for utt_id, vector in xvectors:
                if utt_id not in utterances:
                    raise Exception(
                        "Speech vector for unknown utterance: %s" % utt_id)
                utterances[utt_id]['speech'] = vector

    if text_dir != 'none':
        embeddings_by_emotion = defaultdict(list)
        with open('%s/dd_embedding.pkl' % text_dir, 'rb') as pkl:
            dd = pickle.load(pkl)
        for _, row in dd.iterrows():
            row_emotion = row['ID'].split('-')[0]
            embeddings_by_emotion[row_emotion].append(row['Text Embeddings'])

        for utt_id, record in utterances.items():
            candidates = embeddings_by_emotion[record['emotion']]
            random.seed(utt_id)
            record['text'] = random.choice(candidates)

    return utterances
def load_audio(self, audio_file):
    """Load one audio feature sequence and its validity mask.

    Args:
        audio_file: For ``"mfcc"`` a file readable by ``torchaudio.load``;
            for ``"cpc"`` either an existing text file readable by
            ``np.loadtxt`` or a path that is piped through ``gunzip -c`` and
            read as a Kaldi ark.

    Returns:
        Tuple ``(inputs, input_mask)``: the features after
        ``fix_embedding_length(..., self.max_feat_len)`` and a 1-D tensor of
        length ``self.max_feat_len`` that is 1 for the first ``nframes``
        positions and 0 elsewhere.

    Raises:
        ValueError: If ``self.audio_feature_type`` is neither ``"mfcc"`` nor
            ``"cpc"``, or if the ark stream contains no entry.
    """
    if self.audio_feature_type == "mfcc":
        audio, _ = torchaudio.load(audio_file)
        # NOTE(review): the original first tried audio[:, begin:end], but
        # neither name is defined in this method, so its bare except always
        # fell through to transforming the whole waveform. Only that
        # effective behaviour is kept - confirm no module-level begin/end
        # was intended.
        inputs = self.audio_transforms(audio).squeeze(0)
    elif self.audio_feature_type == "cpc":
        if os.path.exists(audio_file):
            audio = np.loadtxt(audio_file)
        else:
            audio = None
            with ReadHelper(f"ark: gunzip -c {audio_file} |") as ark_f:
                # Keep the last entry of the archive, as before.
                for _, audio in ark_f:
                    pass
            if audio is None:
                raise ValueError(f"No features found in {audio_file}")
        inputs = torch.FloatTensor(audio).t()
    else:
        # The exception used to be constructed but never raised, which
        # surfaced later as an UnboundLocalError on `inputs`.
        raise ValueError(
            f"Audio feature type {self.audio_feature_type} not supported")

    nframes = inputs.size(-1)
    input_mask = torch.zeros(self.max_feat_len)
    input_mask[:nframes] = 1.
    inputs = fix_embedding_length(inputs.t(), self.max_feat_len).t()
    return inputs, input_mask
def parse_predictions_ark(filepath):
    """Read a Kaldi ark of predictions into a dict keyed by utterance id.

    Args:
        filepath: Path of the ark file.

    Returns:
        Dict mapping each utterance id to the value stored for it.

    Raises:
        Exception: If the archive contains the same utterance id twice.
    """
    predictions_by_utt = {}
    with ReadHelper('ark:%s' % (filepath)) as ark:
        for utt_id, preds in ark:
            if utt_id in predictions_by_utt:
                raise Exception('%s duped in %s' % (utt_id, filepath))
            predictions_by_utt[utt_id] = preds
    return predictions_by_utt
def load_xvecs(xvec_file):
    """Read all x-vectors referenced by an scp file.

    Args:
        xvec_file: Path of the scp file.

    Returns:
        Dict mapping each key of the scp to its x-vector.
    """
    with ReadHelper('scp:' + xvec_file) as scp:
        return {name: vector for name, vector in scp}
def main():
    """Convert a Kaldi integer-vector archive on stdin into a compressed ``.npz``.

    Every utterance read from ``ark:-`` becomes one array in
    ``args.output_file``, stored under its utterance id. Arrays are stored as
    ``int8`` when all their values fit, otherwise in the smallest wider signed
    integer type that holds them, so ids above 127 are no longer wrapped
    around silently.
    """
    args = parser.parse_args()

    def narrow(values):
        # Smallest signed dtype that represents every value; int8 is tried
        # first to keep the previous on-disk format whenever it is lossless.
        values = np.asarray(values)
        if values.size == 0:
            return values.astype(np.int8)
        low, high = values.min(), values.max()
        for candidate in (np.int8, np.int16, np.int32):
            limits = np.iinfo(candidate)
            if limits.min <= low and high <= limits.max:
                return values.astype(candidate)
        return values.astype(np.int64)

    # from string utterance-id to a NumPy array containing the phones in
    # 1-based indexing.
    name_to_phones = {}
    with ReadHelper('ark:-') as reader:
        for key, numpy_array in reader:
            name_to_phones[key] = narrow(numpy_array)
    np.savez_compressed(args.output_file, **name_to_phones)
def data_gen_test_1():
    """Yield ``(features, one_hot_accent)`` for the selected test utterances.

    Changes the working directory to ``../../Accent_Data/test/``, loads the
    pickled ``test_dict`` and iterates ``feats1.scp`` .. ``feats8.scp``. Only
    utterances with ``test_dict[key] == 1`` are yielded. Features are
    zero-padded or truncated along axis 0 to exactly 1000 rows; the label is a
    length-8 one-hot float vector chosen by the second ``-``-separated field
    of the key.

    Raises:
        ValueError: If a selected key names an accent outside the eight known
            ones (previously the label of the preceding utterance was reused,
            or an ``UnboundLocalError`` was raised on the first utterance).
    """
    accents = ('AMERICAN', 'BRITISH', 'CHINESE', 'INDIAN', 'JAPANESE',
               'KOREAN', 'PORTUGUESE', 'RUSSIAN')
    max_frames = 1000
    feat_dim = 83

    os.chdir('../../')
    root_dir_test = 'Accent_Data/test/'
    print('FOR TEST DIR..............')
    print(os.getcwd())
    os.chdir(root_dir_test)
    print(os.getcwd())

    with open("test_dict", "rb") as file_to_read:
        test_dict = pickle.load(file_to_read)

    for i in range(1, 9):
        with ReadHelper("scp:" + "feats" + str(i) + ".scp") as reader:
            for key, numpy_array in reader:
                if test_dict[key] != 1:
                    continue

                accent = key.split('-')[1]
                if accent not in accents:
                    raise ValueError(
                        'Unknown accent %r in utterance id %r' % (accent, key))
                label = np.zeros(len(accents))
                label[accents.index(accent)] = 1.

                num_frames = numpy_array.shape[0]
                if num_frames < max_frames:
                    # Integer zeros, as before, so the dtype produced by the
                    # concatenation is unchanged.
                    padding = np.zeros((max_frames - num_frames, feat_dim),
                                       dtype=int)
                    numpy_array = np.concatenate([numpy_array, padding])
                elif num_frames > max_frames:
                    numpy_array = numpy_array[:max_frames]

                yield numpy_array, label
def train_models(pool_data, xvec_out_dir, combine_genders=False):
    """Fit PCA+GMM transforms on the pooled speaker x-vectors.

    Every sub-directory of ``pool_data`` that contains a ``wav.scp`` is a pool
    source. For each one, ``spk2gender`` is read from the source directory and
    the x-vectors from
    ``<xvec_out_dir>/xvectors_<source>/spk_xvector.scp``.

    Args:
        pool_data: Directory holding the pool source directories.
        xvec_out_dir: Directory holding the ``xvectors_<source>`` directories.
        combine_genders: When true, fit a single model on all x-vectors;
            otherwise fit one model per gender.

    Returns:
        The result of ``generate_pca_and_gmm`` on the whole pool when
        ``combine_genders`` is true, otherwise a dict with keys ``'m'`` and
        ``'f'`` holding the per-gender results.

    Raises:
        ValueError: If the x-vector scp of a pool source does not exist.
    """
    sources = []
    for entry in os.listdir(pool_data):
        is_dir = os.path.isdir(join(pool_data, entry))
        if is_dir and os.path.isfile(os.path.join(pool_data, entry, 'wav.scp')):
            sources.append(entry)

    gender_pools = {'m': [], 'f': []}
    xvector_pool = []

    for source in sources:
        print('Adding {} to the pool'.format(join(pool_data, source)))

        # Speaker -> gender table for this source.
        spk2gender = {}
        with open(join(pool_data, source, 'spk2gender')) as table:
            for row in table.read().splitlines():
                fields = row.split()
                spk2gender[fields[0]] = fields[1]

        # X-vectors of this source.
        xvec_scp = join(xvec_out_dir, 'xvectors_' + source, 'spk_xvector.scp')
        if not os.path.exists(xvec_scp):
            raise ValueError(
                'Xvector file: {} does not exist'.format(xvec_scp))

        with ReadHelper('scp:' + xvec_scp) as reader:
            for speaker, xvec in reader:
                xvector_pool.append(xvec)
                gender_pools[spk2gender[speaker]].append(xvec)

    print("Read ", len(gender_pools['m']), " male pool xvectors")
    print("Read ", len(gender_pools['f']), " female pool xvectors")

    # Fit the models.
    if combine_genders:
        return generate_pca_and_gmm(xvector_pool, pca_parameter,
                                    random_state=random_seed)

    return {
        gender: generate_pca_and_gmm(gender_pools[gender], pca_parameter,
                                     random_state=random_seed)
        for gender in ('m', 'f')
    }
def write_features(model, input, output):
    """Run ``model`` on every segment of a Kaldi data directory and store the result.

    Segments are produced by piping ``extract-segments`` over
    ``<input>/wav.scp`` and ``<input>/segments``. Each waveform is cast to
    float32 and passed to ``model``; every row of the result is repeated twice
    along axis 0 before being written to ``<output>/feats.ark`` and
    ``<output>/feats.scp``.

    Args:
        model: Callable taking a float32 waveform array and returning an array.
        input: Kaldi data directory to read from.
        output: Directory to write to; created if missing.
    """
    os.makedirs(output, exist_ok=True)

    rspecifier = f'ark:extract-segments scp:{input}/wav.scp {input}/segments ark:-|'
    wspecifier = f'ark,scp:{output}/feats.ark,{output}/feats.scp'

    with ReadHelper(rspecifier) as segments, WriteHelper(wspecifier) as sink:
        for utt_id, (sf, samples) in segments:
            features = model(samples.astype(dtype=np.float32))
            sink(utt_id, np.repeat(features, 2, axis=0))
def main(args):
    """Dump phone posteriors from ``phone_post.1.scp`` to ``phone_post.npy``.

    Args:
        args: Object with ``phone_post_dir`` (directory containing
            ``phone_post.1.scp``) and ``output_dir`` (directory receiving
            ``phone_post.npy``).

    Note:
        Every entry of the scp is saved to the same output path, so after the
        call the file holds the last entry only.
    """
    rspecifier = 'scp:' + args.phone_post_dir + '/phone_post.1.scp'
    target = args.output_dir + '/phone_post.npy'

    with ReadHelper(rspecifier) as posteriors:
        for _utt_id, matrix in posteriors:
            np.save(target, np.asarray(matrix))
def data_gen_test():
    """Yield ``(features, one_hot_accent)`` for every test utterance.

    Changes the working directory to ``../../Accent_Data/test/`` and iterates
    ``feats1.scp`` .. ``feats8.scp``. Features are zero-padded or truncated
    along axis 0 to exactly 1000 rows; the label is a length-8 one-hot float
    vector chosen by the second ``-``-separated field of the key.

    Raises:
        ValueError: If a key names an accent outside the eight known ones
            (previously the label of the preceding utterance was reused, or an
            ``UnboundLocalError`` was raised on the first utterance).
    """
    accents = ('AMERICAN', 'BRITISH', 'CHINESE', 'INDIAN', 'JAPANESE',
               'KOREAN', 'PORTUGUESE', 'RUSSIAN')
    max_frames = 1000
    feat_dim = 83

    os.chdir('../../')
    print('FOR TEST DIR..............')
    root_dir_test = 'Accent_Data/test/'
    os.chdir(root_dir_test)
    print(os.getcwd())

    for i in range(1, 9):
        with ReadHelper("scp:" + "feats" + str(i) + ".scp") as reader:
            for key, numpy_array in reader:
                num_frames = numpy_array.shape[0]
                if num_frames < max_frames:
                    # Integer zeros, as before, so the dtype produced by the
                    # concatenation is unchanged.
                    padding = np.zeros((max_frames - num_frames, feat_dim),
                                       dtype=int)
                    numpy_array = np.concatenate([numpy_array, padding])
                elif num_frames > max_frames:
                    numpy_array = numpy_array[:max_frames]

                accent = key.split('-')[1]
                if accent not in accents:
                    raise ValueError(
                        'Unknown accent %r in utterance id %r' % (accent, key))
                label = np.zeros(len(accents))
                label[accents.index(accent)] = 1.

                yield numpy_array, label
def main():
    """Score enrollment against test embeddings with ``generate_score``.

    Embeddings are read from the scp files ``opt.enroll_embedding_path`` and
    ``opt.test_embedding_path``. When ``opt.is_mean`` is set, a mean vector is
    subtracted from both matrices: the one computed by ``centering_mean`` on
    the enrollment set if ``opt.mean_vec == "self"``, otherwise the one loaded
    from the file ``opt.mean_vec``.
    """
    opt = parse_opt()

    enroll_mat = []
    test_mat = []
    # Looking up an unseen key assigns it the next free row index.
    enroll_idx_dict = defaultdict(lambda: len(enroll_idx_dict))
    test_idx_dict = defaultdict(lambda: len(test_idx_dict))

    enroll = 'scp:' + opt.enroll_embedding_path
    verify = 'scp:' + opt.test_embedding_path

    with ReadHelper(enroll) as reader:
        for key, numpy_array in reader:
            enroll_mat.append(numpy_array)
            enroll_idx_dict[key]  # registers the key -> row index mapping

    with ReadHelper(verify) as reader:
        for key, numpy_array in reader:
            test_mat.append(numpy_array)
            test_idx_dict[key]  # registers the key -> row index mapping

    enroll_mat = np.stack(enroll_mat)
    test_mat = np.stack(test_mat)

    if opt.is_mean:
        if opt.mean_vec == "self":
            mean_feat = centering_mean(enroll_mat)
        else:
            mean_feat = np.loadtxt(opt.mean_vec)
        enroll_mat = enroll_mat - mean_feat
        # Was `enroll_mat - mean_feat`, which replaced the test embeddings
        # with (doubly centred) enrollment embeddings.
        test_mat = test_mat - mean_feat

    enroll_idx_dict = dict(enroll_idx_dict)
    test_idx_dict = dict(test_idx_dict)

    assert enroll_mat.ndim == 2, "dimension should be 2"
    generate_score(enroll_mat, test_mat, enroll_idx_dict, test_idx_dict,
                   opt.trial_list, opt.result_cosine)
def main():
    """Compute score-normalisation statistics and write normalised scores.

    Trial embeddings are read from the scp ``opt.whole_trial_embedding_path``
    and cohort embeddings from the scp ``opt.cohort_embedding_path``. The
    statistics returned by ``generate_stat`` are handed to ``output_score``
    together with ``opt.raw_scores`` and ``opt.result_snorm``. Adaptive
    normalisation is requested unless ``opt.snorm`` is set.
    """
    opt = parse_opt()

    # Looking up an unseen key assigns it the next free row index.
    whole_idx_dict = defaultdict(lambda: len(whole_idx_dict))
    whole_rspec = 'scp:' + opt.whole_trial_embedding_path
    cohort_rspec = 'scp:' + opt.cohort_embedding_path

    trial_rows = []
    with ReadHelper(whole_rspec) as trials:
        for utt_id, embedding in trials:
            trial_rows.append(embedding)
            whole_idx_dict[utt_id]

    with ReadHelper(cohort_rspec) as cohort_reader:
        cohort_rows = [embedding for _, embedding in cohort_reader]

    whole_mat = np.stack(trial_rows)
    cohort_mat = np.stack(cohort_rows)
    assert whole_mat.ndim == 2 and cohort_mat.ndim == 2, "dimension should be 2"

    stats_dict = generate_stat(whole_mat, cohort_mat, whole_idx_dict,
                               not opt.snorm, opt.topN)
    output_score(opt.raw_scores, stats_dict, opt.result_snorm)
def compute_gop(df_phones_pure, df_alignments):
    """Compute a GOP entry per utterance of ``loglikes.ark`` and pickle the result.

    For each utterance the matrix read from the archive is passed through a
    row-wise softmax, attached as column ``'p'`` to the transposed alignment
    column of that utterance, and handed to ``gop_robust_with_matrix``. The
    dict of results is written to ``gop_epa.pickle``.

    Args:
        df_phones_pure: Passed through to ``gop_robust_with_matrix``.
        df_alignments: DataFrame with one column per utterance key.
    """
    scores_by_utt = {}
    with ReadHelper('ark:loglikes.ark') as archive:
        for utt_id, matrix in tqdm.tqdm(archive):
            # Apply softmax before computing.
            posteriors = softmax(np.array(matrix), axis=1)
            utt_frame = pd.DataFrame(df_alignments.loc[:, utt_id]).transpose()
            utt_frame['p'] = [posteriors]
            scores_by_utt[utt_id] = gop_robust_with_matrix(
                utt_frame, df_phones_pure, 6024, 1, [], [])

    with open('gop_epa.pickle', 'wb') as sink:
        pickle.dump(scores_by_utt, sink, protocol=pickle.HIGHEST_PROTOCOL)
def extract_mfcc(name, original_mfcc_dir, mfcc_npy_root_dir):
    """Export every MFCC matrix of ``raw_mfcc_<name>.*.scp`` as a ``.npy`` file.

    For each matching scp file the matrices are saved to
    ``<mfcc_npy_root_dir>/<name>.<num>/<utt_id>.npy``, where ``<num>`` is the
    second-to-last dot-separated part of the scp path.

    Args:
        name: Data set name used in the scp file names and output directories.
        original_mfcc_dir: Directory containing the scp files.
        mfcc_npy_root_dir: Root directory for the exported ``.npy`` files.

    Returns:
        Dict mapping utterance id to the path of its ``.npy`` file.
    """
    npy_path_by_utt = {}
    pattern = p_join(original_mfcc_dir, 'raw_mfcc_{}.*.scp'.format(name))

    for scp_path in glob(pattern):
        split_id = scp_path.split('.')[-2]
        print('extract:', scp_path)

        out_dir = p_join(mfcc_npy_root_dir, name + '.' + split_id)
        os.makedirs(out_dir, exist_ok=True)

        with ReadHelper('scp:' + scp_path) as matrices:
            for utt_id, mfcc in matrices:
                npy_path = p_join(out_dir, utt_id + '.npy')
                np.save(npy_path, mfcc)
                npy_path_by_utt[utt_id] = npy_path

    return npy_path_by_utt
def get_iemocap_utterances(speech_dir, text_dir, subset):
    """Collect the IEMOCAP utterances of one session.

    Utterance ids come from ``IEMOCAP_PATH/utt2spk``. The session number is
    parsed from the first two characters of the third ``-``-separated field of
    the id, and only ids whose session equals the last character of ``subset``
    (as an integer) are kept. The emotion is the part of the id before the
    first ``-``.

    Args:
        speech_dir: Directory containing ``iemocap/xvector.scp``, or ``'none'``.
        text_dir: Directory containing ``iemocap_embedding.pkl``, or ``'none'``.
        subset: String whose last character is the session number to keep.

    Returns:
        A ``defaultdict(dict)`` mapping utterance id to a dict with
        ``'session'``, ``'emotion'`` and, when requested, ``'speech'`` and
        ``'text'``.

    Raises:
        Exception: On a duplicate id in ``utt2spk``, or a speech/text vector
            of the selected session whose id is not listed there.
    """
    def session_of(utt_id):
        return int(utt_id.split('-')[2][0:2])

    def selected(session):
        return session == int(subset[-1:])

    utterances = defaultdict(dict)

    with open('%s/utt2spk' % (IEMOCAP_PATH), 'r') as utt2spk:
        for entry in utt2spk.readlines():
            utt_id, _ = entry.split(' ')
            session = session_of(utt_id)
            if not selected(session):
                continue
            if utt_id in utterances:
                raise Exception("Duplicate utterance: %s" % utt_id)
            record = utterances[utt_id]
            record['session'] = session
            record['emotion'] = utt_id.split('-')[0]

    if speech_dir != 'none':
        with ReadHelper('scp:%s/iemocap/xvector.scp' % speech_dir) as xvectors:
            for utt_id, vector in xvectors:
                if not selected(session_of(utt_id)):
                    continue
                if utt_id not in utterances:
                    raise Exception(
                        "Speech vector for unknown utterance: %s" % utt_id)
                utterances[utt_id]['speech'] = vector

    if text_dir != 'none':
        with open('%s/iemocap_embedding.pkl' % text_dir, 'rb') as pkl:
            iemocap = pickle.load(pkl)
        for _, row in iemocap.iterrows():
            utt_id = row['ID']
            if not selected(session_of(utt_id)):
                continue
            if utt_id not in utterances:
                raise Exception(
                    "Text vector for unknown utterance: %s" % utt_id)
            utterances[utt_id]['text'] = row['Text Embeddings']

    return utterances
def get_train_data():
    """Load, augment and length-normalise the TIMIT training features.

    Reads ``feats1.scp`` .. ``feats20.scp`` from the training dump directory.
    Each matrix goes through ``spec_augment``, is padded (``pad_sequences``,
    post-padding) or truncated to 800 rows, and gets one extra column holding
    the gender flag taken from the first character of the key (``'F'`` -> 1,
    ``'M'`` -> 0). Keys starting with any other character are reported and
    skipped.

    The working directory is changed while reading and restored to its
    previous value afterwards, also when an error occurs.

    Returns:
        Tuple ``(feat_arr_train, ids_train)`` of equal length: the list of
        feature matrices and the list of the corresponding keys.
    """
    feat_arr_train = []
    ids_train = []
    # Root directory for training set
    root_dir_train = 'TIMIT_Data/speed_perturbation_80fbanks3pitchs/dump/trainNet_sp/deltafalse/'
    max_frames = 800

    start_dir = os.getcwd()
    os.chdir(root_dir_train)
    try:
        for i in range(1, 21):
            with ReadHelper('scp:' + 'feats' + str(i) + '.scp') as reader:
                for key, numpy_array in reader:
                    numpy_array = spec_augment(numpy_array)
                    if numpy_array.shape[0] < max_frames:
                        numpy_array = pad_sequences(numpy_array.T,
                                                    maxlen=max_frames,
                                                    padding='post')
                        numpy_array = numpy_array.T
                    elif numpy_array.shape[0] > max_frames:
                        numpy_array = numpy_array[:max_frames]

                    # Incorporating gender information as a binary feature.
                    if key[0] == 'F':
                        gender_flag = 1
                    elif key[0] == 'M':
                        gender_flag = 0
                    else:
                        print('ERROR! ' + str(key))
                        continue

                    # The id is recorded only together with its feature;
                    # it used to be appended before the gender check, which
                    # shifted ids against features after a skipped key.
                    ids_train.append(key)
                    gender_column = np.full((max_frames, 1), gender_flag)
                    feat_arr_train.append(
                        np.concatenate((numpy_array, gender_column), axis=1))
    finally:
        # Coming back to the directory we started from.
        os.chdir(start_dir)

    return feat_arr_train, ids_train
def __init__(self, vad_rspec, reg_exp):
    """Load VAD alignments into ``self.data[session][piece][speaker]``.

    Args:
        vad_rspec: Read specifier passed to ``ReadHelper``.
        reg_exp: Compiled regular expression whose groups 1, 2 and 3 give the
            session, piece and speaker parts of an utterance id.

    Raises:
        AssertionError: If an utterance id does not match ``reg_exp``.
    """
    data = dict()
    with ReadHelper(vad_rspec) as reader:
        for utid, align in reader:
            result = reg_exp.match(utid)
            assert result is not None, 'Wrong VAD alignment utterance ID format: \"{}\"'.format(
                utid)
            sess = result.group(1)
            piece = result.group(2)
            spkr = result.group(3)
            data.setdefault(sess, dict()).setdefault(piece, dict())[spkr] = align
    # The with-statement has already closed the reader.
    print(' loaded {} sessions'.format(len(data)))
    self.data = data
def test_nosil(generator, ds_test, device, mindcf=False):
    """Evaluate ``generator`` on the verification trials using voiced frames only.

    Features are read through a Kaldi pipeline (sliding-window CMVN followed
    by ``select-voiced-frames``) rooted at ``ds_test.data_base_path``.
    Embeddings are computed for the keys listed in ``ds_test.veri_utts``,
    normalised with ``normalize(..., axis=1)`` and scored pairwise with a
    cosine ``SpeakerRecognitionMetrics``. The model is put in eval mode for
    the evaluation and switched back to train mode before returning.

    Args:
        generator: Embedding model.
        ds_test: Data set object providing ``veri_utts``, ``veri_0``,
            ``veri_1``, ``veri_labs`` and ``data_base_path``.
        device: Device the features are moved to.
        mindcf: When true, also return the minimum DCF at ``p_target`` 0.01
            and 0.001.

    Returns:
        ``eer``, or ``(eer, mindcf1, mindcf2)`` when ``mindcf`` is true.
    """
    generator.eval()

    embed_chunks = []
    kept_utts = []
    num_examples = len(ds_test.veri_utts)
    rspecifier = (
        'ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:{0}/feats_trimmed.scp '
        'ark:- | select-voiced-frames ark:- scp:{0}/vad_trimmed.scp ark:- |'
    ).format(ds_test.data_base_path)

    with torch.no_grad(), ReadHelper(rspecifier) as reader:
        for utt, feat in tqdm(reader, total=num_examples):
            if utt not in ds_test.veri_utts:
                continue
            kept_utts.append(utt)
            batch = torch.FloatTensor(feat).unsqueeze(0).to(device)
            embed_chunks.append(generator(batch).cpu().numpy())

    metric = SpeakerRecognitionMetrics(distance_measure='cosine')
    all_embeds = normalize(np.vstack(embed_chunks), axis=1)
    all_utts = np.array(kept_utts)
    print(all_embeds.shape, len(ds_test.veri_utts))

    utt_embed = OrderedDict(zip(all_utts, all_embeds))
    emb0 = np.array([utt_embed[utt] for utt in ds_test.veri_0])
    emb1 = np.array([utt_embed[utt] for utt in ds_test.veri_1])

    scores = metric.scores_from_pairs(emb0, emb1)
    fpr, tpr, thresholds = roc_curve(1 - ds_test.veri_labs, scores,
                                     pos_label=1, drop_intermediate=False)
    eer = metric.eer_from_ers(fpr, tpr)
    generator.train()

    if not mindcf:
        return eer

    mindcf1 = metric.compute_min_dcf(fpr, tpr, thresholds, p_target=0.01)
    mindcf2 = metric.compute_min_dcf(fpr, tpr, thresholds, p_target=0.001)
    return eer, mindcf1, mindcf2
import sys
from glob import iglob, glob
from os.path import basename, dirname, join as p_join
import json

import numpy as np
from kaldiio import ReadHelper, WriteHelper

# Usage: <script> <vad.scp> <utt2num_frames>
# Writes one "<utt_id> <number of VAD frames>" line per entry of the scp.
vad_scp = sys.argv[1]
utt2num_frames = sys.argv[2]

utt_ids = []
vads = []
utt_id2vad = {}

with ReadHelper('scp:' + vad_scp) as vad_reader, \
        open(utt2num_frames, 'w') as writer:
    for i, (utt_id, vad) in enumerate(vad_reader):
        utt_id2vad[utt_id] = vad
        print(utt_id, len(vad), file=writer)
def make_asr_data(src_file, tgt_file, tgt_dicts, max_src_length=64, max_tgt_length=64,
                  input_type='word', stride=1, concat=1, prev_context=0,
                  fp16=False, reshape=True, asr_format="h5"):
    """Pair acoustic feature matrices with tokenised target lines.

    The i-th line of ``tgt_file`` is paired with the i-th feature matrix of
    the source. Pairs whose target has more than ``max_tgt_length - 2`` tokens
    or whose source has more than ``max_src_length`` frames are dropped, as
    are pairs with an empty target line. The kept pairs are returned sorted by
    source length, ties broken by target length.

    Besides its arguments the function reads ``opt.tgt_seq_length_trunc``,
    ``opt.report_every`` and ``opt.shuffle`` from the module-level ``opt``.

    Args:
        src_file: For ``"h5"`` either a path ending in ``h5`` or a prefix of
            the numbered files ``<prefix>.<n>.h5``; for ``"scp"`` a Kaldi scp.
        tgt_file: Text file with one target sentence per line.
        tgt_dicts: Dictionary object providing ``convertToIdx``.
        max_src_length: Maximum number of source frames (after reshaping).
        max_tgt_length: Maximum target length, including two extra symbols.
        input_type: ``'word'`` to split targets on whitespace, ``'char'`` to
            split with ``split_line_by_char``.
        stride: Keep every ``stride``-th frame.
        concat: Number of consecutive frames merged into one when ``reshape``
            is true; the sequence is zero-padded to a multiple of it.
        prev_context: Must be 0; anything larger raises.
        fp16: Convert the source tensors to half precision.
        reshape: Enable the ``concat`` frame merging.
        asr_format: ``"h5"`` or ``"scp"``.

    Returns:
        Tuple ``(src, tgt)`` of equally long lists of tensors.

    Raises:
        ValueError: If ``asr_format`` is neither ``"h5"`` nor ``"scp"``.
        NotImplementedError: If ``prev_context > 0``.
    """
    if asr_format not in ("h5", "scp"):
        raise ValueError("Unsupported asr_format: %s" % asr_format)

    src, tgt = [], []
    src_sizes = []
    tgt_sizes = []
    count, ignored = 0, 0
    n_unk_words = 0

    print('Processing %s & %s ...' % (src_file, tgt_file))

    srcf = None
    scp_reader = None
    audio_data = None
    fileIdx = -1
    tgtf = None
    try:
        if asr_format == "h5":
            if src_file[-2:] == "h5":
                srcf = h5.File(src_file, 'r')
            else:
                fileIdx = 0
                srcf = h5.File(src_file + "." + str(fileIdx) + ".h5", 'r')
        else:
            from kaldiio import ReadHelper
            scp_reader = ReadHelper('scp:' + src_file)
            audio_data = iter(scp_reader)

        tgtf = open(tgt_file)
        index = 0

        while True:
            tline = tgtf.readline()

            # normal end of file
            if tline == "":
                break

            if asr_format == "h5":
                if str(index) in srcf:
                    featureVectors = np.array(srcf[str(index)])
                elif fileIdx != -1:
                    # Index not in the current file: move on to the next
                    # numbered h5 file.
                    srcf.close()
                    fileIdx += 1
                    srcf = h5.File(src_file + "." + str(fileIdx) + ".h5", 'r')
                    featureVectors = np.array(srcf[str(index)])
                else:
                    print("No feature vector for index:", index, file=sys.stderr)
                    sys.exit(-1)
            else:
                _, featureVectors = next(audio_data)
                # NOTE(review): keeps the first 40 feature columns; assumed
                # to apply to the scp input only - confirm.
                featureVectors = featureVectors[:, :40]

            if stride == 1:
                sline = torch.from_numpy(featureVectors)
            else:
                # Was opt.stride, which ignored the value passed by the caller.
                sline = torch.from_numpy(featureVectors[0::stride])

            if reshape:
                if concat != 1:
                    add = (concat - sline.size()[0] % concat) % concat
                    pad = torch.FloatTensor(add, sline.size()[1]).zero_()
                    sline = torch.cat((sline, pad), 0)
                    sline = sline.reshape((int(sline.size()[0] / concat),
                                           sline.size()[1] * concat))
            index += 1

            tline = tline.strip()

            if prev_context > 0:
                print("Multiple ASR context isn't supported at the moment ")
                raise NotImplementedError

            # source and/or target are empty
            if tline == "":
                print('WARNING: ignoring an empty line (' + str(count + 1) + ')')
                continue

            if input_type == 'word':
                tgt_words = tline.split()
            elif input_type == 'char':
                tgt_words = split_line_by_char(tline)

            if len(tgt_words) <= max_tgt_length - 2 and sline.size(0) <= max_src_length:

                # Check truncation condition.
                if opt.tgt_seq_length_trunc != 0:
                    tgt_words = tgt_words[:opt.tgt_seq_length_trunc]

                if fp16:
                    sline = sline.half()
                src += [sline]

                tgt_tensor = tgt_dicts.convertToIdx(tgt_words,
                                                    onmt.Constants.UNK_WORD,
                                                    onmt.Constants.BOS_WORD,
                                                    onmt.Constants.EOS_WORD)
                tgt += [tgt_tensor]
                src_sizes += [len(sline)]
                tgt_sizes += [len(tgt_words)]

                unks = tgt_tensor.eq(onmt.Constants.UNK).sum().item()
                n_unk_words += unks

                if unks > 0:
                    if "<unk>" not in tline:
                        print("DEBUGGING: This line contains UNK: %s" % tline)
            else:
                ignored += 1

            count += 1

            if count % opt.report_every == 0:
                print('... %d sentences prepared' % count)
    finally:
        # Release the handles on every exit path, not only after a clean run.
        if srcf is not None:
            srcf.close()
        if scp_reader is not None:
            scp_reader.close()
        if tgtf is not None:
            tgtf.close()

    print('Total number of unk words: %d' % n_unk_words)

    if opt.shuffle == 1:
        print('... shuffling sentences')
        perm = torch.randperm(len(src))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        src_sizes = [src_sizes[idx] for idx in perm]
        tgt_sizes = [tgt_sizes[idx] for idx in perm]

    print('... sorting sentences by size')

    # Ultimately sort by source size, with target size as tie-breaker. The
    # sort is stable, so this equals sorting by target size first and by
    # source size afterwards.
    sorted_pairs = sorted(zip(src, tgt, src_sizes, tgt_sizes),
                          key=lambda item: (item[2], item[3]))

    src = [item[0] for item in sorted_pairs]
    tgt = [item[1] for item in sorted_pairs]

    print(('Prepared %d sentences ' +
           '(%d ignored due to length == 0 or src len > %d or tgt len > %d)') %
          (len(src), ignored, max_src_length, max_tgt_length))

    return src, tgt
from kaldiio import ReadHelper
import numpy as np

# Stack every speaker x-vector of spk_xvector.scp into one 2-D array
# (one row per speaker, in scp order) and save it as spk_vector_bigd.npy.
m = 0
nsp = np.array([])
abss = []

# Rows are collected first and stacked once; calling np.vstack inside the
# loop copied the whole matrix for every speaker.
_rows = []
with ReadHelper('scp:spk_xvector.scp') as reader:
    for key, ab in reader:
        _rows.append(np.asarray(ab).reshape(1, -1))

m = len(_rows)
if _rows:
    nsp = np.vstack(_rows)

print(nsp.shape)
np.save("spk_vector_bigd.npy", nsp)
sc = ax.scatter(x, y, **kw) if (m is not None) and (len(m) == len(x)): paths = [] for marker in m: if isinstance(marker, mmarkers.MarkerStyle): marker_obj = marker else: marker_obj = mmarkers.MarkerStyle(marker) path = marker_obj.get_path().transformed( marker_obj.get_transform()) paths.append(path) sc.set_paths(paths) return sc with ReadHelper('scp:%s' % args.feat_scp_1) as reader: for key, numpy_array in reader: # torch_array = F.normalize(torch.from_numpy(numpy_array),dim=0) # numpy_array = torch_array.numpy() #print(numpy_array) data.append(numpy_array) speaker.append(key.split("_")[0]) marker.append("x") alpha.append(1.0) with ReadHelper('scp:%s' % args.feat_scp_2) as reader: for key, numpy_array in reader: # torch_array = F.normalize(torch.from_numpy(numpy_array),dim=0) # numpy_array = torch_array.numpy() data.append(numpy_array) speaker.append(key.split("_")[0]) marker.append("o")
args = sys.argv data_dir = args[1] target_spk = args[2] out_dir = args[3] dataset_of_target = args[4] dataname = basename(data_dir) yaap_pitch_dir = join(data_dir, 'yaapt_pitch') pitch_out_dir = join(out_dir, "f0") statsdir = "exp/vc_toolkit_exp_voice_privacy/feats/f0/" # Write pitch features pitch_file = join(data_dir, 'pitch.scp') pitch2shape = {} with ReadHelper('scp:' + pitch_file) as reader: for key, mat in reader: pitch2shape[key] = mat.shape[0] kaldi_f0 = mat[:, 1].squeeze().copy() yaapt_f0 = readwrite.read_raw_mat(join(yaap_pitch_dir, key + '.f0'), 1) #unvoiced = np.where(yaapt_f0 == 0)[0] #kaldi_f0[unvoiced] = 0 #readwrite.write_raw_mat(kaldi_f0, join(pitch_out_dir, key+'.f0')) if kaldi_f0.shape < yaapt_f0.shape: print("Warning yaapt_f0 > kaldi_f0 for utt:", key) yaapt_f0 = yaapt_f0[:kaldi_f0.shape[0]] f0 = np.zeros(kaldi_f0.shape) f0[:yaapt_f0.shape[0]] = yaapt_f0 source_stats = {} with open(statsdir + dataname + "/" +
def inference(rspecifier, wspecifier, model_type, model_path, model_config,
              bnf_feature_kind, data_config, output_txt):
    """Extract bottleneck features for every waveform of a Kaldi rspecifier.

    Each waveform is scaled by ``MAX_WAV_VALUE``, resampled to the configured
    sampling rate, peak-normalised when its magnitude reaches 1.0, converted
    to a normalised mel spectrum and encoded with ``model.encoder`` and
    ``model.quantizer``.

    Args:
        rspecifier: Kaldi read specifier yielding ``(utt, (rate, samples))``.
        wspecifier: Path of the text file when text output is used, otherwise
            a Kaldi write specifier.
        model_type: Name of the module under ``model`` providing ``Model``.
        model_path: Checkpoint file; its ``'model'`` entry is loaded.
        model_config: Configuration passed to ``Model``.
        bnf_feature_kind: ``'id'`` (code indices), ``'csid'`` (indices with
            consecutive duplicates removed) or ``'token'`` (decoded vectors).
        data_config: Dict with the feature extraction settings and either
            ``statistic_file`` or ``training_dir``.
        output_txt: Write ``"<utt> <id><id>..."`` lines instead of an
            archive. Only honoured for ``'id'`` and ``'csid'``.

    Raises:
        AssertionError: If the feature kind in ``data_config`` does not start
            with ``mel`` or ``bnf_feature_kind`` is not one of the three
            supported values.
    """
    from pathlib import Path

    stat_dict = data_config.get('statistic_file', None)
    feat_kind = data_config.get('feature_kind', 'mel-id')
    filter_length = data_config.get('filter_length', 1024)
    hop_length = data_config.get('hop_length', 256)
    win_length = data_config.get('win_length', 1024)
    n_mel_channels = data_config.get('n_mel_channels', 80)
    sampling_rate = data_config.get('sampling_rate', 24000)
    mel_fmin = data_config.get('mel_fmin', 80)
    mel_fmax = data_config.get('mel_fmax', 7600)

    feature_kinds = feat_kind.split('-')
    assert feature_kinds[0] in ['mel']
    assert bnf_feature_kind in ['id', 'csid', 'token']

    module = import_module('model.{}'.format(model_type), package=None)
    model = getattr(module, 'Model')(model_config)
    model.load_state_dict(torch.load(model_path, map_location='cpu')['model'])
    model.cuda().eval()

    # Read stat scp
    if stat_dict is None:
        # Path() makes the join work when training_dir is a plain string;
        # `str / str` raised a TypeError.
        stat_scp = Path(data_config.get('training_dir', '')) / 'stat.scp'
        with open(stat_scp, 'r') as rf:
            stat_dict = [line.rstrip() for line in rf.readlines()][0]
    elif stat_dict.split('.')[-1] == 'scp':
        with open(stat_dict, 'r') as rf:
            stat_dict = [line.rstrip() for line in rf.readlines()][0]

    feat_stat = torch.load(stat_dict)

    feat_fn = MelSpectrum(filter_length=filter_length,
                          hop_length=hop_length,
                          win_length=win_length,
                          n_mel_channels=n_mel_channels,
                          sampling_rate=sampling_rate,
                          mel_fmin=mel_fmin,
                          mel_fmax=mel_fmax,
                          feat_stat=feat_stat).cuda()

    if output_txt and bnf_feature_kind in ['id', 'csid']:
        bnf_writer = open(wspecifier, 'w')
    else:
        # Was WriteHelper(bnf_writer, ...): the name was read before it had
        # been assigned.
        bnf_writer = WriteHelper(wspecifier, compression_method=1)
        output_txt = False

    try:
        with ReadHelper(rspecifier) as reader:
            for utt, (rate, X) in reader:
                X = X.astype(np.float32) / MAX_WAV_VALUE
                X = librosa.core.resample(X, orig_sr=rate,
                                          target_sr=sampling_rate,
                                          res_type='kaiser_best')
                peak = np.max(np.abs(X))
                if peak >= 1.0:
                    X /= peak

                # Extract features
                X = feat_fn(torch.from_numpy(X).cuda().unsqueeze(0))
                X = feat_fn.normalize(X)
                X_in = X['mel']

                with torch.no_grad():
                    z = model.encoder(X_in)
                    z_id = model.quantizer.encode(z)
                    z_vq = model.quantizer.decode(z_id)

                # Save converted feats
                if bnf_feature_kind == 'id':
                    X_bnf = z_id.view(-1).cpu().numpy()
                elif bnf_feature_kind == 'csid':
                    X_bnf = z_id.view(-1).unique_consecutive().cpu().numpy()
                elif bnf_feature_kind == 'token':
                    X_bnf = z_vq.squeeze(0).t().cpu().numpy()

                if output_txt:
                    X_bnf = X_bnf.reshape(-1)
                    X_bnf = ''.join(['<{}>'.format(bnf) for bnf in X_bnf])
                    bnf_writer.write('{} {}\n'.format(utt, X_bnf))
                else:
                    # The kaldiio writer is invoked by calling it.
                    bnf_writer(utt, X_bnf)

                print('Extracting BNF {} of {}.'.format(bnf_feature_kind, utt),
                      end=' ' * 30 + '\r')
    finally:
        bnf_writer.close()