def extract_with_vad_and_normalization(self, data_loc, split, threshold=5.5, mean_scale=0.5, cmvn_window=300, var_norm=False):
    """Extract MFCC features, compute VAD, then normalize and keep voiced frames.

    Pipeline: (1) extract raw MFCCs, (2) compute per-frame VAD decisions,
    (3) shard feats.scp/vad.scp across ``self.n_jobs`` workers, (4) run the
    Kaldi chain ``apply-cmvn-sliding | select-voiced-frames | copy-feats`` on
    the queue, and (5) concatenate the per-job outputs into
    ``{data_loc}/voiced_feats.scp`` and ``{data_loc}/utt2num_frames``.

    :param data_loc: directory holding feats.scp / vad.scp for this split
    :param split: split name used to tag intermediate/output files
    :param threshold: VAD energy threshold
    :param mean_scale: VAD mean-scale parameter
    :param cmvn_window: sliding CMVN window size in frames
    :param var_norm: also normalize variance (not just mean) when True
    """
    # NOTE(review): the original also built a vad_loc path here and fed it to
    # str.format, but the command template never references {vad_loc}; dropped.
    tmp_loc = join_path(self.save_loc, TMP_DIR)
    feats_scp = join_path(data_loc, FEATS_SCP_FILE)
    vad_scp = join_path(data_loc, VAD_SCP_FILE)

    print('MFCC: Extracting features...')
    self.extract(data_loc, split)

    print('MFCC: Computing VAD...')
    vad = VAD(threshold, mean_scale, n_jobs=self.n_jobs, save_loc=self.save_loc)
    vad.compute(data_loc, split)

    print('MFCC: Normalizing features and selecting voiced frames..')
    feats_scp_dict = spaced_file_to_dict(feats_scp)
    vad_scp_dict = spaced_file_to_dict(vad_scp)
    # Shard the utterance keys so each queue job gets its own scp pair.
    splits = np.array_split(list(feats_scp_dict.keys()), self.n_jobs)
    for i in range(self.n_jobs):
        # with-blocks guarantee the shard files are closed even if a
        # vad_scp_dict lookup raises (original leaked the handles on error)
        with open(join_path(tmp_loc, 'feats.{}.scp'.format(i + 1)), 'w') as split_feat_scp, \
                open(join_path(tmp_loc, 'vad.{}.scp'.format(i + 1)), 'w') as split_vad_scp:
            for key in splits[i]:
                split_feat_scp.write('{} {}\n'.format(key, feats_scp_dict[key]))
                split_vad_scp.write('{} {}\n'.format(key, vad_scp_dict[key]))

    Kaldi().queue('JOB=1:{nj} {mfcc_loc}/log/voiced_feats.JOB.log '
                  'apply-cmvn-sliding --norm-vars={var_norm} --center=true --cmn-window={window} scp:{tmp_loc}/feats.JOB.scp ark:- \| '
                  'select-voiced-frames ark:- scp,ns,cs:{tmp_loc}/vad.JOB.scp ark:- \| '
                  'copy-feats --compress=true --write-num-frames=ark,t:{mfcc_loc}/log/utt2num_frames.{name}.JOB ark:- '
                  'ark,scp:{mfcc_loc}/voiced_feats.{name}.JOB.ark,{mfcc_loc}/voiced_feats.{name}.JOB.scp || exit 1;'
                  .format(mfcc_loc=self.mfcc_loc, tmp_loc=tmp_loc,
                          var_norm='true' if var_norm else 'false',
                          nj=self.n_jobs, window=cmvn_window, name=split))
    # Merge the per-job scp/num-frames outputs into single files in data_loc.
    run_command('for n in $(seq {nj}); do \n'
                '  cat {mfcc_loc}/voiced_feats.{name}.$n.scp || exit 1;\n'
                'done > {data_loc}/voiced_feats.scp || exit 1'
                .format(mfcc_loc=self.mfcc_loc, data_loc=data_loc, nj=self.n_jobs, name=split))
    run_command('for n in $(seq {nj}); do \n'
                '  cat {mfcc_loc}/log/utt2num_frames.{name}.$n || exit 1;\n'
                'done > {data_loc}/utt2num_frames || exit 1'
                .format(mfcc_loc=self.mfcc_loc, data_loc=data_loc, nj=self.n_jobs, name=split))
def watch_job(job_id, wait=3):
    """Block until queue job *job_id* is no longer listed in the queue.

    Polls the queue status command every 3 seconds after an initial delay.

    :param job_id: queue job id to wait on
    :param wait: seconds to sleep before the first poll
    """
    time.sleep(wait)
    while True:
        output, _ = run_command(QUEUE_JOB_STATUS_CMD.format(job_id))
        # Fewer than 2 output lines => the job no longer appears in the
        # status listing, so it has finished. Return immediately instead of
        # sleeping one extra poll interval (original slept 3s after detecting
        # completion).
        # NOTE(review): assumes run_command returns str here; gpu_stat decodes
        # bytes explicitly — confirm the actual return type.
        if len(output.split('\n')) < 2:
            return
        time.sleep(3)
def gpu_stat(total_nodes=5, slots_per_node=2):
    """Return free GPU slots as a list of ``(node_name, gpu_id)`` pairs.

    Counts how many slots each compute node occupies according to the queue
    usage command, then reports the remaining slots per node.

    Bug fix: the original wrapped the ``Counter`` lookup in
    ``try/except KeyError``, but ``Counter`` returns 0 for missing keys and
    never raises — the except branch was dead, so fully-free nodes were
    reported with placeholder gpu id -1 instead of real ids 0..slots-1.

    :param total_nodes: number of compute nodes to inspect
    :param slots_per_node: GPU slots available on each node
    """
    output, _ = run_command(QUEUE_GPU_USAGE_CMD)
    # One line per occupied slot; drop the trailing empty line after split.
    used_slots = Counter(output.decode("utf-8").split('\n')[:-1])
    free_slots = []
    gpu_ids = []
    for i in range(total_nodes):
        node = 'compute-0-{}.local'.format(i)
        if node in used_slots:
            count = used_slots[node]
            # Partially used node: we cannot tell which physical GPUs are
            # free, so mark the remaining slots with -1.
            free_gpu = [-1] * (slots_per_node - count)
        else:
            # Fully free node: every gpu id is available.
            count = 0
            free_gpu = list(range(slots_per_node))
        free_slots = free_slots + [node] * (slots_per_node - count)
        gpu_ids = gpu_ids + free_gpu
    return list(zip(free_slots, gpu_ids))
def delete_jobs(pattern):
    """Delete queued jobs matching *pattern*; return the command's stdout."""
    result, _ = run_command(QUEUE_DELETE_CMD.format(pattern))
    return result
def submit_job(job):
    """Run the submission command *job* and return the queued job's id."""
    stdout, _ = run_command(job)
    # The third space-separated token of the submit output is the job id
    # (presumably "Your job <id> ..." style output — verify against the queue).
    return stdout.split(' ')[2]