def reco_files(self):
    # Recognize every selected wav file with the current backend.
    fnames = QFileDialog.getOpenFileNames(self, "Select Wav Files", "", "Files (*.wav)")
    print 'reco_files'
    for f in fnames:
        fs, sig = read_wav(f)
        newsig = self.backend.filter(fs, sig)
        label = self.backend.predict(fs, newsig)
        print f, label
def enroll_file(self):
    # Pick a wav file, down-mix it to mono and keep it as the pending enrollment sample.
    fname = QFileDialog.getOpenFileName(self, "Open Wav File", "", "Files (*.wav)")
    if not fname:
        return
    self.status(fname)
    self.enrollFileName.setText(fname)
    fs, signal = read_wav(fname)
    signal = monophonic(signal)
    self.enrollWav = (fs, signal)
def reco_file(self):
    # Pick a single wav file and run speaker prediction on it.
    fname = QFileDialog.getOpenFileName(self, "Open Wav File", "", "Files (*.wav)")
    print 'reco_file'
    if not fname:
        return
    self.status(fname)
    fs, signal = read_wav(fname)
    self.reco_do_predict(fs, signal)
def __init__(self):
    # Load the trained model, initialise background-noise filtering and wire up the ROS topics.
    self.backend = ModelInterface.load(self.INPUT_MODEL)
    try:
        fs, signal = read_wav(self.BG)
        self.backend.init_noise(fs, signal)
    except:
        print "file not found!"
    self.pub = rospy.Publisher('/speaker', String, queue_size=10)
    self.sub = rospy.Subscriber('/wav', numpy_msg(Floats), self.task_predict)
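# Hedged sketch of the '/wav' callback referenced above; it is not part of the
# original snippet. The 8 kHz sample rate and the exact return type of
# predict() are assumptions, and numpy is assumed to be imported as np.
def task_predict(self, data):
    fs = 8000                          # assumed sample rate of the streamed audio
    signal = np.asarray(data.data)     # numpy_msg(Floats) delivers the raw float samples
    label = self.backend.predict(fs, signal)
    self.pub.publish(str(label))       # announce the recognized speaker on /speaker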
def __init__(self, parent=None):
    QWidget.__init__(self, parent)
    uic.loadUi("edytor.ui", self)
    self.statusBar()

    self.timer = QTimer(self)
    self.timer.timeout.connect(self.timer_callback)

    self.noiseButton.clicked.connect(self.noise_clicked)
    self.recording_noise = False
    self.loadNoise.clicked.connect(self.load_noise)

    self.enrollRecord.clicked.connect(self.start_enroll_record)
    self.stopEnrollRecord.clicked.connect(self.stop_enroll_record)
    self.enrollFile.clicked.connect(self.enroll_file)
    self.enroll.clicked.connect(self.do_enroll)
    self.startTrain.clicked.connect(self.start_train)
    self.dumpBtn.clicked.connect(self.dump)
    self.loadBtn.clicked.connect(self.load)

    self.recoRecord.clicked.connect(self.start_reco_record)
    self.stopRecoRecord.clicked.connect(self.stop_reco_record)
    # self.newReco.clicked.connect(self.new_reco)
    self.recoFile.clicked.connect(self.reco_file)
    self.recoInputFiles.clicked.connect(self.reco_files)

    # UI init
    self.userdata = []
    self.loadUsers()
    self.Userchooser.currentIndexChanged.connect(self.showUserInfo)
    self.ClearInfo.clicked.connect(self.clearUserInfo)
    self.UpdateInfo.clicked.connect(self.updateUserInfo)
    self.UploadImage.clicked.connect(self.upload_avatar)

    # recording animation
    self.movie = QMovie(u"image/recording.gif")
    self.movie.start()
    self.movie.stop()
    self.Animation.setMovie(self.movie)
    self.Animation_2.setMovie(self.movie)
    self.Animation_3.setMovie(self.movie)
    self.aladingpic = QPixmap(u"image/a_hello.png")
    self.Alading.setPixmap(self.aladingpic)
    self.Alading_conv.setPixmap(self.aladingpic)

    # default user image setting
    self.avatarname = "image/nouser.jpg"
    self.defaultimage = QPixmap(self.avatarname)
    self.Userimage.setPixmap(self.defaultimage)
    self.recoUserImage.setPixmap(self.defaultimage)
    self.convUserImage.setPixmap(self.defaultimage)
    self.load_avatar('avatar/')

    # Graph window init
    self.graphwindow = GraphWindow()
    self.newname = ""
    self.lastname = ""
    self.Graph_button.clicked.connect(self.graphwindow.show)

    self.convRecord.clicked.connect(self.start_conv_record)
    self.convStop.clicked.connect(self.stop_conv)

    self.backend = ModelInterface()

    # debug
    QShortcut(QKeySequence("Ctrl+P"), self, self.printDebug)

    # init background noise from a default file if present
    try:
        fs, signal = read_wav("bg.wav")
        self.backend.init_noise(fs, signal)
    except:
        pass
def load_noise(self):
    # Load a background-noise wav and feed it to the backend for noise initialisation.
    fname = QFileDialog.getOpenFileName(self, "Open Data File:", "", "Wav File (*.wav)")
    if fname:
        fs, signal = read_wav(fname)
        self.backend.init_noise(fs, signal)
for g in tqdm(GENRE):
    FILES = glob(DB + '/wav/' + g + '/*.wav')
    label, pred_t1, pred_t2, P_score, ALOTC_score = list(), list(), list(), list(), list()
    for f in FILES:
        f = f.replace('\\', '/')
        # Read the labeled (ground-truth) tempo
        bpm = float(utils.read_tempofile(DB, f))
        label.append(bpm)
        # Compute local onset autocorrelation
        sr, y = utils.read_wav(f)
        hop_length = 512
        onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length, n_fft=2048)
        tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr, hop_length=hop_length)
        # Predict tempo1 (the slower estimate) and tempo2 (the faster one)
        # tempo1, tempo2 = librosa.beat.tempo(onset_envelope=onset_env, sr=sr, hop_length=hop_length)
        tempo1, tempo2 = utils.tempo(onset_envelope=onset_env, sr=sr, hop_length=hop_length)
        pred_t1.append(tempo1)
        pred_t2.append(tempo2)
        # Saliency weights used by the P-score
        s1 = tempo1 / (tempo1 + tempo2)
        s2 = 1.0 - s1
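# The snippet above stops right after computing the saliency weights s1/s2.
# A plausible continuation (not part of the original code) would score each
# prediction with the standard MIREX tempo-estimation metrics, assuming the
# usual 8% tolerance. The helper below is a hedged sketch of that step.
def score_tempo(gt_bpm, tempo1, tempo2, s1, s2, tol=0.08):
    # 1.0 if the estimate is within `tol` (relative) of the ground truth, else 0.0
    t1_ok = 1.0 if abs(gt_bpm - tempo1) / gt_bpm <= tol else 0.0
    t2_ok = 1.0 if abs(gt_bpm - tempo2) / gt_bpm <= tol else 0.0
    p_score = s1 * t1_ok + s2 * t2_ok          # weighted by saliency
    alotc = 1.0 if (t1_ok or t2_ok) else 0.0   # "At Least One Tempo Correct"
    return p_score, alotc

# Possible use inside the loop above:
#   p, a = score_tempo(bpm, tempo1, tempo2, s1, s2)
#   P_score.append(p); ALOTC_score.append(a)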
def open_recording(self, file_path, speakers, side=None, silence_length=2.0):
    """Opens a combined recording and splits it into separate speaker-ear pairs.

    Args:
        file_path: Path to recording file.
        speakers: Sequence of recorded speakers.
        side: Which side (ear) tracks are contained in the file if only one.
              "left", "right" or None for both.
        silence_length: Length of silence used during recording in seconds.

    Returns:
        None
    """
    if self.fs != self.estimator.fs:
        raise ValueError(
            'Refusing to open recording because HRIR\'s sampling rate doesn\'t match impulse response '
            'estimator\'s sampling rate.')
    fs, recording = read_wav(file_path, expand=True)
    if fs != self.fs:
        raise ValueError('Sampling rate of recording must match sampling rate of test signal.')
    if silence_length * self.fs != int(silence_length * self.fs):
        raise ValueError('Silence length must produce full samples with given sampling rate.')
    silence_length = int(silence_length * self.fs)

    # 2 tracks per speaker when side is not specified, only 1 track per speaker when it is
    tracks_k = 2 if side is None else 1

    # Number of speakers in each track
    n_columns = round(len(speakers) / (recording.shape[0] // tracks_k))

    # Crop out initial silence
    recording = recording[:, silence_length:]

    # Split sections in time to columns
    columns = []
    column_size = silence_length + len(self.estimator)
    for i in range(n_columns):
        columns.append(recording[:, i * column_size:(i + 1) * column_size])

    # Split each track by columns
    i = 0
    while i < recording.shape[0]:
        for j, column in enumerate(columns):
            n = int(i // 2 * len(columns) + j)
            speaker = speakers[n]
            if speaker not in SPEAKER_NAMES:
                # Skip non-standard speakers. Useful for skipping the other sweep in center channel recording.
                continue
            if speaker not in self.irs:
                self.irs[speaker] = dict()
            if side is None:
                # Left first, right then
                self.irs[speaker]['left'] = ImpulseResponse(
                    self.estimator.estimate(column[i, :]), self.fs, column[i, :])
                self.irs[speaker]['right'] = ImpulseResponse(
                    self.estimator.estimate(column[i + 1, :]), self.fs, column[i + 1, :])
            else:
                # Only the given side
                self.irs[speaker][side] = ImpulseResponse(
                    self.estimator.estimate(column[i, :]), self.fs, column[i, :])
        i += tracks_k
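# Hedged usage sketch: the surrounding class and its constructor are not shown
# in the snippet above, so `SomeHrir` and the file/speaker names below are
# placeholders for illustration only.
hrir = SomeHrir(estimator)   # any object providing self.fs / self.estimator / self.irs
# A stereo capture containing front-left and front-right sweeps:
hrir.open_recording('recording.wav', speakers=['FL', 'FR'], silence_length=2.0)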
def task_predict(input_files, input_model):
    m = ModelInterface.load(input_model)
    for f in glob.glob(os.path.expanduser(input_files)):
        fs, signal = read_wav(f)
        label, score = m.predict(fs, signal)
        print(f, '->', label, ", score->", score)
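# Minimal invocation sketch; the glob pattern and model path below are
# placeholders, not values from the original code.
if __name__ == '__main__':
    task_predict('~/wavs/*.wav', 'model.out')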
def reverberate_and_mix(out_folder, sources_folder, rir_folder, mix_info,
                        scale_rirs=10.0, part=0, nparts=8, chat=True):
    """Reverberate and mix sources."""
    list_mix = sorted(mix_info.keys())
    list_len = len(list_mix)
    partsize = list_len // nparts
    assert part < nparts
    start = part * partsize
    end = list_len if part == nparts - 1 else (part + 1) * partsize
    if start == end:
        raise ValueError('Not enough mixtures to generate. Part {} of {} to '
                         'generate a total of {} mixtures.'.format(part, nparts, list_len))
    print('Reverberating and mixing from {} to {} out of {}.'.format(start, end, list_len))
    for mix in list_mix[start:end]:
        sources, rirs = mix_info[mix]
        mix_to_data = []
        max_src_len = -1
        if chat:
            print('--\n{} ='.format(mix))
        for source, rir in zip(sources, rirs):
            source_path = os.path.join(sources_folder, source)
            src_data, samplerate_src = read_wav(source_path, always_2d=True)
            rir_path = os.path.join(rir_folder, rir)
            rir_data, samplerate_rir = read_wav(rir_path, always_2d=True)
            assert samplerate_src == samplerate_rir
            # Pick channel 0 of src_data.
            src_data = src_data[:, 0]
            # Pick channel 0 of rirs and scale it.
            rir_data = scale_rirs * rir_data[:, 0]
            rir_len = len(rir_data)
            src_len = len(src_data)
            rir_max = np.max(np.abs(rir_data))
            src_max = np.max(np.abs(src_data))
            max_src_len = np.maximum(src_len, max_src_len)
            if chat:
                print('+ {} [{}, {:1.2f}] * {} [{}, {:1.2f}]'.format(
                    source, src_len, src_max, rir, rir_len, rir_max))
            mix_to_data.append([src_data, rir_data, source, rir])
        mix_rev_sources = []
        rir_paths_used = []
        for data in mix_to_data:
            src_data, rir_data, source_relpath, rir_relpath = data
            rir_paths_used.append(rir_relpath)
            src_len = len(src_data)
            if src_len < max_src_len:
                print('WARNING: original source data has {} samples '
                      'for source file {}, zero padding '
                      'to size {}.'.format(src_len, source_relpath, max_src_len))
                src_data = np.concatenate((src_data, np.zeros(max_src_len - src_len)), axis=0)
            rev_src_data = np.convolve(src_data, rir_data, 'same')
            # Write reverberated source data.
            rev_src_path = os.path.join(out_folder, source_relpath)
            os.makedirs(os.path.dirname(rev_src_path), exist_ok=True)
            write_wav(rev_src_path, rev_src_data, samplerate_src)
            mix_rev_sources.append(rev_src_data)
        mixed_rev_data = np.sum(np.stack(mix_rev_sources, axis=0), axis=0)
        mix_wav_path = os.path.join(out_folder, mix)
        # Strip the '.wav' extension to get the base path for side files.
        mix_wav_base = os.path.splitext(mix_wav_path)[0]
        write_wav(mix_wav_path, mixed_rev_data, samplerate_src)
        in_wav_path = os.path.join(sources_folder, mix)
        in_wav_base = os.path.splitext(in_wav_path)[0]
        if os.path.exists(in_wav_base + '.jams'):
            shutil.copyfile(in_wav_base + '.jams', mix_wav_base + '.jams')
        if os.path.exists(in_wav_base + '.txt'):
            with open(in_wav_base + '.txt', 'r') as f:
                lines = f.readlines()
            with open(mix_wav_base + '.txt', 'w') as f:
                f.write(''.join(lines))
                f.write('\nroom impulse responses used:\n{}'.format('\n'.join(rir_paths_used)))
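# Hedged usage sketch; all paths and the mix_info mapping below are
# hypothetical. mix_info maps a mixture wav (relative path) to a tuple of
# ([source wavs], [RIR wavs]), matching how it is unpacked above.
mix_info = {
    'mix/mix_0001.wav': (['src/s1_0001.wav', 'src/s2_0001.wav'],
                         ['rir/room_a_mic0.wav', 'rir/room_a_mic1.wav']),
}
reverberate_and_mix('out/', 'dry/', 'rirs/', mix_info, part=0, nparts=1)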
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', type=str, required=True, help='Path to HRIR or HeSuVi file.')
    parser.add_argument('--track_order', type=str, required=True,
                        help='Track order in HRIR file. "hesuvi" or "hexadecagonal"')
    parser.add_argument('--reverb', type=str, default=argparse.SUPPRESS,
                        help='Reverberation times for different channels in milliseconds. During this time the '
                             'reverberation tail will be reduced by 100 dB. A comma separated list of channel name '
                             'and reverberation time pairs, separated by a colon. If only a single numeric value is '
                             'given, it is used for all channels. When some channel names are given but not all, '
                             'the missing channels are not affected. Must be at least 3 ms smaller than the HRIR '
                             'length. For example "--reverb=300" or '
                             '"--reverb=FL:500,FC:100,FR:500,SR:700,BR:700,BL:700,SL:700" or '
                             '"--reverb=FC:100".')
    args = parser.parse_args()
    file_path = args.file
    track_order = args.track_order

    reverb = dict()
    try:
        # Single float value used for all channels
        reverb = {ch: float(args.reverb) / 1000 for ch in SPEAKER_NAMES}
    except ValueError:
        # Per-channel values separated by commas
        for ch_t in args.reverb.split(','):
            reverb[ch_t.split(':')[0].upper()] = float(ch_t.split(':')[1]) / 1000

    fs, data = read_wav(file_path)
    for ch, t in reverb.items():
        print(f'{ch}: {t*1000:.0f}ms')
        n_ones = int(fs * 0.003)
        n_win = int(fs * t)
        # Window: flat for 3 ms, then a Hann taper of length t, expressed in dB
        win = np.concatenate([
            np.ones(n_ones),
            signal.windows.hann(n_win * 2)[n_win:],
            np.zeros(data.shape[1] - n_ones - n_win)
        ]) - 1.0
        win *= 100  # 100 dB
        win = 10**(win / 20)  # Linear scale
        if track_order == 'hesuvi':
            tracks = [i for i in range(len(HESUVI_TRACK_ORDER)) if ch in HESUVI_TRACK_ORDER[i]]
        elif track_order == 'hexadecagonal':
            tracks = [i for i in range(len(HEXADECAGONAL_TRACK_ORDER)) if ch in HEXADECAGONAL_TRACK_ORDER[i]]
        else:
            raise ValueError(f'Invalid track_order "{track_order}", allowed values are "hesuvi" and "hexadecagonal"')
        for i in tracks:
            data[i, :] *= win

    # Write WAV
    write_wav(os.path.join(DIR_PATH, 'cropped.wav'), fs, data)
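# Example invocation (hedged: the script filename and wav path are placeholders;
# the flag names and the --reverb format come from the argparse setup above):
#
#   python crop_reverb.py --file=hrir.wav --track_order=hexadecagonal \
#       --reverb=FL:500,FC:100,FR:500,SR:700,BR:700,BL:700,SL:700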
def task_predict(input_files, input_model):
    # Split the input model-directory string into a list of directories
    input_models = [os.path.expanduser(k) for k in input_model.strip().split()]
    # Expand the model files under each directory and chain them into one iterator
    models = itertools.chain(*(glob.glob(m) for m in input_models))
    # Load every model file (skgmm.GMMSet object) into a list
    models = [ModelInterface.load(m) for m in models]
    if len(models) == 0:
        print("No model file found in %s" % input_model)
        sys.exit(1)

    # Counters for accuracy statistics
    right = 0
    right1 = 0
    wrong = 0
    wrong1 = 0
    num = 0

    # Extract features from each test file and match them against every model to get the top results
    for f in glob.glob(os.path.expanduser(input_files)):
        start_time = time.time()
        fs, signal = read_wav(f)
        print(f)
        feat = get_feature(fs, signal)
        predict_result = []
        f_models = [(feat, m) for m in models]
        # Score the file against every model in parallel; each model is a
        # (label, gmm) tuple and get_score returns (label, score).
        pool = ThreadPool(2)
        predict_result = pool.map(get_score, f_models)
        pool.close()
        pool.join()

        proba = GMMSet.softmax([i[1] for i in predict_result])
        predict_result = [(predict_result[i][0], proba[i]) for i in range(len(proba))]
        # Sort predictions by score, highest first
        predict_result = sorted(predict_result, key=operator.itemgetter(1), reverse=True)

        # Label format for the WeChat voice dataset
        label = os.path.basename(f).split('_')[0]
        # Label format for the AISHELL dataset:
        # label = os.path.basename(f)[6:11]

        predict = predict_result[0][0]
        predict_score = predict_result[0][1]
        print("Predict ", time.time() - start_time, " seconds")

        # Top-1 accuracy
        if label in predict:
            right1 += 1
            print('label:', label, ' predict:', predict, ' score:', predict_score, ' top1 right')
        else:
            wrong1 += 1
            print('label:', label, ' predict:', predict, ' score:', predict_score, ' top1 wrong')

        # Top-10 accuracy
        predicts = []
        predict_scores = []
        for pre in predict_result[:10]:
            predicts.append(pre[0])
            predict_scores.append(pre[1])
        if label in predicts:
            right += 1
            print('label:', label, ' predicts:', predicts, ' scores:', predict_scores, ' top10 right')
        else:
            wrong += 1
            print('label:', label, ' predicts:', predicts, ' scores:', predict_scores, ' top10 wrong')

        num += 1
        print('top1:', num, ' right:', right1, ' wrong:', wrong1, ' top1 acc:', right1 / num)
        print('top10:', num, ' right:', right, ' wrong:', wrong, ' top10 acc:', right / num)
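# `get_score` is used with the thread pool above but is not defined in this
# snippet. Based on the commented-out serial version of the scoring loop in the
# original, a minimal sketch (not necessarily the original implementation) is:
def get_score(feat_model):
    feat, model = feat_model              # model is a (label, gmm) tuple
    label, gmm = model[0], model[1]
    return (label, gmm.score(feat))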
def seg_ditail(fid, trained_model, mt_size, mt_step, st_win):
    """Fine-grained segmentation: run VAD on the file, predict a speaker label for each
    short window, merge neighbouring windows with the same label and dump the result to JSON."""
    st_step = st_win
    results = {}
    fs, signal = read_wav(fid)
    [_, st_features] = mt_feature_extraction(signal, fs, mt_size * fs,
                                             mt_step * fs, round(fs * st_win))
    # VAD: find voiced segments
    segments = vad(st_features, st_step, smooth_window=0.5, weight=0)
    i = 0
    delta_t = 0.4
    for seg in segments:
        if seg[1] - seg[0] > 2 * delta_t:
            # Long segment: slide a delta_t window over it and predict a label per window
            start_seg = seg[0]
            end_seg = seg[0] + delta_t
            while start_seg < end_seg:
                label = trained_model.predict(fs, signal[int(start_seg * fs):int(end_seg * fs)])
                print(fid, '--', [start_seg, end_seg], '->', label)
                # (debug) the original contained a commented-out write_wav call here that
                # dumped each window to ../result/result_wav for inspection.
                results[i] = {"label": label, "start": start_seg, "end": end_seg}
                i = i + 1
                start_seg = end_seg
                end_seg = start_seg + delta_t if start_seg + 2 * delta_t < seg[1] else seg[1]
        else:
            label = trained_model.predict(fs, signal[int(seg[0] * fs):int(seg[1] * fs)])
            print(fid, '--', seg, '->', label)
            results[i] = {"label": label, "start": seg[0], "end": seg[1]}
            i = i + 1
            # (debug) more commented-out write_wav calls here dumped the segment and the
            # preceding silence ("静音") to ../result/result_wav.

    # Merge neighbouring windows that share a label and are close in time
    data = {"video_info": {}, "results": []}
    min_duration = 0.5
    start_seg = results[0]["start"]
    end_seg = results[0]["end"]
    label = results[0]["label"]
    for j in range(1, i - 1):
        if results[j]["start"] - end_seg < min_duration \
                and results[j]["label"] == label:
            end_seg = results[j]["end"]
        else:
            if end_seg - start_seg >= 2 * min_duration:
                data["results"].append({"start": start_seg, "end": end_seg,
                                        "speaker_id": label})
                # (debug) commented-out bookkeeping here wrote the merged segment and any
                # "no speech" ("无人声") gap to wav files and a test dict.
            start_seg = results[j]["start"]
            end_seg = results[j]["end"]
            label = results[j]["label"]
    data["results"].append({"start": start_seg, "end": end_seg, "speaker_id": label})
    write_wav(os.path.join(os.path.pardir, "result", "result_wav",
                           os.path.basename(fid)[:-4] + "-" + str(start_seg) + "-" +
                           str(end_seg) + "-" + label + ".wav"),
              fs, signal[int(start_seg * fs):int(end_seg * fs)])
    with open(os.path.join(os.path.pardir, "result", "test_json",
                           os.path.basename(fid)[:-3] + "json"),
              'w', encoding='utf-8') as json_file:
        print("..\\result\\test_json\\" + os.path.basename(fid)[:-3] + "json -> generated")
        json.dump(data, json_file, ensure_ascii=False)
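# Minimal invocation sketch; the model path, wav path and windowing parameters
# below are placeholders, not values from the original code.
if __name__ == '__main__':
    model = ModelInterface.load('model.out')
    seg_ditail('example.wav', model, mt_size=2.0, mt_step=0.2, st_win=0.05)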