def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = arguments.parse_arguments()
    model_args.observation_dim = 512
    diarization_Model = UISRNN(model_args)
    diarization_Model.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = diarization_Model.predict(feats, inference_args)

    # One embedding every time_spec_rate ms; with the defaults
    # (embedding_per_second=1.0, overlap_rate=0.5) that is 500 ms per step.
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # Map times back onto the original wav, which still contains the
    # silence that load_data stripped out.
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            e = fmtTime(timeDict['stop'])
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, pick=True, size=(25, 6))
    p.draw()
    p.plot.show()
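# For reference, a minimal sketch of what arrangeResult() is expected to do,
# assuming predicted_label is a flat sequence of per-window speaker ids and
# time_spec_rate is the window hop in milliseconds. This is an illustrative
# reconstruction, not the repository's actual implementation: consecutive
# windows with the same label are merged into {spk: [{'start', 'stop'}]}
# slices on the (silence-stripped) embedding timeline.
def arrange_result_sketch(predicted_label, time_spec_rate):
    speaker_slice = {}
    if len(predicted_label) == 0:
        return speaker_slice
    run_start = 0
    for i in range(1, len(predicted_label) + 1):
        if i == len(predicted_label) or predicted_label[i] != predicted_label[run_start]:
            spk = predicted_label[run_start]
            speaker_slice.setdefault(spk, []).append({
                'start': int(run_start * time_spec_rate),
                'stop': int(i * time_spec_rate),
            })
            run_start = i
    return speaker_slice

# Example: labels [0, 0, 1, 1, 0] at 500 ms per window become
# {0: [{'start': 0, 'stop': 1000}, {'start': 2000, 'stop': 2500}],
#  1: [{'start': 1000, 'stop': 2000}]}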
def runDiarization(showName, config):
    wav_path = config['PATH']['audio'] + showName + '.wav'
    t0 = time.time()
    print('showName\t\t', showName)

    print('Extracting features')
    if config.getint('GENERAL', 'performFeatureExtraction'):
        allData = extractFeatures(
            config['PATH']['audio'] + showName + config['EXTENSION']['audio'],
            config.getfloat('FEATURES', 'framelength'),
            config.getfloat('FEATURES', 'frameshift'),
            config.getint('FEATURES', 'nfilters'),
            config.getint('FEATURES', 'ncoeff'))
    else:
        allData = getFeatures(config['PATH']['features'] + showName +
                              config['EXTENSION']['features'])

    nFeatures = allData.shape[0]
    print('shape of features:', allData.shape)
    print('Initial number of features\t', nFeatures)
    t1 = time.time()
    feature_t = t1 - t0
    print('Time used for extracting features:', feature_t)

    if os.path.isfile(config['PATH']['UEM'] + showName +
                      config['EXTENSION']['UEM']):
        maskUEM = readUEMfile(config['PATH']['UEM'], showName,
                              config['EXTENSION']['UEM'], nFeatures,
                              config.getfloat('FEATURES', 'frameshift'))
    else:
        print('UEM file does not exist. The complete audio content is considered.')
        maskUEM = np.ones([1, nFeatures])

    if os.path.isfile(config['PATH']['SAD'] + showName +
                      config['EXTENSION']['SAD']) and not config.getint('GENERAL', 'performVAD'):
        maskSAD = readSADfile(config['PATH']['SAD'], showName,
                              config['EXTENSION']['SAD'], nFeatures,
                              config.getfloat('FEATURES', 'frameshift'),
                              config['GENERAL']['SADformat'])
    else:
        print('SAD file does not exist or automatic VAD is enabled in the '
              'config. VAD is applied and saved at %s.\n' %
              (config['PATH']['SAD'] + showName + '.lab'))
        maskSAD = getSADfile(config, showName, nFeatures)

    t2 = time.time()
    SAD_t = t2 - t1
    print('Time used for SAD:', SAD_t)
    print('shape of SAD mask', maskSAD.shape)

    mask = np.logical_and(maskUEM, maskSAD)
    mask = mask[0][0:nFeatures]
    nSpeechFeatures = np.sum(mask)
    speechMapping = np.zeros(nFeatures)
    # The mapping must run from 1 to the actual number of speech features,
    # independently of the indexing style, so that no features are lost
    # along the way.
    speechMapping[np.nonzero(mask)] = np.arange(1, nSpeechFeatures + 1)
    data = allData[np.where(mask == 1)]
    del allData

    segmentTable = getSegmentTable(mask, speechMapping,
                                   config.getint('SEGMENT', 'length'),
                                   config.getint('SEGMENT', 'increment'),
                                   config.getint('SEGMENT', 'rate'))
    numberOfSegments = np.size(segmentTable, 0)
    print('Number of speech features\t', nSpeechFeatures)
    print('Number of segments\t', numberOfSegments)
    print(segmentTable[0])

    # Create the KBM.
    print('Training the KBM...')
    # Set the window rate so that "minimumNumberOfInitialGaussians" Gaussians
    # are obtained.
    if np.floor((nSpeechFeatures - config.getint('KBM', 'windowLength')) /
                config.getint('KBM', 'minimumNumberOfInitialGaussians')) < \
            config.getint('KBM', 'maximumKBMWindowRate'):
        windowRate = int(
            np.floor((np.size(data, 0) - config.getint('KBM', 'windowLength')) /
                     config.getint('KBM', 'minimumNumberOfInitialGaussians')))
    else:
        windowRate = int(config.getint('KBM', 'maximumKBMWindowRate'))
    print('KBM window rate:', windowRate)

    poolSize = np.floor(
        (nSpeechFeatures - config.getint('KBM', 'windowLength')) / windowRate)
    if config.getint('KBM', 'useRelativeKBMsize'):
        kbmSize = int(np.floor(poolSize * config.getfloat('KBM', 'relKBMsize')))
    else:
        kbmSize = int(config.getint('KBM', 'kbmSize'))
    print('Training pool of', int(poolSize), 'gaussians with a rate of',
          int(windowRate), 'frames')
    kbm, gmPool = trainKBM(data, config.getint('KBM', 'windowLength'),
                           windowRate, kbmSize)
    print('Selected', kbmSize, 'gaussians from the pool')

    Vg = getVgMatrix(data, gmPool, kbm,
                     config.getint('BINARY_KEY', 'topGaussiansPerFrame'))
    print(Vg[0])
    print('Vg shape:', Vg.shape)
    t3 = time.time()
    KBM_t = t3 - t2
    print('Time used for training KBM:', KBM_t)

    print('Computing binary keys for all segments...')
    segmentBKTable, segmentCVTable = getSegmentBKs(
        segmentTable, kbmSize, Vg,
        config.getfloat('BINARY_KEY', 'bitsPerSegmentFactor'), speechMapping)
    print(segmentBKTable.shape)
    print(segmentCVTable.shape)
    t4 = time.time()
    BKCV_t = t4 - t3
    print('Time used to compute BK and CV:', BKCV_t)

    print('Performing initial clustering...')
    initialClustering = np.digitize(
        np.arange(numberOfSegments),
        np.arange(0, numberOfSegments,
                  numberOfSegments / config.getint('CLUSTERING', 'N_init')))
    print('initial clustering:', initialClustering.size)
    # print('initial clustering:', initialClustering)
    print('done')

    print('Performing agglomerative clustering...')
    if config.getint('CLUSTERING', 'linkage'):
        finalClusteringTable, k = performClusteringLinkage(
            segmentBKTable, segmentCVTable,
            config.getint('CLUSTERING', 'N_init'),
            config['CLUSTERING']['linkageCriterion'],
            config['CLUSTERING']['metric'])
    else:
        finalClusteringTable, k = performClustering(
            speechMapping, segmentTable, segmentBKTable, segmentCVTable, Vg,
            config.getfloat('BINARY_KEY', 'bitsPerSegmentFactor'), kbmSize,
            config.getint('CLUSTERING', 'N_init'), initialClustering,
            config['CLUSTERING']['metric'])
    t5 = time.time()
    clustering_t = t5 - t4
    print('Time used for clustering:', clustering_t)

    print('Selecting best clustering...')
    if config['CLUSTERING_SELECTION']['bestClusteringCriterion'] == 'elbow':
        bestClusteringID = getBestClustering(
            config['CLUSTERING_SELECTION']['metric_clusteringSelection'],
            segmentBKTable, segmentCVTable, finalClusteringTable, k)
    elif config['CLUSTERING_SELECTION']['bestClusteringCriterion'] == 'spectral':
        bestClusteringID = getSpectralClustering(
            config['CLUSTERING_SELECTION']['metric_clusteringSelection'],
            finalClusteringTable, config.getint('CLUSTERING', 'N_init'),
            segmentBKTable, segmentCVTable, k,
            config.getint('CLUSTERING_SELECTION', 'sigma'),
            config.getint('CLUSTERING_SELECTION', 'percentile'),
            config.getint('CLUSTERING_SELECTION', 'maxNrSpeakers')) + 1
    print('Best clustering:\t', bestClusteringID.astype(int))
    print('Number of clusters:\t',
          np.size(np.unique(finalClusteringTable[:, bestClusteringID.astype(int) - 1]), 0))
    print(np.unique(finalClusteringTable))
    print(finalClusteringTable.shape)
    print(np.unique(finalClusteringTable[:, bestClusteringID.astype(int) - 1]))
    t6 = time.time()
    best_clustering_t = t6 - t5
    print('Time used for best clustering:', best_clustering_t)

    final_clustering = finalClusteringTable[:, bestClusteringID.astype(int) - 1]

    if config.getint('RESEGMENTATION', 'resegmentation') and np.size(
            np.unique(final_clustering), 0) > 1:
        print(final_clustering.shape)
        print(final_clustering)
        print(segmentTable.shape)
        print('Performing GMM-ML resegmentation...')
        finalClusteringTableResegmentation, finalSegmentTable = performResegmentation(
            data, speechMapping, mask, final_clustering, segmentTable,
            config.getint('RESEGMENTATION', 'modelSize'),
            config.getint('RESEGMENTATION', 'nbIter'),
            config.getint('RESEGMENTATION', 'smoothWin'), nSpeechFeatures)
        print(finalClusteringTableResegmentation.shape)
        print(finalClusteringTableResegmentation)
        print(finalSegmentTable.shape)
        print('done')
        t7 = time.time()
        reseg_t = t7 - t6
        print('Time used for resegmentation:', reseg_t)
        tu = t7 - t0
        print('Total time used:', tu)

        getSegmentationFile(config['OUTPUT']['format'],
                            config.getfloat('FEATURES', 'frameshift'),
                            finalSegmentTable,
                            np.squeeze(finalClusteringTableResegmentation),
                            showName, config['EXPERIMENT']['name'],
                            config['PATH']['output'],
                            config['EXTENSION']['output'])

        t_load = time.time()
        y, sr = librosa.load(wav_path, sr=None)
        audio_duration = librosa.get_duration(y=y, sr=sr)
        print('load data:', time.time() - t_load)
        print('audio duration:', audio_duration)
        print('real-time factor:', tu / audio_duration)
        print(wav_path)
        print(feature_t, SAD_t, KBM_t, clustering_t, reseg_t, tu)
        speakerSlice = getSegResultForPlot(
            config.getfloat('FEATURES', 'frameshift'), finalSegmentTable,
            np.squeeze(finalClusteringTableResegmentation))
    else:
        clustering = rearrangeClusterID(final_clustering)
        speakerSlice = getSegResultForPlot(
            config.getfloat('FEATURES', 'frameshift'), segmentTable, clustering)

    p = PlotDiar(map=speakerSlice, wav=wav_path,
                 title='Binary key diarization: ' + wav_path +
                       ', number of speakers: ' + str(len(speakerSlice)),
                 gui=True, pick=True, size=(25, 6))
    wm = p.plot.get_current_fig_manager()
    wm.window.state('zoomed')  # maximize the window (Tk backend on Windows)
    p.draw()
    p.plot.show()

    if config.getint('OUTPUT', 'returnAllPartialSolutions'):
        if not os.path.isdir(config['PATH']['output']):
            os.mkdir(config['PATH']['output'])
        outputPathInd = (config['PATH']['output'] +
                         config['EXPERIMENT']['name'] + '/' + showName + '/')
        if not os.path.isdir(config['PATH']['output'] + config['EXPERIMENT']['name']):
            os.mkdir(config['PATH']['output'] + config['EXPERIMENT']['name'])
        if not os.path.isdir(outputPathInd):
            os.mkdir(outputPathInd)
        for i in np.arange(k):
            getSegmentationFile(
                config['OUTPUT']['format'],
                config.getfloat('FEATURES', 'frameshift'), segmentTable,
                finalClusteringTable[:, i], showName,
                showName + '_' +
                str(np.size(np.unique(finalClusteringTable[:, i]), 0)) + '_spk',
                outputPathInd, config['EXTENSION']['output'])

    print('\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
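# A minimal usage sketch for runDiarization(), assuming an INI file laid out
# with the sections the function reads above (PATH, EXTENSION, GENERAL,
# FEATURES, SEGMENT, KBM, BINARY_KEY, CLUSTERING, CLUSTERING_SELECTION,
# RESEGMENTATION, OUTPUT, EXPERIMENT). 'config.ini' and the show name are
# placeholders; adjust both to your data layout.
import configparser

config = configparser.ConfigParser()
config.read('config.ini')
runDiarization('myShow', config)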
from viewer import PlotDiar

speakerSlice = {
    '1': [{'start': 175 * 100, 'stop': 200}, {'start': 30, 'stop': 120}],
    '2': [{'start': 90, 'stop': 130 * 1000}],
}
p = PlotDiar(map=speakerSlice, wav=r'example.wav', gui=True, size=(25, 6))
p.draw()
p.plot.show()
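# If your segment times are in seconds, a small helper like the one below can
# convert them into the millisecond dicts PlotDiar consumes. The millisecond
# convention is inferred from the mains in this repo, where fmtTime() formats
# millisecond values; treat it as an assumption if your viewer differs.
def slices_from_seconds(segments):
    """segments: {speaker: [(start_s, stop_s), ...]} -> PlotDiar map in ms."""
    return {
        spk: [{'start': int(s * 1000), 'stop': int(e * 1000)} for s, e in spans]
        for spk, spans in segments.items()
    }

# Example: slices_from_seconds({'1': [(0.0, 1.75)]})
# -> {'1': [{'start': 0, 'stop': 1750}]}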
def main(wav_path, embedding_per_second=1.0, n_classes=5994,
         overlap_rate=0.5, plot_results=True):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': n_classes,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)  # speaker embedding every ?ms
    print('time_spec_rate', time_spec_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)

    # Map times back onto the original wav, which still contains the
    # silence that load_data stripped out.
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            print('>>>>>', i, s, e, tid, spk)
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assignments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + speaker + ' =========')
        for timeDict in timeDicts:
            start = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            end = fmtTime(timeDict['stop'])
            print(start + ' ==> ' + end)
            speaker_assignments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assignments, time_spec_rate
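# A hedged usage sketch for this variant of main(): run it headless and dump
# the (start, end, speaker, wav_path) tuples it returns. 'example.wav' and
# 'segments.txt' are placeholders; start/end are already formatted strings,
# since fmtTime() is applied before the tuples are appended.
if __name__ == '__main__':
    feats, labels, intervals, assignments, hop_ms = main(
        'example.wav', embedding_per_second=1.0, overlap_rate=0.5,
        plot_results=False)
    with open('segments.txt', 'w') as f:
        for start, end, speaker, wav in assignments:
            f.write(f'{wav}\t{speaker}\t{start}\t{end}\n')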
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    # Earlier experiments, left commented out in the original source, (a)
    # appended embeddings from a second wav (wavs/REC20190716140159.wav),
    # (b) chose the number of clusters by running KMeans for k in [2, 5)
    # and keeping the best silhouette_score over repeated trials, and
    # (c) visualized KMeans clusters with TSNEVisualizer. The UIS-RNN
    # prediction path was also disabled here:
    # predicted_label = uisrnnModel.predict(feats, inference_args)
    # Spectral clustering on the embeddings is used instead.
    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.95,
                                  gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)

    global no_speakers
    no_speakers = len(set(predicted_label))

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # Map times back onto the original wav, which still contains the
    # silence that load_data stripped out.
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            e = fmtTime(timeDict['stop'])
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
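# fmtTime() is used by every variant above but not shown. A plausible minimal
# sketch, assuming it formats a millisecond timestamp as 'H:MM:SS.mmm'; treat
# the exact output format as an assumption, not the repository's definition.
def fmt_time_sketch(ms):
    h, rem = divmod(int(ms), 3600 * 1000)
    m, rem = divmod(rem, 60 * 1000)
    s, ms_part = divmod(rem, 1000)
    return f'{h}:{m:02d}:{s:02d}.{ms_part:03d}'

# Example: fmt_time_sketch(135500) -> '0:02:15.500'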
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5,
         retain_audio_clip=False):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # Map times back onto the original wav, which still contains the
    # silence that load_data stripped out.
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        for timeDict in timeDicts:
            get_transcript(str(spk), timeDict['start'], timeDict['stop'])

    result = print_transcipt()  # (sic) helper defined elsewhere in this repo
    try:
        # Open the transcript once; the original re-opened the file inside
        # the loop and could hit a NameError in the finally clause if open()
        # itself failed. A trailing newline keeps segments on separate lines.
        with open(os.path.join(dir_name, 'FinalTranscript.txt'), 'a') as file:
            for item in result:
                start = fmtTime(item[1])
                end = fmtTime(item[2])
                transcription = f'{start} ==> {end}: [Speaker : {item[0]}] : {item[3]}'
                print(transcription)
                file.write(transcription + '\n')
    except Exception as exp:
        print(f'Failed in main() while writing to file with exception {exp}')

    if not retain_audio_clip:
        shutil.rmtree(dir_name)
    else:
        print(f'Audio files of transcriptions can be found in {dir_name} folder')

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
    return result
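# Hedged usage sketch for the transcription variant: keep the per-segment
# audio clips for inspection and print the returned rows. 'example.wav' is a
# placeholder; get_transcript(), print_transcipt() and dir_name are assumed
# to be defined elsewhere in this repo, and each row is assumed to be
# (speaker, start_ms, end_ms, text), matching how the writing loop above
# indexes item[0]..item[3].
if __name__ == '__main__':
    result = main('example.wav', retain_audio_clip=True)
    for speaker, start_ms, end_ms, text in result:
        print(speaker, start_ms, end_ms, text)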