def _initialSegmentation(mfcc, wavFile, save_all, wdir):
    """
    Build the initial segmentation of a recording and optionally save it.

    :param mfcc: features handed to ``segmentation.init_seg``
    :param wavFile: audio file name; its last four characters are dropped
        to obtain the show name (presumably a ``.wav`` suffix)
    :param save_all: when true, the result is also written to
        ``<show name>.i.seg`` inside *wdir*
    :param wdir: directory receiving the optional segmentation file
    :return: the diarization returned by ``segmentation.init_seg``
    """
    show_name = wavFile[:-4]
    diarization = segmentation.init_seg(mfcc, show_name)
    if not save_all:
        return diarization
    target = os.path.join(wdir, show_name + '.i.seg')
    Diar.write_seg(target, diarization)
    return diarization
def _linearBic(mfcc, seg_diar, thr_l, wavFile, wdir, save_all):
    """
    Run the linear BIC pass over a segmentation and optionally save it.

    :param mfcc: features handed to ``segmentation.bic_linear``
    :param seg_diar: input diarization
    :param thr_l: threshold forwarded to ``segmentation.bic_linear``
    :param wavFile: audio file name; its last four characters are dropped
        to obtain the base name of the output file
    :param wdir: directory receiving the optional segmentation file
    :param save_all: when true, the result is written to
        ``<base name>.l.seg`` inside *wdir*
    :return: the diarization returned by ``segmentation.bic_linear``
    """
    base_name = wavFile[:-4]
    result = segmentation.bic_linear(mfcc, seg_diar, thr_l, sr=False)
    if save_all:
        Diar.write_seg(os.path.join(wdir, base_name + '.l.seg'), result)
    return result
def _gaussDiverSegmentation(mfcc, wavFile, init_diar, win_size, wdir, save_all):
    """
    Run ``segmentation.segmentation`` on a diarization and optionally save it.

    :param mfcc: features handed to ``segmentation.segmentation``
    :param wavFile: audio file name; its last four characters are dropped
        to obtain the base name of the output file
    :param init_diar: input diarization
    :param win_size: window size forwarded to ``segmentation.segmentation``
    :param wdir: directory receiving the optional segmentation file
    :param save_all: when true, the result is written to
        ``<base name>.s.seg`` inside *wdir*
    :return: the diarization returned by ``segmentation.segmentation``
    """
    stem = wavFile[:-4]
    segmented = segmentation.segmentation(mfcc, init_diar, win_size)
    if not save_all:
        return segmented
    out_path = os.path.join(wdir, stem + '.s.seg')
    Diar.write_seg(out_path, segmented)
    return segmented
def _viterbiDecode(mfcc, bich_diar, thr_vit, wavFile, wdir, save_all):
    """
    Run Viterbi decoding on a diarization and optionally save the result.

    :param mfcc: features handed to ``viterbi.viterbi_decoding``
    :param bich_diar: input diarization
    :param thr_vit: value forwarded as third argument of
        ``viterbi.viterbi_decoding``
    :param wavFile: audio file name; its last four characters are dropped
        to obtain the base name of the output file
    :param wdir: directory receiving the optional segmentation file
    :param save_all: when true, the result is written to
        ``<base name>.d.seg`` inside *wdir*
    :return: the diarization returned by ``viterbi.viterbi_decoding``
    """
    stem = wavFile[:-4]
    decoded = viterbi.viterbi_decoding(mfcc, bich_diar, thr_vit)
    if save_all:
        destination = os.path.join(wdir, stem + '.d.seg')
        Diar.write_seg(destination, decoded)
    return decoded
def _bicAhc(mfcc, bicl_diar, thr_h, wavFile, wdir, save_all):
    """
    Run ``hac_bic.HAC_BIC`` clustering on a diarization and optionally save it.

    :param mfcc: features handed to ``hac_bic.HAC_BIC``
    :param bicl_diar: input diarization
    :param thr_h: threshold forwarded to ``hac_bic.HAC_BIC``
    :param wavFile: audio file name; its last four characters are dropped
        to obtain the base name of the output file
    :param wdir: directory receiving the optional segmentation file
    :param save_all: when true, the result is written to
        ``<base name>.h.seg`` inside *wdir*
    :return: the diarization returned by ``perform(to_the_end=True)``
    """
    stem = wavFile[:-4]
    clusterer = hac_bic.HAC_BIC(mfcc, bicl_diar, thr_h, sr=False)
    clustered = clusterer.perform(to_the_end=True)
    if not save_all:
        return clustered
    Diar.write_seg(os.path.join(wdir, stem + '.h.seg'), clustered)
    return clustered
def create_seg_viterbi(self, cep, segment_dir):
    """
    Apply Viterbi resegmentation to every segmentation file of a directory.

    Each file of *segment_dir* is read as a diarization, decoded with
    ``viterbi.viterbi_decoding`` using ``self.vit_penalty``, and written to
    ``self.results_vit_dir`` under the input file name followed by
    ``.viterbi.<penalty>``.

    :param cep: features handed to ``viterbi.viterbi_decoding``
    :param segment_dir: directory whose entries are all read as
        segmentation files
    """
    for file_name in os.listdir(segment_dir):
        diar = Diar.read_seg(os.path.join(segment_dir, file_name))
        vit_diar = viterbi.viterbi_decoding(cep, diar, self.vit_penalty)
        # Label the output with the penalty actually used for decoding; the
        # suffix used to be a hard-coded -250 regardless of self.vit_penalty.
        out_name = file_name + '.viterbi.{:.2f}'.format(self.vit_penalty)
        Diar.write_seg(os.path.join(self.results_vit_dir, out_name), vit_diar)
def init_seg(cep, show='empty', cluster='init'):
    """
    Return an initial segmentation made of a single segment spanning all
    the features of *cep*.

    :param cep: numpy.ndarray containing MFCC; its first dimension gives
        the stop index of the segment
    :param show: show name stored in the segment
    :param cluster: cluster name stored in the segment
    :return: a Diar object holding that one segment
    """
    diarization = Diar()
    diarization.append(show=show, start=0, stop=cep.shape[0], cluster=cluster)
    return diarization
def create_seg_bic_hac(self, cep, segment_dir):
    """
    Run HAC/BIC clustering over a sweep of thresholds for every
    segmentation file of *segment_dir*.

    The thresholds are ``np.linspace(self.bic_hac_start, self.bic_hac_end,
    self.bic_hac_num)``. Each result is written to ``self.bic_hac_dir`` under
    the input file name followed by ``.bic_value.<threshold>``. A failure on
    one file prints its traceback and processing moves on to the next file.

    :param cep: features handed to ``hac_bic.HAC_BIC``
    :param segment_dir: directory whose entries are all read as
        segmentation files
    """
    for seg_name in os.listdir(segment_dir):
        try:
            source_diar = Diar.read_seg(os.path.join(segment_dir, seg_name))
            sweep = np.linspace(self.bic_hac_start, self.bic_hac_end, self.bic_hac_num)
            for bic_value in sweep:
                clusterer = hac_bic.HAC_BIC(cep, source_diar, bic_value, sr=False)
                clustered = clusterer.perform(to_the_end=True)
                out_name = seg_name + '.bic_value.{:.2f}'.format(bic_value)
                Diar.write_seg(os.path.join(self.bic_hac_dir, out_name), clustered)
        except Exception:
            # Best effort: report the problem and keep going with the next file.
            traceback.print_exc()
def hac_bic(feature_server, diar, threshold, square_root_bic=False):
    """
    Run HAC/BIC clustering show by show and gather the results.

    *diar* is indexed by show; for each show the features are loaded from
    *feature_server*, ``HAC_BIC`` is run to the end on that show's entries,
    and the outcome is added to the returned diarization.

    :param feature_server: object whose ``load(show)`` returns the features
        as first element of its result
    :param diar: diarization providing ``make_index(['show'])``
    :param threshold: value passed to ``HAC_BIC`` as ``alpha``
    :param square_root_bic: value passed to ``HAC_BIC`` as ``sr``
    :return: a Diar object accumulating the per-show results
    """
    per_show = diar.make_index(['show'])
    merged = Diar()
    for show in per_show:
        features, _ = feature_server.load(show)
        clusterer = HAC_BIC(features, per_show[show], alpha=threshold, sr=square_root_bic)
        merged += clusterer.perform(to_the_end=True)
    return merged
def train(self):
    """
    Run the whole experiment: initial segmentation, then the linear BIC,
    HAC/BIC, i-vector HAC and Viterbi sweeps.

    The initial step reads ``self.input_seg``, calls ``pack(50)`` on it,
    writes it to ``self.init_seg``, runs ``segmentation.segmentation`` with
    ``self.win_size`` and writes that result to ``self.gd_seg``.

    If the initial step fails, the traceback and a message are printed and
    the method returns without running the sweeps, because they depend on
    the segmentation produced by that step.
    """
    try:
        init_diar = Diar.read_seg(self.input_seg)
        init_diar.pack(50)
        Diar.write_seg(self.init_seg, init_diar)
        gd_diar = segmentation.segmentation(self.cep, init_diar, self.win_size)
        Diar.write_seg(self.gd_seg, gd_diar)
    except Exception:
        # The original called the non-existent traceback.print_exec(), which
        # itself raised AttributeError and hid the real error.
        traceback.print_exc()
        print("initialziation fault")
        # gd_diar is unbound here, so the sweeps below cannot run.
        return

    # performing experiment
    self.create_seg_bic_linear(self.cep, gd_diar)
    self.create_seg_bic_hac(self.cep, self.linear_bic_dir)
    self.create_seg_iv_AHC(self.bic_hac_dir, self.input_show)
    self.create_seg_viterbi(self.cep, self.hac_iv_dir)
def create_seg_iv_AHC(self, segment_dir, input_show):
    """
    Run i-vector based HAC over a sweep of thresholds for every
    segmentation file of *segment_dir*.

    For each file, i-vectors are trained with ``self.train_ivectors``,
    scored with ``self.score_plda``, and ``hac_iv`` is run for each threshold
    of ``np.linspace(self.t_min, self.t_max, self.t_num)``. Each result is
    written to ``self.hac_iv_dir`` under the input file name followed by
    ``.hac_value.<threshold>``. A failure on one file prints its traceback
    and a message, then processing moves on to the next file.

    :param segment_dir: directory whose entries are all read as
        segmentation files
    :param input_show: value forwarded as last argument of
        ``self.train_ivectors``
    """
    model_iv = ModelIV(self.model_fn)
    for file_name in os.listdir(segment_dir):
        try:
            segment_diar = Diar.read_seg(os.path.join(segment_dir, file_name))
            # Honour the input_show argument; it used to be ignored in favour
            # of self.input_show.
            model = self.train_ivectors(model_iv, self.mfcc_dir, file_name,
                                        segment_diar, input_show)
            scores = self.score_plda(model)
            for hac_value in np.linspace(self.t_min, self.t_max, self.t_num):
                diar_iv, _, _ = hac_iv(segment_diar, scores, threshold=hac_value)
                out_name = file_name + '.hac_value.{:.2f}'.format(hac_value)
                Diar.write_seg(os.path.join(self.hac_iv_dir, out_name), diar_iv)
        except Exception:
            # Best effort: report the problem and keep going with the next file.
            traceback.print_exc()
            print("There is an error over here")
def _split_e(smooth, diarization, split_size):
    """
    Cut the long segments of *diarization* recursively at their points of
    lowest energy so that the resulting segments are shorter than
    *split_size* seconds.

    Every segment is handed to ``_split_seg`` together with the constant 250
    and the segment list of the output diarization, which ``_split_seg`` is
    expected to fill.

    :param smooth: sliding means of the energy (numpy.ndarray)
    :param diarization: a Diarization object
    :param split_size: maximum size of a segment
    :return: a Diar object
    """
    result = Diar()
    for seg in diarization:
        _split_seg(smooth, seg, 250, split_size, result.segments)
    return result
def sanity_check(cep, show, cluster='init'):
    """
    Build a diarization that skips runs of identical MFCC frames in *cep*.

    Consecutive frames are flagged as equal when the sum of their
    per-coefficient differences is zero. Every index where that flag changes
    is a boundary, and segments are created from alternate pairs of
    boundaries, starting with the pair (0, first boundary).

    NOTE(review): because pairing always starts at index 0, a *cep* whose
    first frames are identical appears to keep the constant regions instead
    of the varying ones - confirm against callers.

    :param cep: numpy.ndarray containing MFCC
    :param show: show name stored in each segment
    :param cluster: cluster name stored in each segment
    :return: a Diar object
    """
    table = Diar()

    # True where frame i and frame i + 1 have differences summing to zero.
    frame_deltas = np.diff(cep, axis=0)
    unchanged = np.sum(frame_deltas, axis=1) == 0

    # A flip between neighbouring flags marks a boundary at index + 1.
    flips = unchanged[:-1] ^ unchanged[1:]
    change_points = np.arange(len(flips))[flips] + 1

    # Boundaries: start of signal, every change point, end of signal.
    boundaries = [0]
    boundaries.extend(change_points.tolist())
    boundaries.append(cep.shape[0])

    # Keep every other interval: (b0, b1), (b2, b3), ...
    for begin, end in zip(boundaries[0::2], boundaries[1::2]):
        table.append(show=show, start=begin, stop=end, cluster=cluster)
    return table
def decode(self, table): """ performs a Viterbi decoding of the segment given in diarization :param table: a Diar object :return: a Diar object """ # print(self.transition_probabilities) # print(self.observation) path = numpy.ones((self.nb_features, self.nb_clusters), 'int32') * -1 path[0, :] = numpy.arange(self.nb_clusters) out_diarization = Diar() for row in table: start = row['start'] stop = min(row['stop'], self.nb_features - 1) logging.debug('perform from %d to %d', start, stop) for t in range(start, stop + 1): tmp = self.observation[t - 1, :] + self.transition_probabilities self.observation[t, :] += numpy.max(tmp, axis=1) path[t, :] = numpy.argmax(tmp, axis=1) max_pos = numpy.argmax(self.observation[stop, :]) out_diarization.append(show=self.show, start=stop - 1, stop=stop, cluster=self.cluster_list[max_pos]) for t in range(stop - 1, start, -1): max_pos = path[t, max_pos] cluster = self.cluster_list[max_pos] if (out_diarization[-1]['start'] == t) and (out_diarization[-1]['cluster'] == cluster): out_diarization[-1]['start'] -= 1 else: out_diarization.append(show=self.show, start=t - 1, stop=t, cluster=cluster) out_diarization.sort() # self.observation = None return out_diarization
def segmentation(cep, diarization, win_size=250):
    """
    Split the long segments of *diarization* with ``div_gauss`` and rename
    the clusters of the result.

    A segment whose duration is greater than ``2 * win_size`` is replaced by
    the segments ``div_gauss`` finds in its features; any other segment is
    copied unchanged. Clusters of the output are then renamed ``S0``, ``S1``,
    ... in iteration order.

    :param cep: features of the whole show
    :param diarization: input diarization
    :param win_size: window size passed to ``div_gauss`` as ``win``
    :return: a Diar object
    """
    diarization_out = Diar()
    for segment in diarization:
        length = segment.duration()
        if length > 2 * win_size:
            features = segment.seg_features(cep)
            pieces = div_gauss(features, show=segment['show'], win=win_size,
                               shift=segment['start'])
            diarization_out.append_diar(pieces)
            continue
        # Too short to host two windows: keep the segment as it is.
        diarization_out.append_seg(segment)

    for index, segment in enumerate(diarization_out):
        segment['cluster'] = 'S' + str(index)
    return diarization_out
def div_gauss(cep, show='empty', win=250, shift=0):
    r"""
    Segmentation based on gaussian divergence.

    The segmentation detects the instantaneous change points corresponding to
    segment boundaries. The proposed algorithm is based on the detection of
    local maxima. It detects the change points through a gaussian divergence
    (see equation below), computed using Gaussians with diagonal covariance
    matrices. The left and right gaussians are estimated over a five-second
    window sliding along the whole signal (2.5 seconds for each gaussian,
    given *win* =250 features). A change point, i.e. a segment boundary, is
    present in the middle of the window when the gaussian divergence score
    reaches a local maximum.

    :math:`GD(s_l,s_r)=(\mu_r-\mu_l)^t\Sigma_l^{-1/2}\Sigma_r^{-1/2}(\mu_r-\mu_l)`

    where :math:`s_l` is the left segment modeled by the mean :math:`\mu_l`
    and the diagonal covariance matrix :math:`\Sigma_l`, :math:`s_r` is the
    right segment modeled by the mean :math:`\mu_r` and the diagonal
    covariance matrix :math:`\Sigma_r`.

    When *cep* is too short to hold two full windows, no score can be
    computed and a single segment covering all of *cep* is returned.

    :param cep: numpy array of frames
    :param show: speaker of the show
    :param win: windows size in number of frames
    :param shift: offset added to the start and stop of every segment
    :return: a diarization object (s4d annotation)
    """
    length = cep.shape[0]
    # start and stop of the rolling windows A
    start_a = win - 1  # end of NAN
    stop_a = length - win
    # start and stop of the rolling windows B
    start_b = win + win - 1  # end of nan + delay
    stop_b = length

    # put features in a Pandas DataFrame
    df = pd.DataFrame(cep)
    # compute rolling mean and std in the window of size win, get numpy array
    # mean and std have NAN at the beginning of the output array
    r = df.rolling(window=win, center=False)
    mean = r.mean().values
    std = r.std().values

    # compute GD scores using 2 windows A and B
    dist = (np.square(mean[start_a:stop_a, :] - mean[start_b:stop_b, :]) / (
        std[start_a:stop_a, :] * std[start_b:stop_b, :])).sum(axis=1)

    diarization_out = Diar()
    if dist.size == 0:
        # Fewer frames than two windows: dist[0] below would raise IndexError,
        # so return the input as one segment instead.
        diarization_out.append(show=show, start=shift, stop=shift + length,
                               cluster='S0')
        return diarization_out

    # replace missing value to match cep size
    # (np.pad is the public name of the function formerly reached as np.lib.pad)
    dist_pad = np.pad(dist, (win - 1, win), 'constant',
                      constant_values=(dist[0], dist[-1]))

    # find local maximal at + or - win size
    borders = scipy.signal.argrelmax(dist_pad, order=win)[0].tolist()
    # append the first and last
    borders = [0] + borders + [length]

    # one segment per pair of consecutive borders, clusters named S0, S1, ...
    for spk, (begin, end) in enumerate(zip(borders[:-1], borders[1:])):
        diarization_out.append(show=show, start=shift + begin,
                               stop=shift + end, cluster='S' + str(spk))
    return diarization_out
def pyAudioDiar():
    """
    Diarize the selected recording with two pipelines and plot both results
    next to the reference labels.

    The recording path, the number of speakers and the reference label file
    are read from the module-level objects ``labelFileNameSound``,
    ``labelNumberOfSpeakers`` and ``labelFileNameSegment`` through their
    ``get()`` method. The first pipeline is ``aS.speaker_diarization``; the
    second chains initial segmentation, ``segmentation.segmentation``, linear
    BIC, HAC/BIC and Viterbi decoding. Three stacked plots are shown:
    reference labels, the second pipeline, and the first pipeline.

    The directory ``out/diarizationExample`` is created when missing.
    """
    def time_axis(values):
        # Exactly len(values) evenly spaced instants in [0, duration).
        # np.arange with a fractional step may yield one element more or
        # less than the data and divides by zero when the data is empty.
        return np.linspace(0, duration, len(values), endpoint=False)

    duration, result = aS.speaker_diarization(labelFileNameSound.get(),
                                              int(labelNumberOfSpeakers.get()),
                                              lda_dim=0, plot_res=False)

    show = 'diarizationExample'
    input_show = labelFileNameSound.get()
    input_sad = None
    win_size = 250
    thr_l = 2
    thr_h = 3
    thr_vit = -250

    wdir = os.path.join('out', show)
    if not os.path.exists(wdir):
        os.makedirs(wdir)

    fs = get_feature_server(input_show, feature_server_type='basic')
    cep, _ = fs.load(show)

    # input_sad is always None here, so the initial segmentation is the
    # single segment produced by segmentation.init_seg.
    if input_sad is not None:
        init_diar = Diar.read_seg(input_sad)
        init_diar.pack(50)
    else:
        init_diar = segmentation.init_seg(cep, show)

    seg_diar = segmentation.segmentation(cep, init_diar, win_size)
    bicl_diar = segmentation.bic_linear(cep, seg_diar, thr_l, sr=False)
    bic = hac_bic.HAC_BIC(cep, bicl_diar, thr_h, sr=False)
    bich_diar = bic.perform(to_the_end=True)
    vit_diar = viterbi.viterbi_decoding(cep, bich_diar, thr_vit)

    # Sample the decoded diarization every 20 position units.
    # NOTE(review): assumes row[1] is a cluster name such as 'S3' and that
    # row[3] / row[4] are the positional fields of the segment - confirm.
    resList = []
    currentPosition = 0
    for row in vit_diar:
        speakerValue = int(row[1][1:])
        while currentPosition < (row[3] + row[4]):
            resList.append(speakerValue)
            currentPosition += 20

    # Sample the reference labels every 0.2 units of the 'end' column.
    currentPosition = 0
    realityList = []
    realityFile = pd.read_csv(labelFileNameSegment.get(), delimiter='\t',
                              encoding='utf-8',
                              names=['start', 'end', 'speaker'])
    for index, row in realityFile.iterrows():
        speakerValue = int(row['speaker'][1:])
        while currentPosition < row['end']:
            realityList.append(int(speakerValue))
            currentPosition += 0.2

    plot.subplot(3, 1, 2)
    plot.title("s4d:")
    plot.plot(time_axis(resList), resList, 'ro')
    plot.subplot(3, 1, 1)
    plot.title("Реальность:")
    plot.plot(time_axis(realityList), realityList, 'bo')
    plot.subplot(3, 1, 3)
    plot.title("pyPlotAudio:")
    plot.plot(time_axis(result), result, 'go')
    plot.show()
# Settings and file locations for the PLDA stage.
# rank_tv and nb_gauss are expected to be defined earlier in the script.
plda_seg_fn = './data/seg/train.plda.seg'
rank_plda = 150
it_max_plda = 10
mfcc_plda_fn = './data/mfcc/norm_plda.h5'
plda_idmap_fn = './data/mfcc/plda_idmap.h5'
plda_fn = './data/model/plda_'+str(rank_tv)+'_'+str(rank_plda)+'.h5'
# File locations for the normalisation data and the final model; the model
# file name embeds nb_gauss, rank_tv and rank_plda.
norm_stat_fn = './data/model/norm.stat.h5'
norm_fn = './data/model/norm.h5'
norm_iv_fn = './data/model/norm.iv.h5'
matrices_fn = './data/model/matrices.h5'
model_fn = './data/model/ester_model_{}_{}_{}.h5'.format(nb_gauss, rank_tv, rank_plda)

# UBM stage: read the UBM segmentation, extract and save the features of its
# speakers, write the resulting id map, then train and save the mixture.
# ubm_seg_fn, audio_dir, mfcc_ubm_fn, ubm_idmap_fn, ubm_fn and num_thread are
# expected to be defined earlier in the script.
logging.info('Computing MFCC for UBM')
diar_ubm = Diar.read_seg(ubm_seg_fn, normalize_cluster=True)
fe = get_feature_extractor(audio_dir, 'sid')
ubm_idmap = fe.save_multispeakers(diar_ubm.id_map(), output_feature_filename=mfcc_ubm_fn, keep_all=False)
ubm_idmap.write_txt(ubm_idmap_fn)
# The features just written are read back through a feature server.
fs = get_feature_server(mfcc_ubm_fn, 'sid')
spk_lst = ubm_idmap.rightids
ubm = Mixture()
# presumably the iterations tuple gives the EM iteration count per split
# step - TODO confirm against the Mixture.EM_split documentation
ubm.EM_split(fs, spk_lst, nb_gauss, iterations=(1, 2, 2, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8), num_thread=num_thread, llk_gain=0.01)
ubm.write(ubm_fn, prefix='ubm/')

# TV stage: starts by reading its segmentation; tv_seg_fn is expected to be
# defined earlier in the script.
logging.info('Computing MFCC for TV')
diar_tv = Diar.read_seg(tv_seg_fn, normalize_cluster=True)
def create_seg_bic_linear(self, cep, diar):
    """
    Run the linear BIC pass over a sweep of thresholds and save each result.

    The thresholds are ``np.linspace(self.li_bic_p_start,
    self.li_bic_p_stop, self.li_bic_p_num)``. The output path of each run is
    obtained by formatting ``self.linear_bic_seg`` with the threshold.

    :param cep: features handed to ``segmentation.bic_linear``
    :param diar: input diarization
    """
    sweep = np.linspace(self.li_bic_p_start, self.li_bic_p_stop, self.li_bic_p_num)
    for threshold in sweep:
        result = segmentation.bic_linear(cep, diar, threshold, sr=False)
        out_path = self.linear_bic_seg.format(threshold)
        Diar.write_seg(out_path, result)