def main(wavfile, destfile, win_size, hop_size, nfbank, zoom, eps):
    """Extract filter-bank GCC-PHAT features from a wav file and save them.

    The signal is read from ``wavfile``, transformed with ``apkit.stft``,
    trimmed to the frequency bins below ``_FREQ_MAX``, converted to pairwise
    GCC-PHAT coefficients on filter banks, quantized to 16-bit integers and
    written to ``destfile`` with ``np.save``.

    Args:
        wavfile: path of the input audio file (passed to ``apkit.load_wav``).
        destfile: destination passed to ``np.save``.
        win_size: STFT window size passed to ``apkit.stft``.
        hop_size: STFT hop size passed to ``apkit.stft``.
        nfbank: number of filter banks requested from
            ``apkit.mel_freq_fbank_weight``.
        zoom: forwarded to ``apkit.gcc_phat_fbanks``.
        eps: forwarded to ``apkit.gcc_phat_fbanks``.
    """
    # load signal
    fs, sig = apkit.load_wav(wavfile)
    tf = apkit.stft(sig, apkit.cola_hamming, win_size, hop_size)
    nch = tf.shape[0]

    # trim freq bins
    # The bin count is used as a slice bound, so it must be an integer;
    # plain `/` yields a float on Python 3.
    nfbin = int(_FREQ_MAX * win_size // fs)  # 0-8kHz
    freq = np.fft.fftfreq(win_size)[:nfbin]
    tf = tf[:, :, :nfbin]

    # compute pairwise gcc on f-banks
    ecov = apkit.empirical_cov_mat(tf, fw=1, tw=1)
    fbw = apkit.mel_freq_fbank_weight(nfbank, freq, fs, fmax=_FREQ_MAX,
                                      fmin=_FREQ_MIN)
    fbcc = apkit.gcc_phat_fbanks(ecov, fbw, zoom, freq, eps=eps)

    # merge to a single numpy array, indexed by 'tpbd'
    #                                   (time, pair, bank, delay)
    # `range` works on both Python 2 and 3; `xrange` exists only on Python 2.
    feature = np.asarray(
        [fbcc[(i, j)] for i in range(nch) for j in range(i + 1, nch)])
    feature = np.moveaxis(feature, 2, 0)

    # and map [-1.0, 1.0] to 16-bit integer, to save storage space
    dtype = np.int16
    vmax = np.iinfo(dtype).max
    feature = (feature * vmax).astype(dtype)

    np.save(destfile, feature)
def __call__(self, fs, sig):
    """Return the real and imaginary STFT parts of ``sig`` as float32.

    The STFT is computed with ``apkit.stft`` using ``self.win_size`` and
    ``self.hop_size``, restricted to the frequency bins between
    ``self.min_freq`` and ``self.max_freq``, and the real and imaginary
    parts are concatenated along axis 0.

    Args:
        fs: sample rate, used to convert frequencies to bin indices.
        sig: signal passed to ``apkit.stft``.

    Returns:
        ``np.float32`` array holding ``tf.real`` followed by ``tf.imag``
        along axis 0.
    """
    tf = apkit.stft(sig, apkit.cola_hamming, self.win_size, self.hop_size,
                    last_sample=True)

    # Slice bounds must be integers; plain `/` yields floats on Python 3,
    # which makes the slicing below raise. Floor to bin indices instead.
    min_fbin = int(self.min_freq * self.win_size // fs)
    if self.max_freq >= 0:
        max_fbin = int(self.max_freq * self.win_size // fs)
    else:
        # a negative max_freq means "no upper limit": keep up to win_size / 2
        max_fbin = int(self.win_size // 2)

    tf = tf[:, :, min_fbin:max_fbin]
    feat = np.concatenate((tf.real, tf.imag), axis=0)
    return feat.astype(np.float32, copy=False)
def get_fbanks_gcc(signal, fs, win_size=1024, hop_size=512, nfbank=50,
                   zoom=25, eps=1e-8):
    """Compute 16-bit quantized filter-bank GCC-PHAT features for a signal.

    Args:
        signal: multi-channel signal passed to ``apkit.stft``.
        fs: sample rate, used to derive the number of frequency bins kept.
        win_size: STFT window size passed to ``apkit.stft``.
        hop_size: STFT hop size passed to ``apkit.stft``.
        nfbank: number of filter banks requested from
            ``apkit.mel_freq_fbank_weight``.
        zoom: forwarded to ``apkit.gcc_phat_fbanks``.
        eps: forwarded to ``apkit.gcc_phat_fbanks``.

    Returns:
        ``np.int16`` array of the features scaled by the int16 maximum.
    """
    freq_max = 8000
    freq_min = 100

    spectrum = apkit.stft(signal, apkit.cola_hamming, win_size, hop_size)
    nch, _, _ = spectrum.shape

    # keep only the bins below freq_max
    nfbin = int(freq_max * win_size / fs)
    freq = np.fft.fftfreq(win_size)[:nfbin]
    spectrum = spectrum[:, :, :nfbin]

    # pairwise GCC on filter banks
    ecov = apkit.empirical_cov_mat(spectrum, fw=1, tw=1)
    weights = apkit.mel_freq_fbank_weight(nfbank, freq, fs, fmax=freq_max,
                                          fmin=freq_min)
    fbcc = apkit.gcc_phat_fbanks(ecov, weights, zoom, freq, eps=eps)

    # stack every channel pair (i < j) into one array
    pairs = [fbcc[(i, j)] for i in range(nch) for j in range(i + 1, nch)]
    feature = np.asarray(pairs)

    # Squeezing axis 0 only succeeds when there is exactly one pair.
    # NOTE(review): the original comment describes a (time, pair, bank,
    # delay) layout, but the pair axis is removed before the axis move --
    # confirm the resulting layout against the callers.
    feature = np.squeeze(feature, axis=0)
    feature = np.moveaxis(feature, 2, 0)

    # map [-1.0, 1.0] to 16-bit integers to save storage space
    scale = np.iinfo(np.int16).max
    return (feature * scale).astype(np.int16)
def load_cpsd(afile, win_size, hop_size):
    """Load an audio file and return ``apkit.pairwise_cpsd`` of its STFT.

    Args:
        afile: path of the audio file passed to ``apkit.load_wav``.
        win_size: STFT window size passed to ``apkit.stft``.
        hop_size: STFT hop size passed to ``apkit.stft``.
    """
    # the sample rate returned by load_wav is not needed here
    _, samples = apkit.load_wav(afile)
    spectrum = apkit.stft(samples, apkit.cola_hamming, win_size, hop_size)
    return apkit.pairwise_cpsd(spectrum)
def main(infile, outdir, afunc, win_size, hop_size, block_size, block_hop,
         min_sc):
    """Compute an angular spectrum per block and write one file per block.

    Progress timings are printed to stderr after each stage.

    Args:
        infile: path of the input audio file (passed to ``apkit.load_wav``).
        outdir: directory in which the per-block result files are written.
        afunc: local angular spectrum function, called as
            ``afunc(ecov, delays, fbins)``.
        win_size: STFT window size passed to ``apkit.stft``.
        hop_size: STFT hop size passed to ``apkit.stft``.
        block_size: block size for the empirical covariance matrix, or
            ``None`` to use ``apkit.empirical_cov_mat`` without blocks.
        block_hop: block hop, used only when ``block_size`` is not ``None``.
        min_sc: threshold passed as ``th_phi`` to ``apkit.local_maxima``.
    """
    stime = time.time()

    # load candidate DOAs
    pts = apkit.load_pts_on_sphere()
    pts = pts[pts[:, 2] > -0.05]  # use upper half of the sphere
    # NOTE: alternatively use only points on the horizontal plane
    # pts = apkit.load_pts_horizontal(360)
    print('%.3fs: load points (%d)' % (time.time() - stime, len(pts)),
          file=sys.stderr)

    # compute neighbors (for peak finding)
    nlist = apkit.neighbor_list(pts, math.pi / 180.0 * 8.0)
    print('%.3fs: neighbor list' % (time.time() - stime), file=sys.stderr)

    # load signal
    fs, sig = apkit.load_wav(infile)
    print('%.3fs: load signal' % (time.time() - stime), file=sys.stderr)

    # compute delays (delay for each candidate DOA and each microphone)
    delays = apkit.compute_delay(_MICROPHONE_COORDINATES, pts, fs=fs)
    print('%.3fs: compute delays' % (time.time() - stime), file=sys.stderr)

    # compute empirical covariance matrix
    tf = apkit.stft(sig, apkit.cola_hamming, win_size, hop_size)
    max_fbin = _MAX_FREQ * win_size // fs  # int
    assert max_fbin <= win_size // 2
    tf = tf[:, :, :max_fbin]  # 0-8kHz
    fbins = np.arange(max_fbin, dtype=float) / win_size
    if block_size is None:
        ecov = apkit.empirical_cov_mat(tf)
    else:
        ecov = apkit.empirical_cov_mat_by_block(tf, block_size, block_hop)
    _, _, nblock, nfbin = ecov.shape
    print('%.3fs: empirical cov matrix (nfbin=%d)' %
          (time.time() - stime, nfbin), file=sys.stderr)

    # local angular spectrum function
    phi = afunc(ecov, delays, fbins)
    print('%.3fs: compute phi' % (time.time() - stime), file=sys.stderr)

    # find local maxima
    lmax = apkit.local_maxima(phi, nlist, th_phi=min_sc)
    print('%.3fs: find local maxima' % (time.time() - stime),
          file=sys.stderr)

    # merge predictions that have similar azimuth predictions
    # NOTE: skip this step if the candidate DOAs are on the horizontal plane
    lmax = apkit.merge_lm_on_azimuth(phi, lmax, pts, math.pi / 180.0 * 5.0)
    print('%.3fs: refine local maxima' % (time.time() - stime),
          file=sys.stderr)

    # save results
    # each file contains the predicted angular spectrum for each frame/block
    # each line has five tokens:
    #   (1) x coordinate of the candidate DOA
    #   (2) y coordinate of the candidate DOA
    #   (3) z coordinate of the candidate DOA
    #   (4) angular spectrum value
    #   (5) 1 if this is a local maximum, otherwise 0
    for t in range(nblock):
        # Build the membership set once per block so the per-point test
        # below does not rescan the whole list of maxima every time.
        peaks = set(lmax[t])
        with open(f'{outdir}/{t:06d}', 'w') as f:
            for i in range(len(pts)):
                print('%g %g %g %g %d' %
                      (pts[i, 0], pts[i, 1], pts[i, 2], phi[i, t],
                       1 if i in peaks else 0),
                      file=f)
    print('%.3fs: save results' % (time.time() - stime), file=sys.stderr)
def load_ncov(path, win_size, hop_size):
    """Load an audio file and return ``apkit.cov_matrix`` of its STFT.

    Only the frequency bins below ``_MAX_FREQ`` are kept before the
    covariance matrix is computed.

    Args:
        path: path of the audio file passed to ``apkit.load_wav``.
        win_size: STFT window size passed to ``apkit.stft``.
        hop_size: STFT hop size passed to ``apkit.stft``.
    """
    fs, samples = apkit.load_wav(path)
    # number of bins to keep (the original note reads "0-8kHz")
    keep = _MAX_FREQ * win_size // fs
    spectrum = apkit.stft(samples, apkit.cola_hamming, win_size, hop_size)
    return apkit.cov_matrix(spectrum[:, :, :keep])