Exemplo n.º 1
0
def create_segment_profile(audio_file, duration_frames, filepath, window_len, step_size=1):
    """
    Slide a fixed-length window over an audio file's frames and build one profile per window.

    A binary mask covering the whole file is built first: every frame that falls inside the boundaries of a
    stored Segment (syllable) of this audio file is set to 1, all other frames stay 0. The mask is then sliced
    window by window. Each window gets an ID built from the audio file's ID and the window's start frame.

    :param audio_file: the audio file object; its `id`, `length` and `fs` attributes are read
    :param duration_frames: total number of frames; this is the length of the full mask
    :param filepath: stored unchanged as the first element of every profile
    :param window_len: length of the sliding window, in frames
    :param step_size: how many frames the next window is ahead of the previous one
    :return: a dictionary mapping '<audio file id>_<start frame>' to a tuple
             (filepath, start frame, end frame, mask of that window as a nested list)
    """
    syllable_bounds_ms = Segment.objects.filter(audio_file=audio_file).values_list('start_time_ms', 'end_time_ms')

    # Total duration in milliseconds, used to convert the timestamps into frame indices
    total_ms = int(audio_file.length / audio_file.fs * 1000)

    # Mask for the entire audio file; the windows below are simply slices of it
    full_mask = np.zeros((duration_frames, 1), dtype=np.float32)
    for start_ms, end_ms in syllable_bounds_ms:
        first_frame = int(start_ms / total_ms * duration_frames)
        last_frame = int(end_ms / total_ms * duration_frames)
        full_mask[first_frame:last_frame, :] = 1

    _, windows = split_segments(duration_frames, window_len, window_len - step_size, incltail=False)

    return {
        '{}_{}'.format(audio_file.id, beg): (filepath, beg, end, full_mask[beg:end, :].tolist())
        for beg, end in windows
    }
Exemplo n.º 2
0
    def test_segments_without_tail(self):
        """
        Splitting 86 frames with window 32 and overlap 16, tail excluded, must yield the four windows below.
        """
        expected_bounds = np.array([[0, 32], [16, 48], [32, 64], [48, 80]])

        count, bounds = split_segments(86, 32, 16, incltail=False)

        # The reported count must agree with the number of expected windows
        self.assertEqual(count, len(expected_bounds))
        # Every start/end pair must match exactly
        self.assertTrue((bounds == expected_bounds).all())
Exemplo n.º 3
0
def run_segmentation(duration_frames, psd, encoder, session, window_len, step_size=1):
    """
    Detect syllables by running the encoder over sliding windows and merging the per-window predictions.

    Each window of `psd` is passed to `encoder.predict`. A prediction value above 0.5 counts as one vote for
    the corresponding frame. Frames whose vote count exceeds 30% of `window_len` are treated as syllable
    frames, and runs of at least two consecutive syllable frames are reported as syllables.

    :param duration_frames: total number of frames
    :param psd: 2D array indexed as [:, frame]; each window is the transposed slice psd[:, beg:end]
    :param encoder: object exposing predict(list_of_windows, session)
    :param session: passed through to encoder.predict unchanged
    :param window_len: length of the sliding window, in frames
    :param step_size: how many frames the next window is ahead of the previous one
    :return: a tuple (syllables, None) where syllables is a list of [start_frame, end_frame] pairs;
             end_frame is the index of the last frame of the run (inclusive)
    """
    noverlap = window_len - step_size
    _, windows = split_segments(duration_frames, window_len, noverlap, incltail=False)

    windoweds = [psd[:, beg:end].T for beg, end in windows]
    predicteds = encoder.predict(windoweds, session)

    # Count, for every frame, how many windows predicted it to be part of a syllable
    votes = np.zeros((duration_frames,), dtype=np.float32)
    for predicted, (beg, end) in zip(predicteds, windows):
        votes[beg:end] += predicted.reshape(window_len) > 0.5

    # NOTE(review): the threshold is relative to window_len, which matches the maximum vote count only when
    # step_size is 1 - confirm this is intended for larger step sizes
    threshold = window_len * 0.3
    syllable_frames = votes > threshold

    syllables = []
    syl_start = None
    for i in range(duration_frames - 1):
        this_frame = syllable_frames[i]
        next_frame = syllable_frames[i + 1]
        if this_frame and next_frame:
            # Inside a run; open a syllable if none is open yet
            if syl_start is None:
                syl_start = i
        elif this_frame and syl_start is not None:
            # Last frame of the run
            syllables.append([syl_start, i])
            syl_start = None

    # A run that reaches the very last frame never sees a closing transition inside the loop;
    # close it here so it is not silently dropped
    if syl_start is not None:
        syllables.append([syl_start, duration_frames - 1])

    return syllables, None
Exemplo n.º 4
0
def _harmonic_and_pitch(args):
    """
    Compute the harmonic ratio and the pitch (fundamental frequency) of every frame of the signal.

    For each frame the normalised autocorrelation is evaluated between the first zero crossing of the
    autocorrelation and a maximum lag of about 16 ms. The three strongest lags are taken as candidates and
    the one giving the lowest frequency is chosen.

    :param args: argument container read through get_sig and unroll_args ('fs', 'noverlap', 'win_length')
    :return: a tuple (harmonic ratios, fundamental frequencies), each a 1D array with one value per frame
    """
    sig = get_sig(args)
    fs, noverlap, win_length = unroll_args(args, ['fs', 'noverlap', 'win_length'])
    nsegs, segs = split_segments(len(sig), win_length, noverlap, incltail=False)

    harmonic_ratios = []
    pitches = []

    for idx in range(nsegs):
        start, stop = segs[idx]
        frame = sig[start:stop]
        frame_len = len(frame)

        autocorr = np.correlate(frame, frame, mode='full')
        # Zero-lag value, used for normalisation
        zero_lag = autocorr[frame_len - 1]
        # Keep the positive lags only
        autocorr = autocorr[frame_len:-1]
        nlags = len(autocorr)

        # The first zero crossing of the autocorrelation is the lower lag bound
        crossings = np.nonzero(np.diff(np.sign(autocorr)))[0]
        lag_lo = crossings[0] if len(crossings) > 0 else nlags - 1

        # Upper lag bound: 16 ms worth of samples, clamped to the available lags
        lag_hi = int(np.round(0.016 * fs) - 1)
        if lag_hi > nlags:
            lag_hi = nlags - 1

        gamma = np.zeros(lag_hi, dtype=np.float64)
        energy = np.cumsum(frame ** 2)
        gamma[lag_lo:lag_hi] = autocorr[lag_lo:lag_hi] / (np.sqrt(zero_lag * energy[lag_hi:lag_lo:-1]) + eps)

        if gamma.size == 0:
            hr = 1.0
            f0 = 0.0
        else:
            # Consider the three strongest lags rather than only the maximum, since noise can distort
            # the result
            best_lags = np.argsort(gamma)[-3:][::-1]
            candidates = fs / (best_lags + eps)

            # The fundamental frequency should be the smallest of all candidates
            chosen = np.argmin(candidates)
            f0 = candidates[chosen]
            hr = gamma[best_lags[chosen]]

        harmonic_ratios.append(hr)
        pitches.append(f0)

    return np.array(harmonic_ratios), np.array(pitches)
Exemplo n.º 5
0
def my_stft(sig, fs, window, noverlap, nfft):
    """
    Compute a short-time Fourier transform of `sig`, keeping only the first nfft // 2 + 1 frequency bins.

    :param sig: the signal to transform
    :param fs: not used by this function
    :param window: window multiplied with every frame; its length is the frame length
    :param noverlap: overlap between consecutive frames, passed to split_segments
    :param nfft: FFT length passed to fft
    :return: complex matrix of shape (nfft // 2 + 1, number of frames), one column per frame
    """
    nbins = nfft // 2 + 1
    nsegs, segs = split_segments(len(sig), len(window), noverlap, incltail=False)

    # Every column is overwritten below, so no initialisation is needed
    spect = np.empty((nbins, nsegs), dtype=np.complex128)
    for col in range(nsegs):
        start, stop = segs[col]
        windowed = sig[start:stop] * window
        spect[:, col] = fft(windowed, nfft)[:nbins]
    return spect
Exemplo n.º 6
0
def lp_coefficients(args):
    """
    Compute linear prediction coefficients for every frame of the signal.

    :param args: argument container read through get_sig and unroll_args
                 ('nfft', 'fs', 'noverlap', 'win_length', 'order' and 'window')
    :return: float32 matrix of shape (order, number of frames), one column per frame
    """
    sig = get_sig(args)
    nfft, fs, noverlap, win_length, order = unroll_args(
        args, ['nfft', 'fs', 'noverlap', 'win_length', 'order'])

    # Presumably the (name, value) form makes unroll_args fall back to this window when args has none -
    # verify against unroll_args
    fallback_window = _cached_get_window('hanning', nfft)
    window = unroll_args(args, [('window', fallback_window)])

    nsegs, segs = split_segments(len(sig), win_length, noverlap, incltail=False)

    coeffs = np.zeros((order, nsegs), dtype=np.float32)
    for col in range(nsegs):
        start, stop = segs[col]
        windowed_frame = sig[start:stop] * window
        coeffs[:, col] = lp_coefficients_frame(windowed_frame, order)
    return coeffs
Exemplo n.º 7
0
def lpc_spectrum(args):
    """
    Compute the log-magnitude LPC spectrum for every frame of the signal.

    :param args: argument container read through get_sig and unroll_args
                 ('nfft', 'fs', 'noverlap', 'win_length', 'order' and 'window')
    :return: matrix of shape (nfft, number of frames) holding log10 of the magnitude of each frame's
             LPC spectrum
    """
    sig = get_sig(args)
    nfft, fs, noverlap, win_length, order = unroll_args(
        args, ['nfft', 'fs', 'noverlap', 'win_length', 'order'])

    # Presumably the (name, value) form makes unroll_args fall back to this window when args has none -
    # verify against unroll_args
    fallback_window = _cached_get_window('hanning', nfft)
    window = unroll_args(args, [('window', fallback_window)])

    nsegs, segs = split_segments(len(sig), win_length, noverlap, incltail=False)

    spectra = np.zeros((nfft, nsegs), dtype=np.complex64)
    for col in range(nsegs):
        start, stop = segs[col]
        windowed_frame = sig[start:stop] * window
        spectra[:, col] = lpc_spectrum_frame(windowed_frame, order, nfft)

    magnitude = abs(spectra)
    return np.log10(magnitude)
def _harmonic_and_pitch(args):
    """
    Compute the harmonic ratio and the pitch (fundamental frequency) of every frame of the signal.

    For each frame the normalised autocorrelation is evaluated between the first zero crossing of the
    autocorrelation and a maximum lag of about 16 ms. The harmonic ratio is the maximum of that normalised
    autocorrelation and the pitch is derived from the lag at which the maximum occurs. Frames whose
    normalised autocorrelation has a zero crossing rate above 0.15 get a harmonic ratio and pitch of 0.

    :param args: argument container read through get_sig and unroll_args ('fs', 'noverlap', 'win_length')
    :return: a tuple (harmonic ratios, fundamental frequencies), each a 1D array with one value per frame
    """
    sig = get_sig(args)
    fs, noverlap, win_length = unroll_args(args, ['fs', 'noverlap', 'win_length'])
    siglen = len(sig)
    nsegs, segs = split_segments(siglen, win_length, noverlap, incltail=False)

    HRs = []
    F0s = []

    for i in range(nsegs):
        seg_beg, seg_end = segs[i, :]
        frame = sig[seg_beg:seg_end]

        # M is used as an array length and as a slice bound, so it must be an integer;
        # np.round returns a float
        M = int(np.round(0.016 * fs) - 1)
        R = np.correlate(frame, frame, mode='full')

        # Zero-lag value, used for normalisation
        g = R[len(frame) - 1]
        # Keep the positive lags only
        R = R[len(frame):-1]

        # estimate m0 (as the first zero crossing of R)
        [a, ] = np.nonzero(np.diff(np.sign(R)))

        if len(a) == 0:
            m0 = len(R) - 1
        else:
            m0 = a[0]
        if M > len(R):
            M = len(R) - 1

        Gamma = np.zeros(M, dtype=np.float64)
        CSum = np.cumsum(frame ** 2)
        Gamma[m0:M] = R[m0:M] / (np.sqrt((g * CSum[M:m0:-1])) + eps)

        ZCR = frame_zcr(Gamma)

        if ZCR > 0.15:
            HR = 0.0
            f0 = 0.0
        else:
            if len(Gamma) == 0:
                HR = 1.0
                blag = 0.0
            else:
                HR = np.max(Gamma)
                blag = np.argmax(Gamma)

            # Get fundamental frequency:
            f0 = fs / (blag + eps)
            # Discard estimates above 5000 and estimates from frames with a low harmonic ratio
            if f0 > 5000:
                f0 = 0.0
            if HR < 0.1:
                f0 = 0.0

        HRs.append(HR)
        F0s.append(f0)

    return np.array(HRs), np.array(F0s)