# Example 1
def split_vad(frames, n: int = 3, negative_threshold: float = 0.1):
    """
    Split a sample into multiple samples based on `n` size of negative VAD.

    Parameters
    ----------
    frames: List[Tuple[Frame, label]]
    n: int, optional (default=3)
        `n` size of negative VAD to assume in one subsample.
    negative_threshold: float, optional (default = 0.1)
        If `negative_threshold` is 0.1, means that, length negative samples must at least 0.1 second.

    Returns
    -------
    result : List[Frame]
    """
    merged = group_frames(frames)
    merged = group_frames_threshold(merged,
                                    threshold_to_stop=negative_threshold)

    samples = []
    buffer, silence_count = [], 0
    for frame, is_voice in merged:
        if not is_voice:
            silence_count += 1
        buffer.append(frame)
        if silence_count >= n:
            # Flush the accumulated frames as one subsample and re-seed the
            # buffer with the boundary frame so the next subsample starts there.
            samples.append(combine_frames(buffer))
            buffer = [frame]
            silence_count = 0

    if buffer:
        samples.append(combine_frames(buffer))
    return samples
# Example 2
def split_vad_duration(
    frames,
    max_duration: float = 5.0,
    negative_threshold: float = 0.1,
    silent_trail=500,
    sample_rate: int = 16000,
    use_negative_as_silent: bool = False,
):
    """
    Split a sample into multiple samples based on maximum duration of voice activities.

    Parameters
    ----------
    frames: List[Tuple[Frame, label]]
    max_duration: float, optional (default = 5.0)
        Maximum duration to assume one sample combined from voice activities.
    negative_threshold: float, optional (default = 0.1)
        If `negative_threshold` is 0.1, means that, length negative samples must at least 0.1 second.
    silent_trail: int, optional (default = 500)
        If an element is not a voice activity, append with `silent_trail` frame size.
    sample_rate: int, optional (default = 16000)
        sample rate for frames.
    use_negative_as_silent: bool, optional (default = False)
        If True, will use negative VAD as silent, else, use zeros array size of `silent_trail`.

    Returns
    -------
    result : List[Frame]
    """
    grouped = group_frames(frames)
    grouped = group_frames_threshold(grouped,
                                     threshold_to_stop=negative_threshold)
    results, temp, lengths, last_silent = [], [], 0.0, None
    for frame, is_voice in grouped:
        if is_voice:
            a = frame
        else:
            last_silent = frame
            if use_negative_as_silent:
                # Keep only the head and tail of the silent region so long
                # silences are shrunk to at most 2 * silent_trail samples.
                a = np.concatenate(
                    [frame.array[:silent_trail], frame.array[-silent_trail:]])
            else:
                a = np.zeros(shape=(silent_trail,))
            a = Frame(
                array=a,
                timestamp=frame.timestamp,
                duration=len(a) / sample_rate,
            )
        lengths += len(a.array) / sample_rate
        temp.append(a)
        if lengths >= max_duration:
            results.append(combine_frames(temp))
            # Bug fix: compare the sentinel against None explicitly. The old
            # `if last_silent` relied on the truthiness of a Frame object,
            # which is undefined for a project type (PEP 8: use `is not None`).
            # Seed the next sample with the last silent frame as a lead-in.
            temp = [last_silent] if last_silent is not None else []
            lengths = 0.0

    if temp:
        results.append(combine_frames(temp))
    return results
# Example 3
def split_vad(
    frames,
    n: int = 3,
    negative_threshold: float = 0.1,
    silent_trail: int = 500,
    sample_rate: int = 16000,
    use_negative_as_silent: bool = False,
):
    """
    Split a sample into multiple samples based on `n` size of negative VAD.

    Parameters
    ----------
    frames: List[Tuple[Frame, label]]
    n: int, optional (default=3)
        `n` size of negative VAD to assume in one subsample.
    negative_threshold: float, optional (default = 0.1)
        If `negative_threshold` is 0.1, means that, length negative samples must at least 0.1 second.
    silent_trail: int, optional (default = 500)
        If an element is not a voice activity, append with `silent_trail` frame size.
    sample_rate: int, optional (default = 16000)
        sample rate for frames.
    use_negative_as_silent: bool, optional (default = False)
        If True, will use negative VAD as silent, else, use zeros array size of `silent_trail`.

    Returns
    -------
    result : List[Frame]
    """
    grouped = group_frames(frames)
    grouped = group_frames_threshold(grouped,
                                     threshold_to_stop=negative_threshold)
    results, temp, not_activities = [], [], 0
    for frame, is_voice in grouped:
        if is_voice:
            a = frame
        else:
            not_activities += 1
            if use_negative_as_silent:
                # Shrink the silent region to its head and tail,
                # at most 2 * silent_trail samples.
                a = np.concatenate(
                    [frame.array[:silent_trail], frame.array[-silent_trail:]])
            else:
                a = np.zeros(shape=(silent_trail,))
            a = Frame(
                array=a,
                timestamp=frame.timestamp,
                duration=len(a) / sample_rate,
            )
        temp.append(a)
        if not_activities >= n:
            results.append(combine_frames(temp))
            # Bug fix: seed the next subsample with the processed frame `a`,
            # not the raw `frame`. The old code re-inserted the full untrimmed
            # silent frame, defeating the silent_trail shrinking applied above
            # (in the no-trail variant of this function, a == frame always,
            # so that code only worked there by coincidence).
            temp = [a]
            not_activities = 0

    if temp:
        results.append(combine_frames(temp))
    return results
# Example 4
def split_vad_duration(
    frames,
    max_duration: float = 5.0,
    negative_threshold: float = 0.1,
    sample_rate: int = 16000,
):
    """
    Split a sample into multiple samples based on maximum duration of voice activities.

    Parameters
    ----------
    frames: List[Tuple[Frame, label]]
    max_duration: float, optional (default = 5.0)
        Maximum duration to assume one sample combined from voice activities.
    negative_threshold: float, optional (default = 0.1)
        If `negative_threshold` is 0.1, means that, length negative samples must at least 0.1 second.
    sample_rate: int, optional (default = 16000)
        sample rate for frames.

    Returns
    -------
    result : List[Frame]
    """
    chunks = group_frames(frames)
    chunks = group_frames_threshold(chunks,
                                    threshold_to_stop=negative_threshold)

    results = []
    current, elapsed = [], 0.0
    # The VAD label of each group is ignored here: every frame contributes
    # its full duration toward the max_duration budget.
    for frame, _ in chunks:
        current.append(frame)
        elapsed += len(frame.array) / sample_rate
        if elapsed >= max_duration:
            results.append(combine_frames(current))
            current, elapsed = [], 0.0

    if current:
        results.append(combine_frames(current))
    return results