Example #1
    def __init__(self, g_pool):
        super().__init__(g_pool)

        self.play = False
        self.pa_stream = None
        self.audio_sync = 0.0
        self.audio_delay = 0.0
        self.audio_timer = None
        self.audio_frame_iterator = None
        self.audio_start_pts = 0

        # Debug flag; set only when timestamp consistency should be checked.
        self.should_check_ts_consistency = False

        self.req_audio_volume = 1.0
        self.current_audio_volume = 1.0
        self.req_buffer_size_secs = 0.5
        self.audio_viz_trans = None
        self.audio_bytes_fifo = collections.deque()

        try:
            self.audio_all = load_audio(self.g_pool.rec_dir)
        except NoAudioLoadedError:
            return

        self.calculate_audio_bounds()

        self.filter_graph = None
        self.filter_graph_list = None
        self.pa = pa.PyAudio()

        self._setup_input_audio_part(0)

        self._setup_output_audio()
        self._setup_audio_vis()
Example #2
def process_utterance(odir, index, wav_path, text):
    """Preprocess a single utterance for training.

    Args:
        odir: The directory to write the spectrograms into.
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input.
        text: The text spoken in the input audio file.

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to
        write to train.txt.
    """
    # Load the audio to a numpy array:
    wav = audio_utils.load_audio(wav_path)
    print(os.path.splitext(os.path.basename(wav_path))[0])
    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio_utils.spectrogram(wav).astype(np.float32)

    n_frames = spectrogram.shape[1]
    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio_utils.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'jeine-spec-%05d.npy' % index
    mel_filename = 'jeine-mel-%05d.npy' % index
    np.save(os.path.join(odir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(odir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
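
A minimal call-site sketch for the function above; the `metadata` list, the `preprocess` wrapper, and the pipe-separated train.txt format are illustrative assumptions, not part of the original:

# Hypothetical driver: `metadata` pairs each wav path with its transcript.
import os

def preprocess(out_dir, metadata):
    entries = []
    for index, (wav_path, text) in enumerate(metadata, start=1):
        entries.append(process_utterance(out_dir, index, wav_path, text))
    # one row per utterance: spec file | mel file | frame count | transcript
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for spec_name, mel_name, n_frames, text in entries:
            f.write(f'{spec_name}|{mel_name}|{n_frames}|{text}\n')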
Example #3
    def __init__(
        self,
        file_loc,
        fps=30,
        video_stream={
            "codec": "mpeg4",
            "bit_rate": 15000 * 10e3
        },
        audio_dir=None,
        use_timestamps=False,
    ):
        super().__init__()
        self.use_timestamps = use_timestamps
        self.timestamps = []
        # the approximate capture rate.
        self.fps = int(fps)
        directory, video_file = os.path.split(file_loc)
        name, ext = os.path.splitext(video_file)

        if ext not in (".mp4", ".mov", ".mkv"):
            logger.warning(
                "media file container should be mp4, mov, or mkv. Using a different container is risky."
            )

        self.file_loc = file_loc
        self.container = av.open(self.file_loc, "w")
        logger.debug("Opened '{}' for writing.".format(self.file_loc))

        if self.use_timestamps:
            self.time_base = Fraction(1, 65535)  # highest resolution for mp4
        else:
            self.time_base = Fraction(1000, self.fps * 1000)  # time base is 1/fps

        self.video_stream = self.container.add_stream(video_stream["codec"],
                                                      1 / self.time_base)
        self.video_stream.bit_rate = video_stream["bit_rate"]
        self.video_stream.bit_rate_tolerance = video_stream["bit_rate"] / 20
        self.video_stream.thread_count = max(1, mp.cpu_count() - 1)
        # self.video_stream.pix_fmt = "yuv420p"

        if audio_dir is not None:
            try:
                self.audio = audio_utils.load_audio(audio_dir)
                self.audio_export_stream = self.container.add_stream(
                    template=self.audio.stream)
            except audio_utils.NoAudioLoadedError:
                logger.warning("Could not mux audio. File not found.")
                self.audio = None
        else:  # audio export explicitly disabled
            self.audio = None

        self.configured = False
        self.start_time = None

        self.current_frame_idx = 0
        self.audio_packets_decoded = 0

        self._closed = False
        self._last_ts = float("-inf")
        self._last_pts = float("-inf")
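
For reference, the two time bases chosen above evaluate as follows; the numbers follow directly from the code, nothing new is assumed:

from fractions import Fraction

fps = 30
Fraction(1, 65535)          # timestamp mode: ticks of ~15.26 microseconds
Fraction(1000, fps * 1000)  # fixed-fps mode: reduces to Fraction(1, 30)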
Example #4
    def fill_buffer(self):
        # accumulate sliced audio into the batch buffer until it holds at
        # least `buffer_min` samples
        self.current_item = next(self.stems)

        b_size = 0
        while b_size < self.buffer_min:
            stems = self.dataset_map[self.current_item][self.kw]
            print(f'found {len(stems)} stems')

            # often each session has > 1 match for a stem
            for s in stems:
                sub_d = self.dataset_map[self.current_item][self.kw][s]
                print(sub_d)

                # load the audio array and slice out the verified regions
                audio = audio_utils.load_audio(sub_d["path"], sr=44100)
                num_samps = sub_d["num_samps"]
                indices = sub_d["verified"]

                sliced = self.slice_tensor(audio, indices)

                self.batch_buff.append(sliced)

                b_size += num_samps

            # advance to the next session and keep accumulating; b_size and
            # batch_buff carry over so the loop can terminate
            self.current_item = next(self.stems)
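
The nested `dataset_map` layout those lookups assume, reconstructed from the key accesses above and from the keys written by `verify_classes_yamnet` below; session and stem names are invented for illustration:

dataset_map = {
    "session_001": {                      # session key (self.current_item)
        "kick": {                         # class keyword (self.kw)
            "kick_take1": {               # stem name
                "path": "session_001/stems/kick_take1.wav",
                "verified": [[0, 44100], [88200, 132300]],  # sample intervals
                "num_samps": 88200,
            }
        }
    }
}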
Example #5
def verify_classes_yamnet(verified_classmap, dataset_map, silence_thresh,
                          kw_class, yam_approve, yam_reject=['Silence']):

  yamnet = Yamnet()

  cm_keys = verified_classmap.keys()

  total_sessions = len(dataset_map.keys())

  print(f'TOTAL SESSIONS : {total_sessions}')

  for i, k in enumerate(dataset_map.keys()):
    print(f'ENUMERATING KEY {i}')
    # get all matching stems for kw_class
    matching_stems = dataset_map[k][kw_class]

    if k not in cm_keys:
      verified_classmap[k] = {}

    # by default this will overwrite an existing class
    verified_classmap[k][kw_class] = {}

    print(f'verifying {len(matching_stems)} stems in {k}')

    for stem in matching_stems:
      stem_name = os.path.splitext(stem)[0]
      stem_name = os.path.basename(stem_name)

      # array of sample indices
      verified_indices = []

      # TODO: array of yamnet matched classes
      # matched_classes = []

      # load the stem audio
      stem_audio = audio_utils.load_audio(stem, 16000, 1)
      # return clips where audio exceeds db threshold
      clips, intervals, num_samps = audio_utils.strip_silence(
          stem_audio,
          silence_thresh,
          frame_length=2048,
          hop_length=512,
          min_len=4096,
      )

      for clip, interval in zip(clips, intervals):
        classes = yamnet.verify_class(clip, 16000, yam_approve, yam_reject)
        if classes is not None:
          verified_indices.append([int(interval[0]), int(interval[1])])
          # matched_classes.append(classes)

      # matched_classes = set(matched_classes)

      if len(verified_indices) > 0:
        verified_classmap[k][kw_class][stem_name] = {}
        verified_classmap[k][kw_class][stem_name]["path"] = stem
        verified_classmap[k][kw_class][stem_name]["verified"] = list(verified_indices)
        verified_classmap[k][kw_class][stem_name]["num_samps"] = int(num_samps)
        # TODO: add matched class labels to dict
        # verified_classmap[k][kw_class][stem_name]["audioset_label"] = matched_classes
  return verified_classmap
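
A hedged usage sketch; the JSON file names, threshold, and class lists are illustrative assumptions, while `Yamnet`, `audio_utils`, and the map structure come from the example itself:

import json

with open('dataset_map.json') as f:
    dataset_map = json.load(f)

# approve drum-like AudioSet labels for the "kick" class, reject silence
classmap = verify_classes_yamnet(
    verified_classmap={},
    dataset_map=dataset_map,
    silence_thresh=-40,
    kw_class='kick',
    yam_approve=['Drum', 'Bass drum'],
)

with open('verified_classmap.json', 'w') as f:
    json.dump(classmap, f, indent=2)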
Example #6
def extract_clips(path, silence_thresh, ws=2048, hop=1024, min_len=4096):
    # load the track
    audio = audio_utils.load_audio(path, 16000, 1)
    # return clips where audio exceeds db threshold
    clips, intervals, num_samps = audio_utils.strip_silence(
        audio,
        silence_thresh,
        frame_length=ws,
        hop_length=hop,
        min_len=min_len,
    )
    return clips, intervals, num_samps
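
A quick usage sketch; the path and threshold are made up for illustration:

# Hypothetical: pull the non-silent clips out of one stem at a -40 dB threshold.
clips, intervals, num_samps = extract_clips('stems/bass.wav', silence_thresh=-40)
for clip, (start, end) in zip(clips, intervals):
    print(f'kept samples [{start}:{end}] ({end - start} samples)')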
Example #7
    def __init__(
        self,
        file_loc,
        fps=30,
        video_stream={"codec": "mpeg4", "bit_rate": 15000 * 10e3},
        audio_dir=None,
        use_timestamps=False,
    ):
        super().__init__()
        self.use_timestamps = use_timestamps
        self.timestamps = []
        # the approximate capture rate.
        self.fps = int(fps)
        directory, video_file = os.path.split(file_loc)
        name, ext = os.path.splitext(video_file)

        if ext not in (".mp4", ".mov", ".mkv"):
            logger.warning(
                "media file container should be mp4, mov, or mkv. Using a different container is risky."
            )

        self.file_loc = file_loc
        self.container = av.open(self.file_loc, "w")
        logger.debug("Opened '{}' for writing.".format(self.file_loc))

        if self.use_timestamps:
            self.time_base = Fraction(1, 65535)  # highest resolution for mp4
        else:
            self.time_base = Fraction(1000, self.fps * 1000)  # time base is 1/fps

        self.video_stream = self.container.add_stream(
            video_stream["codec"], 1 / self.time_base
        )
        self.video_stream.bit_rate = video_stream["bit_rate"]
        self.video_stream.bit_rate_tolerance = video_stream["bit_rate"] / 20
        self.video_stream.thread_count = max(1, mp.cpu_count() - 1)
        # self.video_stream.pix_fmt = "yuv420p"

        if audio_dir is not None:
            try:
                self.audio = audio_utils.load_audio(audio_dir)
                self.audio_export_stream = self.container.add_stream(
                    template=self.audio.stream
                )
            except audio_utils.NoAudioLoadedError:
                logger.warning("Could not mux audio. File not found.")
                self.audio = None
        else:  # audio export explicitly disabled
            self.audio = None

        self.configured = False
        self.start_time = None

        self.current_frame_idx = 0
        self.audio_packets_decoded = 0
Example #8
    def __init__(self, *args, audio_dir: str, **kwargs):
        super().__init__(*args, **kwargs)

        try:
            self.audio = audio_utils.load_audio(audio_dir)
            self.audio_export_stream = self.container.add_stream(
                template=self.audio.stream)
        except audio_utils.NoAudioLoadedError:
            logger.warning("Could not mux audio. File not found.")
            self.audio = None

        self.num_audio_packets_decoded = 0
        self.last_audio_pts = float("-inf")
Example #9
    def __init__(self, *args, audio_dir: str, **kwargs):
        super().__init__(*args, **kwargs)

        try:
            self.audio_parts = audio_utils.load_audio(audio_dir)
            self.audio_export_stream = type(self)._add_stream(
                container=self.container, template=self.audio_parts[0].stream)
        except audio_utils.NoAudioLoadedError:
            logger.debug("Could not mux audio. File not found.")
            self.audio_parts = None
            return

        # setup stateful packet iterator
        self.audio_packet_iterator = self.iterate_audio_packets()
Example #10
def pyav_decode(
    vid_path,
    container,
    sampling_rate,
    num_frames,
    clip_idx,
    num_clips=10,
    target_fps=30,
    aug_audio=[],
    decode_audio=True,
    num_sec=1,
    aud_sample_rate=48000,
    aud_spec_type=1,
    use_volume_jittering=False,
    use_temporal_jittering=False,
    z_normalize=False,
):
    """
    Convert the video from its original fps to the target_fps. If the video
    support selective decoding (contain decoding information in the video head),
    the perform temporal selective decoding and sample a clip from the video
    with the PyAV decoder. If the video does not support selective decoding,
    decode the entire video.
    Args:
        container (container): pyav container.
        sampling_rate (int): frame sampling rate (interval between two sampled
            frames.
        num_frames (int): number of frames to sample.
        clip_idx (int): if clip_idx is -1, perform random temporal sampling. If
            clip_idx is larger than -1, uniformly split the video to num_clips
            clips, and select the clip_idx-th video clip.
        num_clips (int): overall number of clips to uniformly sample from the
            given video.
        target_fps (int): the input video may has different fps, convert it to
            the target video fps before frame sampling.
    Returns:
        frames (tensor): decoded frames from the video. Return None if the no
            video stream was found.
        fps (float): the number of frames per second of the video.
        decode_all_video (bool): If True, the entire video was decoded.
    """
    # Try to fetch the decoding information from the video header. Some
    # videos do not support this, in which case the duration is None.
    fps = float(container.streams.video[0].average_rate)
    frames_length = container.streams.video[0].frames
    duration = container.streams.video[0].duration

    if duration is None:
        # If we failed to fetch the decoding information, decode the entire
        # video and sample audio from the start of the file.
        decode_all_video = True
        start_idx = 0
        video_start_pts, video_end_pts = 0, math.inf
    else:
        # Perform selective decoding.
        decode_all_video = False
        start_idx, end_idx = get_start_end_idx(
            frames_length,
            sampling_rate * num_frames / target_fps * fps,
            clip_idx,
            num_clips,
        )

        timebase = duration / frames_length
        video_start_pts = int(start_idx * timebase)
        video_end_pts = int(end_idx * timebase)

    frames = None
    # If video stream was found, fetch video frames from the video.
    if container.streams.video:
        video_frames, max_pts = pyav_decode_stream(
            container,
            video_start_pts,
            video_end_pts,
            container.streams.video[0],
            {"video": 0},
        )
        container.close()

        frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
        frames = torch.as_tensor(np.stack(frames))

    # Get wav
    if decode_audio:
        spec = None
        try:
            # Get spectrogram
            fr_sec = start_idx / fps
            spec = load_audio(
                vid_path,
                fr_sec,
                num_sec=num_sec,
                sample_rate=aud_sample_rate,
                aug_audio=aug_audio,
                aud_spec_type=aud_spec_type,
                use_volume_jittering=use_volume_jittering,
                use_temporal_jittering=use_temporal_jittering,
                z_normalize=z_normalize,
            )
        except Exception:
            print(f"Bad audio of video: {vid_path}", flush=True)
            if spec is not None:
                print(f"Bad spec shape of {vid_path}: {spec.shape}",
                      flush=True)
            return None, None, None, None
        # print(vid_path, frames.shape, spec.shape)
        return frames, spec, fps, decode_all_video
    else:
        return frames, None, fps, decode_all_video
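
A minimal call-site sketch, assuming PyAV is installed; the file name and sampling parameters are illustrative:

import av

path = 'example.mp4'
container = av.open(path)
# sample one random 16-frame clip (clip_idx=-1), skipping audio decoding;
# pyav_decode closes the container itself
frames, spec, fps, decoded_all = pyav_decode(
    path, container, sampling_rate=2, num_frames=16,
    clip_idx=-1, decode_audio=False,
)
if frames is not None:
    print(frames.shape, fps, decoded_all)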
Example #11
    def __init__(self, g_pool):
        super().__init__(g_pool)

        self.play = False
        self.pa_stream = None
        self.audio_sync = 0.0
        self.audio_delay = 0.0
        self.next_audio_frame = None
        self.audio_start_pts = 0
        self.check_ts_consistency = False
        self.req_audio_volume = 1.0
        self.current_audio_volume = 1.0
        self.req_buffer_size_secs = 0.5
        self.audio_viz_trans = None

        try:
            self.audio = load_audio(self.g_pool.rec_dir)
        except NoAudioLoadedError:
            return

        self.audio_bytes_fifo = []
        self.next_audio_frame = self._next_audio_frame()
        self.audio_resampler = av.audio.resampler.AudioResampler(
            format=self.audio.stream.format.packed,
            layout=self.audio.stream.layout,
            rate=self.audio.stream.rate,
        )
        self.audio_paused = False
        af0, af1 = next(self.next_audio_frame), next(self.next_audio_frame)
        # Check pts

        self.audio_pts_rate = af0.samples  # af1.pts - af0.pts
        self.audio_start_pts = 0
        logger.debug("audio_pts_rate = {} start_pts = {}".format(
            self.audio_pts_rate, self.audio_start_pts))

        if self.check_ts_consistency:
            print("**** Checking stream")
            for i, af in enumerate(self.next_audio_frame):
                fnum = i + 2
                if af.samples != af0.samples:
                    print("fnum {} samples = {}".format(fnum, af.samples))
                if af.pts != self.audio_idx_to_pts(fnum):
                    print("af.pts = {} fnum = {} idx2pts = {}".format(
                        af.pts, fnum, self.audio_idx_to_pts(fnum)))
                if (self.audio.timestamps[fnum] != self.audio.timestamps[0] +
                        af.pts * self.audio.stream.time_base):
                    print(
                        "ts[0] + af.pts = {} fnum = {} timestamp = {}".format(
                            self.audio.timestamps[0] +
                            af.pts * self.audio.stream.time_base,
                            fnum,
                            self.audio.timestamps[fnum],
                        ))
            print("**** Done")
        self.seek_to_audio_frame(0)

        logger.debug(
            "Audio file format {} chans {} rate {} framesize {}".format(
                self.audio.stream.format.name,
                self.audio.stream.channels,
                self.audio.stream.rate,
                self.audio.stream.frame_size,
            ))
        self.audio_start_time = 0
        self.audio_measured_latency = -1.0
        self.last_dac_time = 0
        self.filter_graph = None
        self.filter_graph_list = None
        try:
            self.pa = pa.PyAudio()
            self.pa_stream = self.pa.open(
                format=self.pa.get_format_from_width(
                    self.audio.stream.format.bytes),
                channels=self.audio.stream.channels,
                rate=self.audio.stream.rate,
                frames_per_buffer=self.audio.stream.frame_size,
                stream_callback=self.audio_callback,
                output=True,
                start=False,
            )
            logger.debug("Audio output latency: {}".format(
                self.pa_stream.get_output_latency()))
            self.audio_sync = self.pa_stream.get_output_latency()
            self.audio_reported_latency = self.pa_stream.get_output_latency()

        except ValueError:
            self.pa_stream = None

        self.audio_timeline = None

        self.audio_viz_trans = Audio_Viz_Transform(self.g_pool.rec_dir)
        self.audio_viz_data = None
        self.log_scale = False
        self.xlim = (self.g_pool.timestamps[0], self.g_pool.timestamps[-1])
        self.ylim = (0, 210)