def __init__(self, g_pool):
    super().__init__(g_pool)
    self.play = False
    self.pa_stream = None
    self.audio_sync = 0.0
    self.audio_delay = 0.0
    self.audio_timer = None
    self.audio_frame_iterator = None
    self.audio_start_pts = 0
    # debug flag. Only set if timestamp consistency should be checked
    self.should_check_ts_consistency = False
    self.req_audio_volume = 1.0
    self.current_audio_volume = 1.0
    self.req_buffer_size_secs = 0.5
    self.audio_viz_trans = None
    self.audio_bytes_fifo = collections.deque()
    try:
        self.audio_all = load_audio(self.g_pool.rec_dir)
    except NoAudioLoadedError:
        return
    self.calculate_audio_bounds()
    self.filter_graph = None
    self.filter_graph_list = None
    self.pa = pa.PyAudio()
    self._setup_input_audio_part(0)
    self._setup_output_audio()
    self._setup_audio_vis()
def process_utterance(out_dir, index, wav_path, text):
    """Preprocess a single utterance.

    Args:
        out_dir: The directory to write the spectrograms into.
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input.
        text: The text spoken in the input audio file.

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt.
    """
    # Load the audio to a numpy array:
    wav = audio_utils.load_audio(wav_path)
    print(os.path.splitext(os.path.basename(wav_path))[0])

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio_utils.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio_utils.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'jeine-spec-%05d.npy' % index
    mel_filename = 'jeine-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
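# Usage sketch, assuming an LJSpeech-style "wav_name|transcript" metadata file.
# build_training_metadata and all paths here are hypothetical illustrations,
# not part of the original code.
import os

def build_training_metadata(in_dir, out_dir, metadata_path):
    os.makedirs(out_dir, exist_ok=True)
    rows = []
    with open(metadata_path, encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            wav_name, text = line.strip().split('|')[:2]
            wav_path = os.path.join(in_dir, wav_name)
            rows.append(process_utterance(out_dir, index, wav_path, text))
    # one "spec|mel|n_frames|text" row per utterance, as the docstring suggests
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for spec_name, mel_name, n_frames, text in rows:
            f.write('|'.join([spec_name, mel_name, str(n_frames), text]) + '\n')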
def __init__(
    self,
    file_loc,
    fps=30,
    video_stream={"codec": "mpeg4", "bit_rate": 15000 * 10e3},
    audio_dir=None,
    use_timestamps=False,
):
    super().__init__()
    self.use_timestamps = use_timestamps
    self.timestamps = []
    # the approximate capture rate
    self.fps = int(fps)
    directory, video_file = os.path.split(file_loc)
    name, ext = os.path.splitext(video_file)
    if ext not in (".mp4", ".mov", ".mkv"):
        logger.warning(
            "media file container should be mp4 or mov. Using a different container is risky."
        )
    self.file_loc = file_loc
    self.container = av.open(self.file_loc, "w")
    logger.debug("Opened '{}' for writing.".format(self.file_loc))

    if self.use_timestamps:
        self.time_base = Fraction(1, 65535)  # highest resolution for mp4
    else:
        self.time_base = Fraction(1000, self.fps * 1000)  # timebase is 1/fps

    self.video_stream = self.container.add_stream(
        video_stream["codec"], 1 / self.time_base
    )
    self.video_stream.bit_rate = video_stream["bit_rate"]
    self.video_stream.bit_rate_tolerance = video_stream["bit_rate"] / 20
    self.video_stream.thread_count = max(1, mp.cpu_count() - 1)
    # self.video_stream.pix_fmt = "yuv420p"

    if audio_dir is not None:
        try:
            self.audio = audio_utils.load_audio(audio_dir)
            self.audio_export_stream = self.container.add_stream(
                template=self.audio.stream
            )
        except audio_utils.NoAudioLoadedError:
            logger.warning("Could not mux audio. File not found.")
            self.audio = None
    else:
        # audio export explicitly disabled
        self.audio = None

    self.configured = False
    self.start_time = None
    self.current_frame_idx = 0
    self.audio_packets_decoded = 0
    self._closed = False
    self._last_ts = float("-inf")
    self._last_pts = float("-inf")
def fill_buffer(self):
    # assert that fixed size is greater than min size
    # reset buffer state before filling
    self.batch_buff = []
    b_size = 0
    self.current_item = next(self.stems)
    while b_size < self.buffer_min:
        # item = next(dataset_map)
        stems = self.dataset_map[self.current_item][self.kw]
        print(f'found {len(stems)} stems')
        # often each session has > 1 match for a stem
        for s in stems:
            sub_d = self.dataset_map[self.current_item][self.kw][s]
            print(sub_d)
            # load in the audio array
            audio = audio_utils.load_audio(sub_d["path"], sr=44100)
            num_samps = sub_d["num_samps"]
            indices = sub_d["verified"]
            sliced = self.slice_tensor(audio, indices)
            self.batch_buff.append(sliced)
            b_size += num_samps
        # move on to the next session if the buffer is still under-filled
        self.current_item = next(self.stems)
def verify_classes_yamnet(verified_classmap, dataset_map, silence_thresh, kw_class,
                          yam_approve, yam_reject=['Silence']):
    yamnet = Yamnet()
    cm_keys = verified_classmap.keys()
    total_sessions = len(dataset_map.keys())
    print(f'TOTAL SESSIONS : {total_sessions}')
    for i, k in enumerate(dataset_map.keys()):
        print(f'ENUMERATING KEY {i}')
        # get all matching stems for kw_class
        matching_stems = dataset_map[k][kw_class]
        if k not in cm_keys:
            verified_classmap[k] = {}
        # by default this will overwrite an existing class
        verified_classmap[k][kw_class] = {}
        print(f'verifying {len(matching_stems)} stems in {k}')
        for stem in matching_stems:
            stem_name = os.path.splitext(stem)[0]
            stem_name = os.path.basename(stem_name)
            # array of [start, end] sample indices
            verified_indices = []
            # TODO: array of yamnet matched classes
            # matched_classes = []
            # load the stem audio
            stem_audio = audio_utils.load_audio(stem, 16000, 1)
            # return clips where audio exceeds db threshold
            clips, intervals, num_samps = audio_utils.strip_silence(
                stem_audio, silence_thresh, frame_length=2048, hop_length=512, min_len=4096
            )
            for clip, interval in zip(clips, intervals):
                classes = yamnet.verify_class(clip, 16000, yam_approve, yam_reject)
                if classes is not None:
                    verified_indices.append([int(interval[0]), int(interval[1])])
                    # matched_classes.append(classes)
            # matched_classes = set(matched_classes)
            if len(verified_indices) > 0:
                verified_classmap[k][kw_class][stem_name] = {}
                verified_classmap[k][kw_class][stem_name]["path"] = stem
                verified_classmap[k][kw_class][stem_name]["verified"] = list(verified_indices)
                verified_classmap[k][kw_class][stem_name]["num_samps"] = int(num_samps)
                # TODO: add matched class labels to dict
                # verified_classmap[k][kw_class][stem_name]["audioset_label"] = matched_classes
    return verified_classmap
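# Hypothetical call sketch for verify_classes_yamnet; the session layout, stem path,
# threshold, and class labels below are illustrative assumptions, and running it
# requires the Yamnet wrapper plus real audio on disk.
dataset_map = {
    'session_01': {
        'vocals': ['/data/session_01/vocals_take1.wav'],  # hypothetical stem path
    },
}
verified = verify_classes_yamnet(
    verified_classmap={},               # start fresh, or pass a previously saved map to resume
    dataset_map=dataset_map,            # maps session -> class -> [stem paths]
    silence_thresh=-40,                 # dB threshold forwarded to strip_silence
    kw_class='vocals',                  # keyword class to verify
    yam_approve=['Singing', 'Speech'],  # AudioSet labels to accept
)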
def extract_clips(path, silence_thresh, ws=2048, hop=1024, min_len=4096):
    # load the track
    audio = audio_utils.load_audio(path, 16000, 1)
    # return clips where audio exceeds db threshold
    clips, intervals, num_samps = audio_utils.strip_silence(
        audio, silence_thresh, frame_length=ws, hop_length=hop, min_len=min_len
    )
    return clips, intervals, num_samps
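# Usage sketch for extract_clips; 'track.wav' and the -40 dB threshold are
# placeholder values, not part of the original code.
clips, intervals, num_samps = extract_clips('track.wav', silence_thresh=-40)
for clip, (start, end) in zip(clips, intervals):
    print(f'kept {len(clip)} samples spanning indices {start}-{end}')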
def __init__(self, *args, audio_dir: str, **kwargs):
    super().__init__(*args, **kwargs)
    try:
        self.audio = audio_utils.load_audio(audio_dir)
        self.audio_export_stream = self.container.add_stream(
            template=self.audio.stream
        )
    except audio_utils.NoAudioLoadedError:
        logger.warning("Could not mux audio. File not found.")
        self.audio = None
    self.num_audio_packets_decoded = 0
    self.last_audio_pts = float("-inf")
def __init__(self, *args, audio_dir: str, **kwargs):
    super().__init__(*args, **kwargs)
    try:
        self.audio_parts = audio_utils.load_audio(audio_dir)
        self.audio_export_stream = type(self)._add_stream(
            container=self.container, template=self.audio_parts[0].stream
        )
    except audio_utils.NoAudioLoadedError:
        logger.debug("Could not mux audio. File not found.")
        self.audio_parts = None
        return
    # setup stateful packet iterator
    self.audio_packet_iterator = self.iterate_audio_packets()
def pyav_decode(
    vid_path,
    container,
    sampling_rate,
    num_frames,
    clip_idx,
    num_clips=10,
    target_fps=30,
    aug_audio=[],
    decode_audio=True,
    num_sec=1,
    aud_sample_rate=48000,
    aud_spec_type=1,
    use_volume_jittering=False,
    use_temporal_jittering=False,
    z_normalize=False,
):
    """
    Convert the video from its original fps to the target_fps. If the video
    supports selective decoding (i.e. it contains decoding information in the
    video header), perform temporal selective decoding and sample a clip from
    the video with the PyAV decoder. If the video does not support selective
    decoding, decode the entire video.

    Args:
        vid_path (str): path to the video file, used for audio loading and logging.
        container (container): pyav container.
        sampling_rate (int): frame sampling rate (interval between two sampled frames).
        num_frames (int): number of frames to sample.
        clip_idx (int): if clip_idx is -1, perform random temporal sampling.
            If clip_idx is larger than -1, uniformly split the video into
            num_clips clips and select the clip_idx-th one.
        num_clips (int): overall number of clips to uniformly sample from the given video.
        target_fps (int): the input video may have a different fps; convert it to
            the target fps before frame sampling.
    Returns:
        frames (tensor): decoded frames from the video. None if no video stream was found.
        spec (tensor): audio spectrogram, or None if decode_audio is False.
        fps (float): the number of frames per second of the video.
        decode_all_video (bool): True if the entire video was decoded.
    """
    # Try to fetch the decoding information from the video header. Some videos
    # do not support fetching the decoding information; in that case the
    # duration is None.
    fps = float(container.streams.video[0].average_rate)
    frames_length = container.streams.video[0].frames
    duration = container.streams.video[0].duration
    if duration is None:
        # Failed to fetch the decoding information: decode the entire video.
        decode_all_video = True
        video_start_pts, video_end_pts = 0, math.inf
        start_idx = 0  # audio is read from the start of the video in this case
    else:
        # Perform selective decoding.
        decode_all_video = False
        start_idx, end_idx = get_start_end_idx(
            frames_length,
            sampling_rate * num_frames / target_fps * fps,
            clip_idx,
            num_clips,
        )
        timebase = duration / frames_length
        video_start_pts = int(start_idx * timebase)
        video_end_pts = int(end_idx * timebase)

    frames = None
    # If a video stream was found, fetch video frames from it.
    if container.streams.video:
        video_frames, max_pts = pyav_decode_stream(
            container,
            video_start_pts,
            video_end_pts,
            container.streams.video[0],
            {"video": 0},
        )
        container.close()
        frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
        frames = torch.as_tensor(np.stack(frames))
        # Get the audio spectrogram
        if decode_audio:
            spec = None
            try:
                fr_sec = start_idx / fps
                spec = load_audio(
                    vid_path,
                    fr_sec,
                    num_sec=num_sec,
                    sample_rate=aud_sample_rate,
                    aug_audio=aug_audio,
                    aud_spec_type=aud_spec_type,
                    use_volume_jittering=use_volume_jittering,
                    use_temporal_jittering=use_temporal_jittering,
                    z_normalize=z_normalize,
                )
            except Exception:
                print(f"Bad audio of video: {vid_path}", flush=True)
                if spec is not None:
                    print(f"Bad spec shape of {vid_path}: {spec.shape}", flush=True)
                return None, None, None, None
            # print(vid_path, frames.shape, spec.shape)
            return frames, spec, fps, decode_all_video
        else:
            return frames, None, fps, decode_all_video
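# Usage sketch for pyav_decode; 'clip.mp4' is a placeholder path and the sampling
# parameters are illustrative. Assumes PyAV (av) is installed; note that
# pyav_decode closes the container itself.
import av

container = av.open('clip.mp4')
frames, spec, fps, decoded_all = pyav_decode(
    'clip.mp4',
    container,
    sampling_rate=2,   # keep every 2nd frame
    num_frames=16,     # sample a 16-frame clip
    clip_idx=-1,       # -1 -> random temporal sampling
)
if frames is not None:
    print(frames.shape, fps, decoded_all)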
def __init__(self, g_pool):
    super().__init__(g_pool)
    self.play = False
    self.pa_stream = None
    self.audio_sync = 0.0
    self.audio_delay = 0.0
    self.next_audio_frame = None
    self.audio_start_pts = 0
    self.check_ts_consistency = False
    self.req_audio_volume = 1.0
    self.current_audio_volume = 1.0
    self.req_buffer_size_secs = 0.5
    self.audio_viz_trans = None
    try:
        self.audio = load_audio(self.g_pool.rec_dir)
    except NoAudioLoadedError:
        return
    self.audio_bytes_fifo = []
    self.next_audio_frame = self._next_audio_frame()
    self.audio_resampler = av.audio.resampler.AudioResampler(
        format=self.audio.stream.format.packed,
        layout=self.audio.stream.layout,
        rate=self.audio.stream.rate,
    )
    self.audio_paused = False
    af0, af1 = next(self.next_audio_frame), next(self.next_audio_frame)
    # Check pts
    self.audio_pts_rate = af0.samples  # af1.pts - af0.pts
    self.audio_start_pts = 0
    logger.debug(
        "audio_pts_rate = {} start_pts = {}".format(
            self.audio_pts_rate, self.audio_start_pts
        )
    )
    if self.check_ts_consistency:
        print("**** Checking stream")
        for i, af in enumerate(self.next_audio_frame):
            fnum = i + 2
            if af.samples != af0.samples:
                print("fnum {} samples = {}".format(fnum, af.samples))
            if af.pts != self.audio_idx_to_pts(fnum):
                print(
                    "af.pts = {} fnum = {} idx2pts = {}".format(
                        af.pts, fnum, self.audio_idx_to_pts(fnum)
                    )
                )
            if (
                self.audio.timestamps[fnum]
                != self.audio.timestamps[0] + af.pts * self.audio.stream.time_base
            ):
                print(
                    "ts[0] + af.pts = {} fnum = {} timestamp = {}".format(
                        self.audio.timestamps[0]
                        + af.pts * self.audio.stream.time_base,
                        fnum,
                        self.audio.timestamps[fnum],
                    )
                )
        print("**** Done")
    self.seek_to_audio_frame(0)
    logger.debug(
        "Audio file format {} chans {} rate {} framesize {}".format(
            self.audio.stream.format.name,
            self.audio.stream.channels,
            self.audio.stream.rate,
            self.audio.stream.frame_size,
        )
    )
    self.audio_start_time = 0
    self.audio_measured_latency = -1.0
    self.last_dac_time = 0
    self.filter_graph = None
    self.filter_graph_list = None
    try:
        self.pa = pa.PyAudio()
        self.pa_stream = self.pa.open(
            format=self.pa.get_format_from_width(self.audio.stream.format.bytes),
            channels=self.audio.stream.channels,
            rate=self.audio.stream.rate,
            frames_per_buffer=self.audio.stream.frame_size,
            stream_callback=self.audio_callback,
            output=True,
            start=False,
        )
        logger.debug(
            "Audio output latency: {}".format(self.pa_stream.get_output_latency())
        )
        self.audio_sync = self.pa_stream.get_output_latency()
        self.audio_reported_latency = self.pa_stream.get_output_latency()
    except ValueError:
        self.pa_stream = None
    self.audio_timeline = None
    self.audio_viz_trans = Audio_Viz_Transform(self.g_pool.rec_dir)
    self.audio_viz_data = None
    self.log_scale = False
    self.xlim = (self.g_pool.timestamps[0], self.g_pool.timestamps[-1])
    self.ylim = (0, 210)