def _extract_positional_label_by_id(
        self, files: Iterable[Path]
) -> Dict[str, Union[PositionalLabel, str]]:
    """Build the mapping from example id to label out of annotation files.

    Two kinds of annotation files are considered:

    * ``<id>_annot.json`` files, read with
      ``self._extract_positional_label_from_json``;
    * ``<id>.par`` files (extension matched case-insensitively), read with
      ``self._extract_label_from_par``.

    Only ids accepted by ``self.id_filter_regex`` are used. When both kinds
    exist for the same id, a differing label text is logged and the json
    result replaces the par result.

    If ``self.corpus_name`` contains ``"ALC"``, the label of every id
    containing ``_h_`` is additionally stored under the id obtained by
    replacing ``_h_`` with ``_m_``.

    Args:
        files: Candidate files. Any iterable is accepted, including a
            one-shot iterator.

    Returns:
        An ordered mapping: par ids first (in file order), then json-only
        ids, then any ALC ``_m_`` ids that were not present before.
    """
    # Materialize once: the files are scanned twice below (json, then par),
    # and a one-shot iterator would be exhausted after the first scan.
    files = list(files)

    json_ending = "_annot.json"

    def json_id(file: Path) -> str:
        return file.name[:-len(json_ending)]

    json_extracted = OrderedDict(
        (json_id(file), self._extract_positional_label_from_json(file))
        for file in files
        if file.name.endswith(json_ending)
        and self.id_filter_regex.match(json_id(file)))

    # NOTE(review): the par filter is applied to the lower-cased id while the
    # resulting key keeps its original case (and the json branch filters the
    # id as-is) - confirm this asymmetry is intended.
    extracted = OrderedDict(
        (name_without_extension(file), self._extract_label_from_par(file))
        for file in files
        if file.name.lower().endswith(".par")
        and self.id_filter_regex.match(name_without_extension(file).lower()))

    # Walk the ordered par mapping instead of a set intersection so that the
    # log messages appear in a deterministic order.
    for key, par_label in extracted.items():
        if key not in json_extracted:
            continue

        json_entry = json_extracted[key]
        json_label = json_entry if isinstance(json_entry, str) else json_entry.label

        if par_label != json_label:
            log('{}: "{}" extracted from par differ from json "{}"'.format(
                key, par_label, json_label))

    # json has positional information and overrides par
    extracted.update(json_extracted)

    # TODO refactor
    if "ALC" in self.corpus_name:
        # exactly half have no label: can be fixed by using
        # 0061006007_h_00.par or _annot.json instead of
        # 0061006007_m_00_annot.json etc.
        correctly_labeled_id_marker = "_h_"
        empty_labeled_id_marker = "_m_"

        # Snapshot the ids first because the mapping is modified in the loop.
        correct_ids = [
            example_id for example_id in extracted
            if correctly_labeled_id_marker in example_id
        ]

        for correct_id in correct_ids:
            empty_labeled_id = correct_id.replace(
                correctly_labeled_id_marker, empty_labeled_id_marker)
            extracted[empty_labeled_id] = extracted[correct_id]

    return extracted
def __init__(self,
             audio_file: Path,
             id: Optional[str] = None,
             sample_rate_to_convert_to: int = 16000,
             label: Optional[str] = "nolabel",
             fourier_window_length: int = 512,
             hop_length: int = 128,
             mel_frequency_count: int = 128,
             label_with_tags: str = None,
             positional_label: Optional[PositionalLabel] = None):
    """Create an example whose raw audio is loaded from ``audio_file`` on demand.

    The default values for ``hop_length`` and ``fourier_window_length`` are
    powers of 2 near the values specified in the wave2letter paper.

    When ``id`` is omitted it falls back to
    ``name_without_extension(audio_file)``. ``sample_rate_to_convert_to`` is
    handed to the base class as ``sample_rate``; every other argument is
    forwarded under its own name.
    """
    example_id = name_without_extension(audio_file) if id is None else id

    self.audio_file = audio_file

    def load_raw_audio():
        # Executed only when the base class calls it, so self.audio_file and
        # self.sample_rate are read at that time rather than here.
        # NOTE(review): self.sample_rate is presumably set by the base class
        # from the sample_rate argument below - confirm.
        loaded = librosa.load(str(self.audio_file), sr=self.sample_rate)
        return loaded[0]

    super().__init__(
        id=example_id,
        get_raw_audio=load_raw_audio,
        label=label,
        sample_rate=sample_rate_to_convert_to,
        fourier_window_length=fourier_window_length,
        hop_length=hop_length,
        mel_frequency_count=mel_frequency_count,
        label_with_tags=label_with_tags,
        positional_label=positional_label)
def move_incorrect_cached_file_to_backup_location_and_save_error(
        self, error_text: str):
    """Move the spectrogram cache file aside and store ``error_text`` next to it.

    The backup directory is a sibling of the cache file's directory, named
    like that directory with ``-incorrect`` appended. ``error_text`` is
    written there to ``<cache file name without extension>-error.txt``
    before the cache file itself is renamed into the backup directory.
    """
    cache_file = self.spectrogram_cache_file
    cache_directory = Path(cache_file.parent)

    backup_directory = Path(
        cache_directory.parent / f"{cache_directory.name}-incorrect")
    mkdir(backup_directory)

    backup_target = backup_directory / cache_file.name
    error_message_file = backup_directory / (
        name_without_extension(cache_file) + "-error.txt")

    # The message is written first; the rename is the last step.
    write_text(error_message_file, error_text)
    cache_file.rename(backup_target)
def _extract_positional_label_by_id(
        self, files: Iterable[Path]) -> Dict[str, Union[PositionalLabel, str]]:
    """Map ``<xml id><microphone ending>`` to the label read from the xml file.

    Every ``.xml`` file whose name without extension matches
    ``self.id_filter_regex`` is combined with each known microphone ending.
    A combination is kept only if the matching ``.wav`` file exists in the
    same directory as the xml file; its value is the result of
    ``self._extract_label_from_xml`` for that xml file.
    """
    xml_ending = ".xml"
    wav_ending = ".wav"

    microphone_endings = [
        "_Yamaha",
        "_Kinect-Beam",
        "_Kinect-RAW",
        "_Realtek",
        "_Samson",
        "_Microsoft-Kinect-Raw",
    ]

    # Phase 1: select the xml files that pass the id filter.
    selected = []
    for candidate in files:
        if not candidate.name.endswith(xml_ending):
            continue
        base_id = name_without_extension(candidate)
        if self.id_filter_regex.match(base_id):
            selected.append((candidate, base_id))

    # Phase 2: one entry per microphone recording that is actually present.
    label_by_id = OrderedDict()
    for xml_file, base_id in selected:
        directory = Path(xml_file.parent)
        for ending in microphone_endings:
            recording_id = base_id + ending
            if (directory / (recording_id + wav_ending)).exists():
                label_by_id[recording_id] = self._extract_label_from_xml(xml_file)

    return label_by_id
def example(audio_file: Path) -> LabeledExample:
    """Create the ``LabeledExampleFromFile`` for ``audio_file``.

    ``self`` and ``positional_label_by_id`` are not parameters; they are
    taken from the enclosing scope. The entry stored under the file's name
    without extension is either a ``PositionalLabel`` or a plain label
    string. Labels are cleaned by removing the tags to ignore and collapsing
    whitespace; the untouched text is passed on as ``label_with_tags``.
    """

    def correct(label: str) -> str:
        without_tags = self._remove_tags_to_ignore(label)
        # split() drops leading/trailing whitespace and collapses inner runs,
        # so the joined text needs no further stripping.
        return " ".join(without_tags.split())

    raw_label = positional_label_by_id[name_without_extension(audio_file)]

    if isinstance(raw_label, PositionalLabel):
        # The sample rate of the audio file is handed to
        # convert_range_to_seconds after the labels have been corrected.
        positional_label = raw_label.with_corrected_labels(
            correct).convert_range_to_seconds(
            LabeledExampleFromFile.file_sample_rate(audio_file))
        label = positional_label.label
        label_with_tags = raw_label.label
    else:
        positional_label = None
        label = correct(raw_label)
        label_with_tags = raw_label

    return LabeledExampleFromFile(
        audio_file,
        mel_frequency_count=self.mel_frequency_count,
        label=label,
        label_with_tags=label_with_tags,
        positional_label=positional_label)
def __init__(self,
             base_directory: Path,
             corpus_name: str,
             base_source_url_or_directory: str = "http://www.openslr.org/resources/12/",
             tar_gz_extension: str = ".tar.gz",
             mel_frequency_count: int = 128,
             root_compressed_directory_name_to_skip: Optional[str] = "LibriSpeech/",
             subdirectory_depth: int = 3,
             allowed_characters: List[str] = english_frequent_characters,
             tags_to_ignore: Optional[Iterable[str]] = None,
             id_filter_regex=re.compile(r'[\s\S]*'),
             training_test_split: Callable[
                 [List[LabeledExample]],
                 Tuple[List[LabeledExample], List[LabeledExample]]
             ] = TrainingTestSplit.randomly(),
             maximum_example_duration_in_s: Optional[int] = None,
             minimum_duration_per_character: Optional[float] = None):
    """Download/unpack the corpus if necessary and build its labeled examples.

    Args:
        base_directory: Directory that is created (via ``mkdir``) before
            the corpus is downloaded/unpacked.
        corpus_name: Name passed to
            ``self._download_and_unpack_if_not_yet_done``.
        base_source_url_or_directory: Stored as
            ``self.base_url_or_directory``.
        tar_gz_extension: Stored as ``self.tar_gz_extension``.
        mel_frequency_count: Forwarded to every created example.
        root_compressed_directory_name_to_skip: Stored unchanged.
        subdirectory_depth: Number of directory levels below the corpus
            directory that are descended before files are collected.
        allowed_characters: Stored unchanged.
        tags_to_ignore: Stored as ``self.tags_to_ignore``; ``None`` (the
            default) results in a fresh empty list per instance.
        id_filter_regex: Compiled pattern; only audio files whose name
            without extension matches (``re.match``) are used. The default
            matches everything.
        training_test_split: Callable splitting the id-sorted examples
            into ``(training_examples, test_examples)``.
        maximum_example_duration_in_s: Stored unchanged
            (presumably consulted by ``self.is_too_long`` - confirm).
        minimum_duration_per_character: Stored as
            ``self.minimum_duration_per_character_in_s``
            (presumably consulted by ``self.is_too_short`` - confirm).
    """
    self.minimum_duration_per_character_in_s = minimum_duration_per_character
    self.maximum_example_duration_in_s = maximum_example_duration_in_s
    self.training_test_split = training_test_split
    self.id_filter_regex = id_filter_regex
    # A new list per instance: a list created in the signature would be
    # shared by every instance relying on the default.
    self.tags_to_ignore = [] if tags_to_ignore is None else tags_to_ignore
    self.allowed_characters = allowed_characters
    self.subdirectory_depth = subdirectory_depth
    self.root_compressed_directory_name_to_skip = root_compressed_directory_name_to_skip
    self.base_directory = base_directory
    self.base_url_or_directory = base_source_url_or_directory
    self.tar_gz_extension = tar_gz_extension
    self.mel_frequency_count = mel_frequency_count
    self.corpus_name = corpus_name

    mkdir(base_directory)

    self.corpus_directory = self._download_and_unpack_if_not_yet_done(
        corpus_name=corpus_name)

    # Descend exactly `subdirectory_depth` levels; only files located in the
    # directories of that level are collected.
    directories = [self.corpus_directory]
    for _ in range(self.subdirectory_depth):
        directories = [
            subdirectory
            for directory in directories
            for subdirectory in directory.iterdir()
            if subdirectory.is_dir()
        ]

    self.files = [
        file
        for directory in directories
        for file in directory.iterdir()
        if file.is_file()
    ]

    self.unfiltered_audio_files = [
        file for file in self.files
        if file.name.lower().endswith((".flac", ".wav"))
    ]

    audio_files = [
        file for file in self.unfiltered_audio_files
        if self.id_filter_regex.match(name_without_extension(file))
    ]

    self.filtered_out_count = len(self.unfiltered_audio_files) - len(audio_files)

    positional_label_by_id = self._extract_positional_label_by_id(self.files)

    # Bookkeeping of ids that occur on only one side (audio vs. label).
    found_audio_ids = set(name_without_extension(f) for f in audio_files)
    found_label_ids = positional_label_by_id.keys()
    self.audio_ids_without_label = list(found_audio_ids - found_label_ids)
    self.label_ids_without_audio = list(found_label_ids - found_audio_ids)

    def example(audio_file: Path) -> LabeledExample:
        """Create the example for one audio file that has a label entry."""
        id = name_without_extension(audio_file)

        def correct_whitespace(text: str) -> str:
            return " ".join(text.split()).strip()

        def correct(label: str) -> str:
            return correct_whitespace(self._remove_tags_to_ignore(label))

        original_positional_label = positional_label_by_id[id]
        has_positions = isinstance(original_positional_label, PositionalLabel)

        # Entries without positions are plain label strings.
        positional_label = original_positional_label.with_corrected_labels(
            correct).convert_range_to_seconds(
            LabeledExampleFromFile.file_sample_rate(
                audio_file)) if has_positions else None

        return LabeledExampleFromFile(
            audio_file,
            mel_frequency_count=self.mel_frequency_count,
            label=positional_label.label
            if has_positions else correct(original_positional_label),
            label_with_tags=original_positional_label.label
            if has_positions else original_positional_label,
            positional_label=positional_label)

    # Each list below is the previous one with one more filter applied; the
    # attribute names state which kinds of examples are still contained.
    self.examples_with_empty_and_too_long_or_short = [
        example(file) for file in audio_files
        if name_without_extension(file) in positional_label_by_id
    ]

    self.examples_with_too_long_or_short = [
        e for e in self.examples_with_empty_and_too_long_or_short if e.label
    ]

    self.examples_with_too_short = [
        e for e in self.examples_with_too_long_or_short
        if not self.is_too_long(e)
    ]

    examples = [
        e for e in self.examples_with_too_short if not self.is_too_short(e)
    ]

    # The examples are sorted by id before they are handed to the split.
    training_examples, test_examples = self.training_test_split(
        sorted(examples, key=lambda x: x.id))

    super().__init__(training_examples=training_examples,
                     test_examples=test_examples)