def __init__(self,
             corpus_name: str,
             base_directory: Path,
             base_source_url_or_directory: str = "ketos:/projects/korpora/speech/",
             umlaut_decoder: Callable[[str], str] = UmlautDecoder.quote_before_umlaut,
             tar_gz_extension: str = ".tgz",
             mel_frequency_count: int = 128,
             root_compressed_directory_name_to_skip: Optional[str] = None,
             subdirectory_depth: int = 2,
             tags_to_ignore: Iterable[str] = _tags_to_ignore,
             id_filter_regex=re.compile(r'[\s\S]*'),
             training_test_split: Callable[[List[LabeledExample]], Tuple[
                 List[LabeledExample], List[LabeledExample]]] =
             TrainingTestSplit.randomly_grouped_by_directory()):
    self.umlaut_decoder = umlaut_decoder

    log("Parsing corpus {}...".format(corpus_name))

    super().__init__(
        base_directory=base_directory,
        base_source_url_or_directory=base_source_url_or_directory,
        corpus_name=corpus_name,
        tar_gz_extension=tar_gz_extension,
        root_compressed_directory_name_to_skip=root_compressed_directory_name_to_skip,
        subdirectory_depth=subdirectory_depth,
        allowed_characters=german_frequent_characters,
        tags_to_ignore=tags_to_ignore,
        id_filter_regex=id_filter_regex,
        mel_frequency_count=mel_frequency_count,
        training_test_split=training_test_split,
        maximum_example_duration_in_s=35,
        # presumably: 2 striding convolutions of stride 2, hop length 128 samples, 16 kHz sampling rate
        minimum_duration_per_character=2 * 2 * 128 / 16000)
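# `UmlautDecoder.quote_before_umlaut` is not shown in this listing; judging by
# its name, it presumably decodes annotation spellings that place a quote
# before the umlauted letter (hypothetical sketch):
def quote_before_umlaut(text: str) -> str:
    return text.replace('"a', 'ä').replace('"o', 'ö').replace('"u', 'ü')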
def __init__(self,
             training_examples: List[LabeledExample],
             test_examples: List[LabeledExample],
             sampled_training_example_count: Optional[int] = None):
    self.training_examples = training_examples if sampled_training_example_count is None else \
        random.Random(42).sample(training_examples, sampled_training_example_count)
    self.sampled_training_example_count = sampled_training_example_count
    self.test_examples = test_examples
    self.examples = training_examples + test_examples

    log("Training on {} examples, testing on {} examples.".format(
        len(self.training_examples), len(self.test_examples)))

    duplicate_training_ids = duplicates(e.id for e in training_examples)
    if len(duplicate_training_ids) > 0:
        raise ValueError("Duplicate ids in training examples: {}".format(duplicate_training_ids))

    duplicate_test_ids = duplicates(e.id for e in test_examples)
    if len(duplicate_test_ids) > 0:
        raise ValueError("Duplicate ids in test examples: {}".format(duplicate_test_ids))

    overlapping_ids = duplicates(e.id for e in self.examples)
    if len(overlapping_ids) > 0:
        raise ValueError("Overlapping training and test set: {}".format(overlapping_ids))
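# The `duplicates` helper used above is not shown in this listing; a minimal
# sketch of what it presumably does (hypothetical implementation):
from collections import Counter
from typing import Iterable, List, TypeVar

T = TypeVar('T')

def duplicates(items: Iterable[T]) -> List[T]:
    """Returns every item that occurs more than once in the given iterable."""
    return [item for item, count in Counter(items).items() if count > 1]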
def _load_from_cache(self):
    try:
        return numpy.load(str(self.spectrogram_cache_file))
    except ValueError:
        log("Recalculating cached file {} because loading failed.".format(
            self.spectrogram_cache_file))
        return self._calculate_and_save_spectrogram()
def validate_to_csv(model_name: str,
                    last_epoch: int,
                    configuration: Configuration = Configuration.german(),
                    step_count: int = 10,
                    first_epoch: int = 0,
                    # evaluated at definition time, so this refers to the module-level `configuration`:
                    csv_directory: Path = configuration.default_data_directories.test_results_directory
                    ) -> List[Tuple[int, ExpectationsVsPredictionsInGroupedBatches]]:
    step_size = (last_epoch - first_epoch) / (step_count - 1)
    epochs = distinct(
        list(int(first_epoch + index * step_size) for index in range(step_count)))
    log("Testing model {} on epochs {}.".format(model_name, epochs))

    model = configuration.load_model(
        model_name, last_epoch,
        allowed_characters_for_loaded_model=configuration.allowed_characters,
        use_kenlm=True,
        language_model_name_extension="-incl-trans")

    def get_result(epoch: int) -> ExpectationsVsPredictionsInGroupedBatches:
        log("Testing epoch {}.".format(epoch))
        model.load_weights(
            allowed_characters_for_loaded_model=configuration.allowed_characters,
            load_model_from_directory=configuration.directories.nets_base_directory / model_name,
            load_epoch=epoch)
        return configuration.test_model_grouped_by_loaded_corpus_name(model)

    results_with_epochs = []
    csv_file = csv_directory / "{}.csv".format(model_name + "-incl-trans")

    import csv
    with csv_file.open('w', encoding='utf8') as opened_csv:
        writer = csv.writer(opened_csv, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for epoch in epochs:
            result = get_result(epoch)
            # this append was missing; without it the function always returned an empty list
            results_with_epochs.append((epoch, result))
            writer.writerow((epoch, result.average_loss,
                             result.average_letter_error_rate,
                             result.average_word_error_rate,
                             result.average_letter_error_count,
                             result.average_word_error_count))

    return results_with_epochs
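# `distinct` is not shown in this listing; presumably it removes the duplicate
# epochs produced by rounding while keeping order (hypothetical sketch):
from collections import OrderedDict
from typing import List

def distinct(items: List) -> List:
    """Returns the items with duplicates removed, preserving the original order."""
    return list(OrderedDict.fromkeys(items))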
def test_and_predict_batches_with_log(
        self, corpus_name: str,
        batches: Iterable[List[LabeledSpectrogram]]) -> ExpectationsVsPredictionsInBatches:
    result = self.test_and_predict_batches(batches)
    log("{}: {}".format(corpus_name, result))
    return result
def test_and_predict_batch_with_log(
        self, index: int, batch: List[LabeledSpectrogram]) -> ExpectationsVsPredictions:
    result = self.test_and_predict_batch(batch)
    log(str(result) + " (batch {})".format(index))
    return result
def _download_if_not_yet_done(self, source_path_or_url: str, target_path: Path) -> Path:
    if not target_path.is_file():
        log("Downloading corpus {} to {}".format(source_path_or_url, target_path))
        if self.base_url_or_directory.startswith("http"):
            request.urlretrieve(source_path_or_url, str(target_path))
        else:
            try:
                subprocess.check_output(["scp", source_path_or_url, str(target_path)],
                                        stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as e:
                raise IOError("Copying failed: " + str(e.output))

    return target_path
def train_transfer_from_best_english_model(self,
                                           frozen_layer_count: int,
                                           reinitialize_trainable_loaded_layers: bool = False):
    run_name = timestamp() + "-adam-small-learning-rate-transfer-to-{}-freeze-{}{}{}".format(
        self.name, frozen_layer_count,
        "-reinitialize" if reinitialize_trainable_loaded_layers else "",
        self.sampled_training_example_count_extension())

    log("Run: " + run_name)

    wav2letter = self.load_best_english_model(
        frozen_layer_count=frozen_layer_count,
        reinitialize_trainable_loaded_layers=reinitialize_trainable_loaded_layers)

    self.train(wav2letter, run_name=run_name)
def test_model_grouped_by_loaded_corpus_name(self, wav2letter) -> ExpectationsVsPredictionsInGroupedBatches:
    def corpus_name(example: LabeledExampleFromFile) -> str:
        return example.audio_directory.relative_to(self.corpus_directory).parts[0]

    corpus_by_name = self.corpus.grouped_by(corpus_name)

    log([(name, len(corpus.test_examples)) for name, corpus in corpus_by_name.items()])
    result = wav2letter.test_and_predict_grouped_batches(OrderedDict(
        (corpus_name, self.batch_generator_for_corpus(corpus).test_batches())
        for corpus_name, corpus in corpus_by_name.items()))
    log(result)

    return result
def test(self):
    l1 = LoggedRun(lambda: log("1"), "test1", Path())
    l1()
    self.assertEqual("1\n", l1.result_file.read_text())

    l2 = LoggedRun(lambda: log("2"), "test2", Path())
    l2()
    self.assertEqual("1\n", l1.result_file.read_text())
    self.assertEqual("2\n", l2.result_file.read_text())

    l1.result_file.unlink()
    l2.result_file.unlink()
def _extract_positional_label_by_id(
        self, files: Iterable[Path]) -> Dict[str, Union[PositionalLabel, str]]:
    json_ending = "_annot.json"
    json_annotation_files = [
        file for file in files
        if file.name.endswith(json_ending)
        and self.id_filter_regex.match(file.name[:-len(json_ending)])]

    json_extracted = OrderedDict(
        (file.name[:-len(json_ending)], self._extract_positional_label_from_json(file))
        for file in json_annotation_files)

    par_annotation_files = [
        file for file in files
        if file.name.lower().endswith(".par")
        and self.id_filter_regex.match(name_without_extension(file).lower())]

    extracted = OrderedDict(
        (name_without_extension(file), self._extract_label_from_par(file))
        for file in par_annotation_files)

    for key in set(extracted.keys()).intersection(set(json_extracted.keys())):
        json = json_extracted[key]
        json_label = json if isinstance(json, str) else json.label

        if extracted[key] != json_label:
            log('{}: "{}" extracted from par differs from json "{}"'.format(
                key, extracted[key], json_label))

    # json has positional information and overrides par
    extracted.update(json_extracted)

    # TODO refactor
    if "ALC" in self.corpus_name:
        # exactly half have no label: can be fixed by using 0061006007_h_00.par
        # or _annot.json instead of 0061006007_m_00_annot.json etc.
        correctly_labeled_id_marker = "_h_"
        empty_labeled_id_marker = "_m_"

        correct_ids = [id for id in extracted.keys() if correctly_labeled_id_marker in id]
        for correct_id in correct_ids:
            empty_labeled_id = correct_id.replace(correctly_labeled_id_marker, empty_labeled_id_marker)
            extracted[empty_labeled_id] = extracted[correct_id]

    return extracted
def merge_consecutive_ranges(ranges: List[Tuple[int, int]]) -> Tuple[int, int]:
    def is_not_empty(range: Tuple[int, int]) -> bool:
        return range[0] + 1 != range[1]

    not_empty = sorted((range for range in ranges if is_not_empty(range)),
                       key=lambda range: range[0])

    # check that each non-empty range ends where its successor begins
    for index, range in enumerate(not_empty[:-1]):
        next_range = not_empty[index + 1]
        if range[1] != next_range[0]:
            log("Ranges {} of a word are not consecutive.".format(not_empty))

    return ranges[0][0], ranges[-1][1]
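# Usage sketch: touching ranges merge into a single span; a gap between ranges
# is only logged, not raised (illustrative values):
assert merge_consecutive_ranges([(0, 3), (3, 7), (7, 9)]) == (0, 9)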
def fill_cache(self, repair_incorrect: bool = False) -> None:
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        total = len(self.labeled_spectrograms)
        not_yet_cached = [s for s in self.labeled_spectrograms if not s.is_cached()]
        to_calculate = self.labeled_spectrograms if repair_incorrect else not_yet_cached

        log("Filling cache with {} spectrograms: {} already cached, {} to calculate.".format(
            total, total - len(not_yet_cached), len(to_calculate)))

        for labeled_spectrogram in to_calculate:
            pool.apply_async(
                _repair_cached_spectrogram_if_incorrect if repair_incorrect else _cache_spectrogram,
                (labeled_spectrogram,))

        pool.close()
        pool.join()
def indices_to_load_by_target_index(
        allowed_characters_for_loaded_model: List[str],
        allowed_characters: List[str]) -> List[Optional[int]]:
    load_character_set = set(allowed_characters_for_loaded_model)
    target_character_set = set(allowed_characters)

    ignored = load_character_set - target_character_set
    if ignored:
        log("Ignoring characters {} from loaded model.".format(sorted(ignored)))

    extra = target_character_set - load_character_set
    if extra:
        log("Initializing extra characters {} not found in model.".format(sorted(extra)))

    def character_index_to_load(target_character: str) -> Optional[int]:
        return single_or_none([
            index for index, character in enumerate(allowed_characters_for_loaded_model)
            if character == target_character])

    character_mapping = [character_index_to_load(character) for character in allowed_characters]

    log("Character mapping: {}".format(character_mapping))

    return character_mapping
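# For illustration (hypothetical character sets): loading a model without
# umlauts into a net whose alphabet contains 'ä' maps the shared characters by
# index and leaves 'ä' to be freshly initialized:
#
# indices_to_load_by_target_index(list("abc"), list("bcä"))  # -> [1, 2, None]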
def train(self,
          labeled_spectrogram_batches: Iterable[List[LabeledSpectrogram]],
          preview_labeled_spectrogram_batch: List[LabeledSpectrogram],
          tensor_board_log_directory: Path,
          net_directory: Path,
          batches_per_epoch: int):
    def print_preview_batch():
        return log(self.test_and_predict_batch(preview_labeled_spectrogram_batch))

    print_preview_batch()
    self.loss_net.fit_generator(
        self._loss_inputs_generator(labeled_spectrogram_batches),
        epochs=100000000,  # effectively: train until stopped manually
        steps_per_epoch=batches_per_epoch,
        callbacks=self.create_callbacks(
            callback=print_preview_batch,
            tensor_board_log_directory=tensor_board_log_directory,
            net_directory=net_directory),
        initial_epoch=self.load_epoch if (self.load_epoch is not None) else 0)
freeze8 = ("20170525-181412-adam-small-learning-rate-transfer-to-German-freeze-8", 1924)
freeze8_100h = ("20170525-181449-adam-small-learning-rate-transfer-to-German-freeze-8-50000examples", 1966)
freeze8_20h = ("20170525-181524-adam-small-learning-rate-transfer-to-German-freeze-8-10000examples", 2033)

if gethostname() == "ketos":
    ketos_spectrogram_cache_base_directory = \
        configuration.default_data_directories.data_directory / "ketos-spectrogram-cache"
    ketos_kenlm_base_directory = configuration.default_data_directories.data_directory / "ketos-kenlm"
    log("Running on ketos, using spectrogram cache base directory {} and kenlm base directory {}".format(
        ketos_spectrogram_cache_base_directory, ketos_kenlm_base_directory))
    configuration.default_data_directories.spectrogram_cache_base_directory = \
        ketos_spectrogram_cache_base_directory
    configuration.default_data_directories.kenlm_base_directory = ketos_kenlm_base_directory
else:
    restrict_gpu_memory()

# Configuration.german().train_from_beginning()
# Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=8, reinitialize_trainable_loaded_layers=True)
# Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=0)
# Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=6)
# Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=9)
# Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=10)
# Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=8)
# Configuration.german(sampled_training_example_count_when_loading_from_cached=50000).train_transfer_from_best_english_model(frozen_layer_count=8)
def test_model(self, wav2letter):
    log(wav2letter.test_and_predict_batch(self.batch_generator.preview_batch()))
    log(wav2letter.test_and_predict_batches(self.batch_generator.test_batches()))
def summarize_and_save_corpus(self):
    log(self.corpus.summary())
    self.corpus.summarize_to_csv(self.corpus_directory / "summary.csv")
    self.save_corpus()
def duration_in_s(self) -> float:
    try:
        return librosa.get_duration(filename=str(self.audio_file))
    except Exception as e:
        log("Failed to get duration of {}: {}".format(self.audio_file, e))
        return 0
def load_weights(self,
                 allowed_characters_for_loaded_model: List[str],
                 load_epoch: int,
                 load_model_from_directory: Path,
                 loaded_first_layers_count: Optional[int] = None):
    if allowed_characters_for_loaded_model is None:
        self.predictive_net.load_weights(
            str(load_model_from_directory / self.model_file_name(load_epoch)))
    else:
        layer_count = len(self.predictive_net.layers)
        if loaded_first_layers_count is None:
            loaded_first_layers_count = layer_count

        original_wav2letter = Wav2Letter(
            input_size_per_time_step=self.input_size_per_time_step,
            allowed_characters=allowed_characters_for_loaded_model,
            use_raw_wave_input=self.use_raw_wave_input,
            activation=self.activation,
            output_activation=self.output_activation,
            optimizer=self.optimizer,
            dropout=self.dropout,
            load_model_from_directory=load_model_from_directory,
            load_epoch=load_epoch,
            frozen_layer_count=self.frozen_layer_count,
            use_asg=self.use_asg,
            asg_initial_probabilities=self.asg_initial_probabilities,
            asg_transition_probabilities=self.asg_transition_probabilities)

        log("Loading first {} layers of {}, epoch {}, reinitializing the last {}.".format(
            loaded_first_layers_count, load_model_from_directory, load_epoch,
            layer_count - loaded_first_layers_count))

        for index, layer in enumerate(self.predictive_net.layers[:loaded_first_layers_count]):
            original_weights, original_biases = \
                original_wav2letter.predictive_net.layers[index].get_weights()

            if index == len(self.predictive_net.layers) - 1:
                # the output layer maps to graphemes: remap its filters from the
                # loaded character set to the target character set
                indices_to_load_by_target_index = self.indices_to_load_by_target_index(
                    allowed_characters_for_loaded_model,
                    self.grapheme_encoding.allowed_characters)

                def get_grapheme_index_to_load(target_grapheme_index: int):
                    if target_grapheme_index == self.grapheme_encoding.ctc_blank:
                        return original_wav2letter.grapheme_encoding.ctc_blank
                    return indices_to_load_by_target_index[target_grapheme_index]

                original_shape = original_weights.shape

                def loaded_character_weights(index: Optional[int]) -> ndarray:
                    # compare with `is not None`, not truthiness: 0 is a valid character index
                    return original_weights[:, :, index:index + 1] if index is not None else \
                        zeros((original_shape[0], original_shape[1], 1))

                def loaded_character_bias(index: Optional[int]) -> float:
                    return original_biases[index] if index is not None else 0

                grapheme_indices_to_load = \
                    [get_grapheme_index_to_load(target_grapheme_index)
                     for target_grapheme_index in range(self.grapheme_encoding.grapheme_set_size)]

                original_weights = numpy.concatenate(
                    [loaded_character_weights(index) for index in grapheme_indices_to_load], axis=2)
                original_biases = numpy.array(
                    [loaded_character_bias(index) for index in grapheme_indices_to_load])

            layer.set_weights([original_weights, original_biases])
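# Usage sketch (hypothetical model name, epoch, and character list): load the
# first 8 layers of a trained English model into this net, remapping the
# output layer's weights from the English to the German character set:
#
# german_net.load_weights(
#     allowed_characters_for_loaded_model=english_frequent_characters,
#     load_epoch=1689,
#     load_model_from_directory=nets_base_directory / "english-baseline",
#     loaded_first_layers_count=8)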
def create_predictive_net(self) -> Sequential:
    """Returns the part of the net that predicts grapheme probabilities given a spectrogram.
    A loss operation is not contained. As described here: https://arxiv.org/pdf/1609.03193v2.pdf
    """

    def convolution(name: str,
                    filter_count: int,
                    filter_length: int,
                    strides: int = 1,
                    activation: str = self.activation,
                    input_dim: int = None,
                    never_dropout: bool = False) -> List[Layer]:
        return ([] if self.dropout is None or never_dropout else [
            Dropout(self.dropout,
                    input_shape=(None, input_dim),
                    name="dropout_before_{}".format(name))
        ]) + [
            Conv1D(filters=filter_count,
                   kernel_size=filter_length,
                   strides=strides,
                   activation=activation,
                   name=name,
                   input_shape=(None, input_dim),
                   padding="same")
        ]

    main_filter_count = 250

    def input_convolutions() -> List[Conv1D]:
        raw_wave_convolution_if_needed = convolution(
            "wave_conv",
            filter_count=main_filter_count,
            filter_length=250,
            strides=160,
            input_dim=self.input_size_per_time_step) if self.use_raw_wave_input else []

        return raw_wave_convolution_if_needed + convolution(
            "striding_conv",
            filter_count=main_filter_count,
            filter_length=48,
            strides=2,
            input_dim=None if self.use_raw_wave_input else self.input_size_per_time_step)

    def inner_convolutions() -> List[Conv1D]:
        return [layer
                for i in range(1, 8)
                for layer in convolution("inner_conv_{}".format(i),
                                         filter_count=main_filter_count,
                                         filter_length=7)]

    def output_convolutions() -> List[Conv1D]:
        out_filter_count = 2000
        return [layer
                for conv in [
                    convolution("big_conv_1",
                                filter_count=out_filter_count,
                                filter_length=32,
                                never_dropout=True),
                    convolution("big_conv_2",
                                filter_count=out_filter_count,
                                filter_length=1,
                                never_dropout=True),
                    convolution("output_conv",
                                filter_count=self.grapheme_encoding.grapheme_set_size,
                                filter_length=1,
                                activation=self.output_activation,
                                never_dropout=True)
                ] for layer in conv]

    layers = input_convolutions() + inner_convolutions() + output_convolutions()

    if self.frozen_layer_count > 0:
        log("All but {} layers frozen.".format(len(layers) - self.frozen_layer_count))
        for layer in layers[:self.frozen_layer_count]:
            layer.trainable = False

    return Sequential(layers)
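# Usage sketch (hypothetical values; remaining constructor arguments elided):
# a spectrogram-input net with 128 mel frequencies whose first 8 layers are
# frozen for transfer learning:
#
# net = Wav2Letter(input_size_per_time_step=128,
#                  allowed_characters=german_frequent_characters,
#                  use_raw_wave_input=False,
#                  frozen_layer_count=8)
# predictive_net = net.create_predictive_net()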