def __init__(self, batch_items, batch_size, lang):
    """
    Set up the BatchGenerator with its cursor index at 0.

    :param batch_items: sequence of anything to draw batches from (e.g. a list or a DataFrame)
    :param batch_size: number of elements in each batch
    :param lang: language to use; it is handed to get_tokens and determines
                 the tokens used for encoding the labels
    """
    self.batch_items, self.batch_size = batch_items, batch_size
    self.cur_index = 0
    self.tokens = get_tokens(lang)
def __init__(self, model, language, greedy):
    """
    Initialize the decoder.

    :param model: the trained Keras model that made the inferences; it must
                  provide layers named 'ctc' and 'the_input'
    :param language: language to use for decoding; it is handed to get_tokens
                     and determines the alphabet used for decoding
    :param greedy: whether a best-path (True) or a beam search approach (False)
                   shall be used
    """
    ctc_layer = model.get_layer('ctc')
    self.ctc_input = ctc_layer.input[0]
    self.input_data = model.get_layer('the_input').input

    # backend function from the model input (plus learning phase flag)
    # to the first input of the 'ctc' layer
    function_inputs = [self.input_data, K.learning_phase()]
    function_outputs = [self.ctc_input]
    self.test_func = K.function(function_inputs, function_outputs)

    self.greedy = greedy
    if greedy:
        self.strategy = 'best-path'
    else:
        self.strategy = 'beam search'
    self.tokens = get_tokens(language)
def create_model(target_dir, opt, dropouts, language):
    """
    Load an existing model or create (and compile) a new one, print its summary and return it.

    Besides its parameters, this function reads the module-level ``args``
    object (``model_path``, ``n_fc``, ``n_recurrent``).

    :param target_dir: directory handed to load_keras_model when ``args.model_path`` is set
    :param opt: optimizer; handed to load_keras_model for loaded models and
                used to compile newly created models
    :param dropouts: if truthy, a new model is built with deep_speech_dropout,
                     otherwise with deep_speech_lstm
    :param language: language handed to get_tokens; the number of tokens
                     (+1 for the blank token) is the number of output labels
    :return: the loaded or newly created model
    :raises SystemExit: with exit status 1 if ``args.model_path`` is set but is not a directory
    """
    tokens = get_tokens(language)
    n_labels = len(tokens) + 1  # +1 for blank token!
    print(f'using {n_labels} labels in output layer')

    if args.model_path:
        print(f'trying to load model from {target_dir}')
        # NOTE(review): the existence check is made on args.model_path while the
        # messages and the actual load use target_dir -- confirm both always
        # refer to the same directory.
        if not isdir(args.model_path):
            print(f'ERROR: directory {target_dir} does not exist!', file=sys.stderr)
            # signal the failure with a non-zero exit status
            sys.exit(1)
        model = load_keras_model(target_dir, opt)
    else:
        if dropouts:
            print('Creating new model with dropouts')
            build_model = deep_speech_dropout
        else:
            print('Creating new model without dropouts')
            build_model = deep_speech_lstm
        model = build_model(n_features=26, n_fc=args.n_fc, n_recurrent=args.n_recurrent, n_labels=n_labels)
        # NOTE(review): only newly created models are compiled here; loaded
        # models are presumably compiled by load_keras_model (it receives
        # opt) -- confirm.
        model.compile(optimizer=opt, loss=ctc)

    model.summary()
    return model
def test_decoding_german(self):
    """Decoding an integer sequence with the 'de' tokens yields the expected, stripped text."""
    german_tokens = get_tokens('de')
    encoded_labels = [6, 27, 18, 28, 5, 18, 0, 29, 0, 0]
    assert_that(
        decode(encoded_labels, german_tokens),
        is_('färöer ü'),
        'leading/trailing spaces should be stripped',
    )
def test_encoding_german(self):
    """Encoding a text with the 'de' tokens yields the expected integer sequence."""
    german_tokens = get_tokens('de')
    expected_labels = [6, 27, 18, 28, 5, 18, 0, 29]
    actual_labels = encode('färöer ü ', german_tokens)
    assert_that(
        actual_labels,
        is_(expected_labels),
        'leading/trailing spaces should be stripped',
    )
def test_decoding_english(self):
    """Decoding an integer sequence with the 'en' tokens yields the expected, stripped text."""
    english_tokens = get_tokens('en')
    encoded_labels = [6, 15, 15, 0, 2, 1, 18, 28, 28, 0, 0]
    assert_that(
        decode(encoded_labels, english_tokens),
        is_('foo bar'),
        'leading/trailing spaces should be stripped',
    )
def test_encoding_english(self):
    """Encoding a text with the 'en' tokens yields the expected integer sequence."""
    english_tokens = get_tokens('en')
    expected_labels = [6, 15, 15, 0, 2, 1, 18]
    actual_labels = encode('foo bar ', english_tokens)
    assert_that(
        actual_labels,
        is_(expected_labels),
        'leading/trailing spaces should be stripped',
    )
def test_get_tokens(self):
    """get_tokens returns 28 tokens for 'en' and 30 tokens for 'de'."""
    expected_sizes = (('en', 28), ('de', 30))
    for language, n_tokens in expected_sizes:
        assert_that(len(get_tokens(language)), is_(n_tokens))