print('Sentence-level data')

# These run modes are not implemented for bidirectional models, so refuse
# them up front, before any data is loaded.
if 'bidirectional' in config:
    if 'rescore' in config:
        raise NotImplementedError(
            "Rescoring with a bidirectional model is not (yet) implemented."
        )
    elif 'predict_next' in config:
        raise NotImplementedError(
            "Predicting the next word with a bidirectional model is not (yet) implemented."
        )
    elif 'debug2' in config:
        raise NotImplementedError(
            "Generating a debug2 file with a bidirectional model is not (yet) implemented."
        )

data = lm_data.wordSentenceData(config, eval_config, TRAIN, VALID, TEST)
all_data, vocab_size, total_length, seq_lengths = data.get_data()

# set num_steps = total length of each (padded) sentence
config['num_steps'] = total_length

print('Write max length of sentence to {0}max_length'.format(
    config['save_path']))

# Write the maximum sentence length to '<save_path>max_length'.
# The context manager closes the file even if the write raises.
with io.open('{0}max_length'.format(config['save_path']), 'w') as max_length_f:
    max_length_f.write(u'{0}\n'.format(total_length))

# rescoring with non-sentence-level LMs: prepare data sentence-level
# word-level training, on sentence level (sentences are padded until maximum sentence length) elif 'per_sentence' in config: if 'rescore' in config: max_length = int( open('{0}max_length'.format( config['trained_model'])).readlines()[0].strip()) # set num_steps = total length of each (padded) sentence config['num_steps'] = max_length data = lm_data.wordSentenceDataRescore(config, eval_config) all_data, vocab_size, _ = data.get_data() else: data = lm_data.wordSentenceData(config, eval_config) all_data, vocab_size, total_length, seq_lengths = data.get_data() # set num_steps = total length of each (padded) sentence config['num_steps'] = total_length print('Write max length of sentence to {0}max_length'.format( config['save_path'])) # write maximum sentence length to file max_length_f = open('{0}max_length'.format(config['save_path']), 'w') max_length_f.write('{0}\n'.format(total_length)) max_length_f.close() # rescoring with non-sentence-level LMs