                'w')
            max_length_f.write(u'{0}\n'.format(total_length))
            max_length_f.close()

        elif 'rescore' in config or 'predict_next' in config:
            # use the trained model for rescoring: process the data sentence per sentence
            print('For rescoring: sentence per sentence')
            data = lm_data.charSentenceDataRescore(config, eval_config, TRAIN, VALID, TEST)
            all_data, vocab_size, _ = data.get_data()

        elif 'debug2' in config:
            raise NotImplementedError(
                "Generating a debug2 file with a character-level "
                "model is not implemented.")

        else:
            data = lm_data.charData(config, eval_config, TRAIN, VALID, TEST)
            all_data, vocab_size, _ = data.get_data()

    # word-level training, on sentence level (sentences are padded to the maximum sentence length)
    elif 'per_sentence' in config:
        if 'char_ngram' in config:
            raise NotImplementedError(
                "Models with character n-gram input are only "
                "implemented on discourse level.")
        elif 'word_char_concat' in config:
            raise NotImplementedError(
                "Models with concatenated word and character embeddings "
                "as input are only implemented at discourse level.")

        # do not read all data at once (for large datasets/small memory)
        config: dictionary containing configuration options (for training and validation)
        eval_config: dictionary containing configuration options (for testing)
        (TRAIN, VALID, TEST): tuple of booleans indicating whether we should train, validate and/or test
    Returns:
        config: dictionary containing configuration options (for training and validation)
        eval_config: dictionary containing configuration options (for testing)
        data: data object
        train_data: training data mapped to indices (can be a single list or a tuple of lists, depending on the type of model)
        valid_data: validation data mapped to indices
        test_data: test data mapped to indices
        (TRAIN, VALID, TEST): tuple of booleans indicating whether we should train, validate and/or test
    '''

    # character-level training, in batches (across sentence boundaries)
    if 'char' in config:
        data = lm_data.charData(config, eval_config)
        all_data, vocab_size, _ = data.get_data()

    # word-level training, on sentence level (sentences are padded to the maximum sentence length)
    elif 'per_sentence' in config:
        if 'rescore' in config:
            # read the maximum (padded) sentence length that was stored when the model was trained
            max_length = int(open('{0}max_length'.format(
                config['trained_model'])).readlines()[0].strip())
            # set num_steps = total length of each (padded) sentence
            config['num_steps'] = max_length
            data = lm_data.wordSentenceDataRescore(config, eval_config)
            all_data, vocab_size, _ = data.get_data()
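        # Illustrative example (hypothetical values; only the key names are taken from the
        # checks above): the branches only test whether a key is present in the config
        # dictionary, so the word-level rescoring path would be selected by a config
        # containing at least
        #     {'per_sentence': True,
        #      'rescore': 'hypotheses_to_rescore.txt',
        #      'trained_model': '/path/to/trained/model/'}
        # where 'trained_model' is assumed to be the path prefix under which the training
        # run stored its 'max_length' file.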