Example #1
File: main.py Project: wuxiangli91/tf-lm
            print('Sentence-level data')

            if 'rescore' in config and 'bidirectional' in config:
                raise NotImplementedError(
                    "Rescoring with a bidirectional model is not (yet) implemented."
                )
            elif 'predict_next' in config and 'bidirectional' in config:
                raise NotImplementedError(
                    "Predicting the next word with a bidirectional model is not (yet) implemented."
                )
            elif 'debug2' in config and 'bidirectional' in config:
                raise NotImplementedError(
                    "Generating a debug2 file with a bidirectional model is not (yet) implemented."
                )

            data = lm_data.wordSentenceData(config, eval_config, TRAIN, VALID,
                                            TEST)
            all_data, vocab_size, total_length, seq_lengths = data.get_data()

            # set num_steps = total length of each (padded) sentence
            config['num_steps'] = total_length

            print('Write max length of sentence to {0}max_length'.format(
                config['save_path']))

            # write maximum sentence length to file
            with io.open('{0}max_length'.format(config['save_path']),
                         'w') as max_length_f:
                max_length_f.write(u'{0}\n'.format(total_length))

    # rescoring with non-sentence-level LMs: prepare data sentence-level
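
Both snippets share a small protocol: the training run writes the padded sentence length to a file named max_length next to the saved model, and the rescoring run (Example #2 below) reads it back so that config['num_steps'] matches the trained model. A minimal sketch of that round trip, using hypothetical './' paths and a made-up total_length; only the file name and the one-number-per-line format come from the snippets:

import io

config = {'save_path': './', 'trained_model': './'}  # hypothetical paths
total_length = 42                                    # hypothetical padded sentence length

# training side: store the padded sentence length next to the model
with io.open('{0}max_length'.format(config['save_path']), 'w') as f:
    f.write(u'{0}\n'.format(total_length))

# rescoring side: restore it so num_steps matches the trained model
with io.open('{0}max_length'.format(config['trained_model'])) as f:
    config['num_steps'] = int(f.readline().strip())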
Example #2
    # word-level training at sentence level (sentences are padded to the maximum sentence length)
    elif 'per_sentence' in config:

        if 'rescore' in config:
            # read back the maximum sentence length stored with the trained model
            with open('{0}max_length'.format(config['trained_model'])) as max_length_f:
                max_length = int(max_length_f.readline().strip())
            # set num_steps = total length of each (padded) sentence
            config['num_steps'] = max_length

            data = lm_data.wordSentenceDataRescore(config, eval_config)
            all_data, vocab_size, _ = data.get_data()

        else:
            data = lm_data.wordSentenceData(config, eval_config)
            all_data, vocab_size, total_length, seq_lengths = data.get_data()

            # set num_steps = total length of each (padded) sentence
            config['num_steps'] = total_length

            print('Write max length of sentence to {0}max_length'.format(
                config['save_path']))

            # write maximum sentence length to file
            with open('{0}max_length'.format(config['save_path']),
                      'w') as max_length_f:
                max_length_f.write('{0}\n'.format(total_length))

    # rescoring with non-sentence-level LMs
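
The 'per_sentence' branch relies on every sentence being padded to the length of the longest one, which is why config['num_steps'] is set to total_length. A rough, self-contained sketch of that padding idea with toy sentences and a hypothetical '<pad>' symbol; it is not the project's wordSentenceData implementation:

# toy, already-tokenised sentences (hypothetical)
sentences = [['<bos>', 'the', 'cat', 'sat', '<eos>'],
             ['<bos>', 'hello', '<eos>']]

# pad every sentence to the length of the longest one
total_length = max(len(s) for s in sentences)
padded = [s + ['<pad>'] * (total_length - len(s)) for s in sentences]

seq_lengths = [len(s) for s in sentences]  # true lengths before padding
num_steps = total_length                   # cf. config['num_steps'] = total_length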