def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary, architecture,
                          predict_next_distribution=True)
        print("Restoring neural network state.")
        network.set_state(state)

        print("Building text sampler.")
        sys.stdout.flush()
        sampler = TextSampler(network)

        for i in range(args.num_sentences):
            words = sampler.generate()
            args.output_file.write('{}: {}\n'.format(i, ' '.join(words)))

def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary, architecture,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)

        print("Building text sampler.")
        sys.stdout.flush()
        sampler = TextSampler(network)

        sequences = sampler.generate(30, args.num_sentences)
        for sequence in sequences:
            try:
                eos_pos = sequence.index('</s>')
                sequence = sequence[:eos_pos + 1]
            except ValueError:
                # No sentence break was generated; keep the whole sequence.
                pass
            args.output_file.write(' '.join(sequence) + '\n')

def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)

        print("Building text sampler.")
        sys.stdout.flush()
        sampler = TextSampler(network)

        sequences = sampler.generate(30, args.num_sentences)
        for sequence in sequences:
            try:
                eos_pos = sequence.index('</s>')
                sequence = sequence[:eos_pos + 1]
            except ValueError:
                # No sentence break was generated; keep the whole sequence.
                pass
            args.output_file.write(' '.join(sequence) + '\n')
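
# The '</s>' truncation idiom above recurs in every sampler variant. A
# minimal sketch of the same logic as a standalone helper; truncate_at_eos
# is a hypothetical name, not part of TheanoLM.
def truncate_at_eos(sequence):
    """Cut a generated word sequence after the first sentence-end token."""
    try:
        eos_pos = sequence.index('</s>')
        return sequence[:eos_pos + 1]
    except ValueError:
        # No sentence break was sampled before the length limit.
        return sequence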

class Sampler:

    def __init__(self, model_path):
        self.model_path = model_path
        numpy.random.seed()
        theano.config.compute_test_value = 'off'

        with h5py.File(model_path, 'r') as self.state:
            print("Reading vocabulary from network state.")
            #sys.stdout.flush()
            self.vocabulary = Vocabulary.from_state(self.state)
            print("Number of words in vocabulary:",
                  self.vocabulary.num_words())
            print("Number of words in shortlist:",
                  self.vocabulary.num_shortlist_words())
            print("Number of word classes:",
                  self.vocabulary.num_classes())

            print("Building neural network.")
            #sys.stdout.flush()
            self.architecture = Architecture.from_state(self.state)
            self.network = Network(self.architecture, self.vocabulary,
                                   mode=Network.Mode(minibatch=False))
            print("Restoring neural network state.")
            self.network.set_state(self.state)

            print("Building text sampler.")
            #sys.stdout.flush()
            self.sampler = TextSampler(self.network)

    def sample(self, sen_len, sen_num):
        sequences = self.sampler.generate(sen_len, sen_num)
        ret = ""
        seqnum = 0
        for sequence in sequences:
            try:
                eos_pos = sequence.index('</s>')
                sequence = sequence[:eos_pos + 1]
            except (ValueError, TypeError):
                pass
            # Skip sequences that are too short to be interesting.
            if len(sequence) < 4:
                continue
            ret = ret + ' '.join(sequence)
            if seqnum == 0:
                ret = "<div class=\"col-md-4\">\n" + ret
            elif seqnum % 2 == 0:
                ret = ret + "\n</div><div class=\"col-md-4\">\n"
            seqnum = seqnum + 1
        # Close the last column.
        return ret + "\n</div>\n"
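
# Illustrative use of the Sampler class above; the model path is
# hypothetical. sample() wraps the generated sentences in Bootstrap
# "col-md-4" columns, starting a new column after every second sentence.
sampler = Sampler('model.h5')
html = sampler.sample(30, 6)   # at most 30 words per sentence, 6 sentences
print(html)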

def restoreModel(path):
    with h5py.File(path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of words in shortlist:",
              vocabulary.num_shortlist_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)
        return network
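
# A usage sketch for restoreModel(), assuming a trained model file at a
# hypothetical path. The returned network is in single-sequence mode, so
# it can feed a TextSampler or TextScorer directly.
network = restoreModel('model.h5')
sampler = TextSampler(network)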
def sample(args): """A function that performs the "theanolm sample" command. :type args: argparse.Namespace :param args: a collection of command line arguments """ numpy.random.seed(args.random_seed) if args.debug: theano.config.compute_test_value = 'warn' else: theano.config.compute_test_value = 'off' with h5py.File(args.model_path, 'r') as state: logging.info("Reading vocabulary from network state.") vocabulary = Vocabulary.from_state(state) logging.info("Number of words in vocabulary: %d", vocabulary.num_words()) logging.info("Number of words in shortlist: %d", vocabulary.num_shortlist_words()) logging.info("Number of word classes: %d", vocabulary.num_classes()) logging.info("Building neural network.") architecture = Architecture.from_state(state) default_device = get_default_device(args.default_device) network = Network(architecture, vocabulary, mode=Network.Mode(minibatch=False), default_device=default_device) logging.info("Restoring neural network state.") network.set_state(state) logging.info("Building text sampler.") sampler = TextSampler(network) sequences = sampler.generate(args.sentence_length, args.num_sentences, seed_sequence=args.seed_sequence) for sequence in sequences: try: eos_pos = sequence.index('</s>') sequence = sequence[:eos_pos + 1] except ValueError: pass args.output_file.write(' '.join(sequence) + '\n')

def score(args):
    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary)
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

        print("Building text scorer.")
        sys.stdout.flush()
        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty
        scorer = TextScorer(network, ignore_unk, unk_penalty)

        print("Scoring text.")
        if args.output == 'perplexity':
            _score_text(args.input_file, vocabulary, scorer,
                        args.output_file, args.log_base, False)
        elif args.output == 'word-scores':
            _score_text(args.input_file, vocabulary, scorer,
                        args.output_file, args.log_base, True)
        elif args.output == 'utterance-scores':
            _score_utterances(args.input_file, vocabulary, scorer,
                              args.output_file, args.log_base)
        else:
            print("Invalid output format requested:", args.output)
            sys.exit(1)
def score(args): """A function that performs the "theanolm score" command. :type args: argparse.Namespace :param args: a collection of command line arguments """ log_file = args.log_file log_level = getattr(logging, args.log_level.upper(), None) if not isinstance(log_level, int): print("Invalid logging level requested:", args.log_level) sys.exit(1) log_format = '%(asctime)s %(funcName)s: %(message)s' if args.log_file == '-': logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level) else: logging.basicConfig(filename=log_file, format=log_format, level=log_level) if args.debug: theano.config.compute_test_value = 'warn' logging.info("Enabled computing test values for tensor variables.") logging.warning("GpuArray backend will fail random number generation!") else: theano.config.compute_test_value = 'off' theano.config.profile = args.profile theano.config.profile_memory = args.profile default_device = get_default_device(args.default_device) network = Network.from_file(args.model_path, exclude_unk=args.exclude_unk, default_device=default_device) logging.info("Building text scorer.") scorer = TextScorer(network, args.shortlist, args.exclude_unk, args.profile) logging.info("Scoring text.") if args.output == 'perplexity': _score_text(args.input_file, network.vocabulary, scorer, args.output_file, args.log_base, args.subwords, False) elif args.output == 'word-scores': _score_text(args.input_file, network.vocabulary, scorer, args.output_file, args.log_base, args.subwords, True) elif args.output == 'utterance-scores': _score_utterances(args.input_file, network.vocabulary, scorer, args.output_file, args.log_base) else: print("Invalid output format requested:", args.output) sys.exit(1)

def score(args):
    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary, architecture)
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

        print("Building text scorer.")
        sys.stdout.flush()
        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty
        scorer = TextScorer(network, ignore_unk, unk_penalty)

        print("Scoring text.")
        if args.output == 'perplexity':
            _score_text(args.input_file, vocabulary, scorer,
                        args.output_file, args.log_base, False)
        elif args.output == 'word-scores':
            _score_text(args.input_file, vocabulary, scorer,
                        args.output_file, args.log_base, True)
        elif args.output == 'utterance-scores':
            _score_utterances(args.input_file, vocabulary, scorer,
                              args.output_file, args.log_base)
        else:
            print("Invalid output format requested:", args.output)
            sys.exit(1)
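
# The three-way --unk-penalty handling above (None scores <unk> like any
# other word, 0 ignores <unk> words entirely, and any other value applies
# a fixed log-probability penalty) recurs in several commands. The same
# mapping factored into a hypothetical helper:
def interpret_unk_penalty(unk_penalty):
    """Return (ignore_unk, unk_penalty) as expected by TextScorer."""
    if unk_penalty is None:
        return False, None
    elif unk_penalty == 0:
        return True, None
    else:
        return False, unk_penalty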

def train(args):
    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'a', driver='core') as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
            if args.vocabulary_format == 'classes':
                print("Computing class membership probabilities from "
                      "unigram word counts.")
                sys.stdout.flush()
                vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture == 'lstm300' or args.architecture == 'lstm1500':
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(vocabulary, architecture, profile=args.profile)
        sys.stdout.flush()

        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        print("Building text scorer.")
        scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)

        validation_mmap = mmap.mmap(args.validation_file.fileno(), 0,
                                    prot=mmap.PROT_READ)
        validation_iter = \
            LinearBatchIterator(validation_mmap, vocabulary,
                                batch_size=args.batch_size,
                                max_sequence_length=None)

        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'cost_function': args.cost,
            'num_noise_samples': args.num_noise_samples,
            'ignore_unk': ignore_unk,
            'unk_penalty': unk_penalty
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if isinstance(option_value, list):
                value_str = ', '.join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        training_options = {
            'strategy': args.training_strategy,
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        print("Building neural network trainer.")
        sys.stdout.flush()
        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)
        trainer = create_trainer(
            training_options, optimization_options,
            network, vocabulary, scorer,
            args.training_set, args.sampling,
            validation_iter, state, args.profile)
        trainer.set_logging(args.log_interval)

        print("Training neural network.")
        sys.stdout.flush()
        trainer.train()

        if 'layers' not in state.keys():
            print("The model has not been trained. No cross-validations were "
                  "performed or training did not improve the model.")
        else:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
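
# The weights construction in train() above starts from a vector of ones
# and overwrites only the leading entries, so unlisted training files
# default to weight 1.0. A standalone check of that behavior (using
# 'float32' in place of theano.config.floatX):
import numpy

weights = numpy.ones(3).astype('float32')
for index, weight in enumerate([2.0]):
    weights[index] = weight
assert list(weights) == [2.0, 1.0, 1.0]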
def train(args): """A function that performs the "theanolm train" command. :type args: argparse.Namespace :param args: a collection of command line arguments """ numpy.random.seed(args.random_seed) log_file = args.log_file log_level = getattr(logging, args.log_level.upper(), None) if not isinstance(log_level, int): print("Invalid logging level requested:", args.log_level) sys.exit(1) log_format = '%(asctime)s %(funcName)s: %(message)s' if args.log_file == '-': logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level) else: logging.basicConfig(filename=log_file, format=log_format, level=log_level) if args.debug: theano.config.compute_test_value = 'warn' print("Enabled computing test values for tensor variables.") print("Warning: GpuArray backend will fail random number generation!") else: theano.config.compute_test_value = 'off' theano.config.profile = args.profile theano.config.profile_memory = args.profile with h5py.File(args.model_path, 'a', driver='core') as state: vocabulary = _read_vocabulary(args, state) if args.num_noise_samples > vocabulary.num_classes(): print("Number of noise samples ({}) is larger than the number of " "classes. This doesn't make sense and would cause sampling " "to fail.".format(args.num_noise_samples)) sys.exit(1) num_training_files = len(args.training_set) if len(args.weights) > num_training_files: print("You specified more weights than training files.") sys.exit(1) weights = numpy.ones(num_training_files).astype(theano.config.floatX) for index, weight in enumerate(args.weights): weights[index] = weight training_options = { 'batch_size': args.batch_size, 'sequence_length': args.sequence_length, 'validation_frequency': args.validation_frequency, 'patience': args.patience, 'stopping_criterion': args.stopping_criterion, 'max_epochs': args.max_epochs, 'min_epochs': args.min_epochs, 'max_annealing_count': args.max_annealing_count } logging.debug("TRAINING OPTIONS") for option_name, option_value in training_options.items(): logging.debug("%s: %s", option_name, str(option_value)) optimization_options = { 'method': args.optimization_method, 'epsilon': args.numerical_stability_term, 'gradient_decay_rate': args.gradient_decay_rate, 'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate, 'learning_rate': args.learning_rate, 'weights': weights, 'momentum': args.momentum, 'max_gradient_norm': args.gradient_normalization, 'cost_function': args.cost, 'num_noise_samples': args.num_noise_samples, 'noise_sharing': args.noise_sharing, 'exclude_unk': args.exclude_unk } logging.debug("OPTIMIZATION OPTIONS") for option_name, option_value in optimization_options.items(): if isinstance(option_value, list): value_str = ', '.join(str(x) for x in option_value) logging.debug("%s: [%s]", option_name, value_str) else: logging.debug("%s: %s", option_name, str(option_value)) if len(args.sampling) > len(args.training_set): print("You specified more sampling coefficients than training " "files.") sys.exit(1) print("Creating trainer.") sys.stdout.flush() trainer = Trainer(training_options, vocabulary, args.training_set, args.sampling) trainer.set_logging(args.log_interval) print("Building neural network.") sys.stdout.flush() if args.architecture == 'lstm300' or args.architecture == 'lstm1500': architecture = Architecture.from_package(args.architecture) else: with open(args.architecture, 'rt', encoding='utf-8') as arch_file: architecture = Architecture.from_description(arch_file) network = Network(architecture, vocabulary, trainer.class_prior_probs, args.noise_dampening, 
default_device=args.default_device, profile=args.profile) print("Compiling optimization function.") sys.stdout.flush() optimizer = create_optimizer(optimization_options, network, profile=args.profile) if args.print_graph: print("Cost function computation graph:") theano.printing.debugprint(optimizer.gradient_update_function) trainer.initialize(network, state, optimizer) # XXX Write the model instantly back to disk. Just adds word unigram # counts. This is a temporary hack. Remove at some point. trainer.get_state(state) state.flush() # XXX if args.validation_file is not None: print("Building text scorer for cross-validation.") sys.stdout.flush() scorer = TextScorer(network, use_shortlist=True, exclude_unk=args.exclude_unk, profile=args.profile) print("Validation text:", args.validation_file.name) validation_mmap = mmap.mmap(args.validation_file.fileno(), 0, prot=mmap.PROT_READ) validation_iter = \ LinearBatchIterator(validation_mmap, vocabulary, batch_size=args.batch_size, max_sequence_length=args.sequence_length, map_oos_to_unk=False) trainer.set_validation(validation_iter, scorer) else: print("Cross-validation will not be performed.") validation_iter = None print("Training neural network.") sys.stdout.flush() trainer.train() if 'layers' not in state.keys(): print("The model has not been trained. No cross-validations were " "performed or training did not improve the model.") elif validation_iter is not None: network.set_state(state) perplexity = scorer.compute_perplexity(validation_iter) print("Best validation set perplexity:", perplexity)
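
# train() memory-maps the validation file instead of reading it into
# memory, and LinearBatchIterator then iterates over the mapped bytes.
# The mapping step in isolation (PROT_READ makes this POSIX-only; the
# file name is hypothetical):
import mmap

with open('validation.txt', 'rb') as validation_file:
    validation_mmap = mmap.mmap(validation_file.fileno(), 0,
                                prot=mmap.PROT_READ)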
def decode(args): """A function that performs the "theanolm decode" command. :type args: argparse.Namespace :param args: a collection of command line arguments """ log_file = args.log_file log_level = getattr(logging, args.log_level.upper(), None) if not isinstance(log_level, int): print("Invalid logging level requested:", args.log_level, file=sys.stderr) sys.exit(1) log_format = '%(asctime)s %(funcName)s: %(message)s' if args.log_file == '-': logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level) else: logging.basicConfig(filename=log_file, format=log_format, level=log_level) if args.debug: theano.config.compute_test_value = 'warn' else: theano.config.compute_test_value = 'off' theano.config.profile = args.profile theano.config.profile_memory = args.profile if (args.lattice_format == 'kaldi') or (args.output == 'kaldi'): if args.kaldi_vocabulary is None: print("Kaldi lattice vocabulary is not given.", file=sys.stderr) sys.exit(1) default_device = get_default_device(args.default_device) network = Network.from_file(args.model_path, mode=Network.Mode(minibatch=False), default_device=default_device) log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base) if (args.log_base is not None) and (args.lattice_format == 'kaldi'): logging.info("Warning: Kaldi lattice reader doesn't support logarithm " "base conversion.") if args.wi_penalty is None: wi_penalty = None else: wi_penalty = args.wi_penalty * log_scale decoding_options = { 'nnlm_weight': args.nnlm_weight, 'lm_scale': args.lm_scale, 'wi_penalty': wi_penalty, 'unk_penalty': args.unk_penalty, 'use_shortlist': args.shortlist, 'unk_from_lattice': args.unk_from_lattice, 'linear_interpolation': args.linear_interpolation, 'max_tokens_per_node': args.max_tokens_per_node, 'beam': args.beam, 'recombination_order': args.recombination_order, 'prune_relative': args.prune_relative, 'abs_min_max_tokens': args.abs_min_max_tokens, 'abs_min_beam': args.abs_min_beam } logging.debug("DECODING OPTIONS") for option_name, option_value in decoding_options.items(): logging.debug("%s: %s", option_name, str(option_value)) logging.info("Building word lattice decoder.") decoder = LatticeDecoder(network, decoding_options) batch = LatticeBatch(args.lattices, args.lattice_list, args.lattice_format, args.kaldi_vocabulary, args.num_jobs, args.job) for lattice_number, lattice in enumerate(batch): if lattice.utterance_id is None: lattice.utterance_id = str(lattice_number) logging.info("Utterance `%s´ -- %d of job %d", lattice.utterance_id, lattice_number + 1, args.job) log_free_mem() final_tokens, recomb_tokens = decoder.decode(lattice) if (args.output == "slf") or (args.output == "kaldi"): rescored_lattice = RescoredLattice(lattice, final_tokens, recomb_tokens, network.vocabulary) rescored_lattice.lm_scale = args.lm_scale rescored_lattice.wi_penalty = args.wi_penalty if args.output == "slf": rescored_lattice.write_slf(args.output_file) else: assert args.output == "kaldi" rescored_lattice.write_kaldi(args.output_file, batch.kaldi_word_to_id) else: for token in final_tokens[:min(args.n_best, len(final_tokens))]: line = format_token(token, lattice.utterance_id, network.vocabulary, log_scale, args.output) args.output_file.write(line + "\n") gc.collect()

def decode(args):
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

        log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base)

        if args.wi_penalty is None:
            wi_penalty = None
        else:
            wi_penalty = args.wi_penalty * log_scale
        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        decoding_options = {
            'nnlm_weight': args.nnlm_weight,
            'lm_scale': args.lm_scale,
            'wi_penalty': wi_penalty,
            'ignore_unk': ignore_unk,
            'unk_penalty': unk_penalty,
            'linear_interpolation': args.linear_interpolation,
            'max_tokens_per_node': args.max_tokens_per_node,
            'beam': args.beam,
            'recombination_order': args.recombination_order
        }
        logging.debug("DECODING OPTIONS")
        for option_name, option_value in decoding_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        print("Building word lattice decoder.")
        sys.stdout.flush()
        decoder = LatticeDecoder(network, decoding_options)

        # Combine paths from command line and lattice list.
        lattices = args.lattices
        lattices.extend(args.lattice_list.readlines())
        lattices = [path.strip() for path in lattices]
        # Ignore empty lines in the lattice list.
        lattices = list(filter(None, lattices))
        # Pick every Ith lattice, if --num-jobs is specified and > 1.
        if args.num_jobs < 1:
            print("Invalid number of jobs specified:", args.num_jobs)
            sys.exit(1)
        if (args.job < 0) or (args.job > args.num_jobs - 1):
            print("Invalid job specified:", args.job)
            sys.exit(1)
        lattices = lattices[args.job::args.num_jobs]

        file_type = TextFileType('r')
        for index, path in enumerate(lattices):
            logging.info("Reading word lattice: %s", path)
            lattice_file = file_type(path)
            lattice = SLFLattice(lattice_file)

            if lattice.utterance_id is not None:
                utterance_id = lattice.utterance_id
            else:
                utterance_id = os.path.basename(lattice_file.name)
            logging.info("Utterance `%s' -- %d/%d of job %d",
                         utterance_id, index + 1, len(lattices), args.job)

            tokens = decoder.decode(lattice)
            for index in range(min(args.n_best, len(tokens))):
                line = format_token(tokens[index], utterance_id, vocabulary,
                                    log_scale, args.output)
                args.output_file.write(line + "\n")
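
# decode() above shards the lattice list over parallel jobs with a stride
# slice: job J out of N processes every Nth lattice starting at index J,
# so the union of all jobs covers the list exactly once.
lattices = ['a.lat', 'b.lat', 'c.lat', 'd.lat', 'e.lat']  # illustrative
num_jobs = 2
assert lattices[0::num_jobs] == ['a.lat', 'c.lat', 'e.lat']  # job 0
assert lattices[1::num_jobs] == ['b.lat', 'd.lat']           # job 1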
def train(args): """A function that performs the "theanolm train" command. :type args: argparse.Namespace :param args: a collection of command line arguments """ numpy.random.seed(args.random_seed) log_file = args.log_file log_level = getattr(logging, args.log_level.upper(), None) if not isinstance(log_level, int): print("Invalid logging level requested:", args.log_level) sys.exit(1) log_format = '%(asctime)s %(funcName)s: %(message)s' if args.log_file == '-': logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level) else: logging.basicConfig(filename=log_file, format=log_format, level=log_level) if args.debug: theano.config.compute_test_value = 'warn' logging.info("Enabled computing test values for tensor variables.") logging.warning("GpuArray backend will fail random number generation!") else: theano.config.compute_test_value = 'off' theano.config.profile = args.profile theano.config.profile_memory = args.profile with h5py.File(args.model_path, 'a', driver='core') as state: vocabulary = _read_vocabulary(args, state) if args.num_noise_samples > vocabulary.num_classes(): print("Number of noise samples ({}) is larger than the number of " "classes. This doesn't make sense and would cause unigram " "sampling to fail.".format(args.num_noise_samples)) sys.exit(1) num_training_files = len(args.training_set) if len(args.weights) > num_training_files: print("You specified more weights than training files.") sys.exit(1) weights = numpy.ones(num_training_files).astype(theano.config.floatX) for index, weight in enumerate(args.weights): weights[index] = weight if len(args.sampling) > num_training_files: print("You specified more sampling coefficients than training " "files.") sys.exit(1) training_options = { 'batch_size': args.batch_size, 'sequence_length': args.sequence_length, 'validation_frequency': args.validation_frequency, 'patience': args.patience, 'stopping_criterion': args.stopping_criterion, 'max_epochs': args.max_epochs, 'min_epochs': args.min_epochs, 'max_annealing_count': args.max_annealing_count } optimization_options = { 'method': args.optimization_method, 'epsilon': args.numerical_stability_term, 'gradient_decay_rate': args.gradient_decay_rate, 'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate, 'learning_rate': args.learning_rate, 'weights': weights, 'momentum': args.momentum, 'max_gradient_norm': args.gradient_normalization, 'num_noise_samples': args.num_noise_samples, 'noise_sharing': args.noise_sharing, } log_options(training_options, optimization_options, args) logging.info("Creating trainer.") trainer = Trainer(training_options, vocabulary, args.training_set, args.sampling) trainer.set_logging(args.log_interval) logging.info("Building neural network.") if args.architecture == 'lstm300' or args.architecture == 'lstm1500': architecture = Architecture.from_package(args.architecture) else: with open(args.architecture, 'rt', encoding='utf-8') as arch_file: architecture = Architecture.from_description(arch_file) default_device = get_default_device(args.default_device) network = Network(architecture, vocabulary, trainer.class_prior_probs, default_device=default_device, profile=args.profile) network.set_sampling(args.noise_distribution, args.noise_dampening, args.noise_sharing) logging.info("Building optimizer.") exclude_id = vocabulary.word_to_id['<unk>'] if args.exclude_unk \ else None epsilon = args.numerical_stability_term if args.cost == 'cross-entropy': cost_function = CrossEntropyCost(network, exclude_id, args.l1_regularization, args.l2_regularization, 
epsilon) elif args.cost == 'nce': cost_function = NCECost(network, exclude_id, args.l1_regularization, args.l2_regularization, epsilon) else: assert args.cost == 'blackout' cost_function = BlackoutCost(network, exclude_id, args.l1_regularization, args.l2_regularization, epsilon) try: optimizer = create_optimizer(optimization_options, network, cost_function, profile=args.profile) except theano.gradient.DisconnectedInputError as e: print("Cannot train the neural network because some of the " "parameters are disconnected from the output. Make sure all " "the layers are correctly connected in the network " "architecture. The error message was: `{}´".format(e)) if args.print_graph: print("Cost function computation graph:") theano.printing.debugprint(optimizer.gradient_update_function) trainer.initialize(network, state, optimizer, args.load_and_train) if args.validation_file is not None: logging.info("Building text scorer for cross-validation.") scorer = TextScorer(network, use_shortlist=True, exclude_unk=args.exclude_unk, profile=args.profile) logging.info("Validation text: %s", args.validation_file.name) validation_mmap = mmap.mmap(args.validation_file.fileno(), 0, prot=mmap.PROT_READ) validation_iter = \ LinearBatchIterator(validation_mmap, vocabulary, batch_size=args.batch_size, max_sequence_length=args.sequence_length, map_oos_to_unk=False) trainer.set_validation(validation_iter, scorer) else: logging.info("Cross-validation will not be performed.") validation_iter = None logging.info("Training neural network.") trainer.train() if 'layers' not in state.keys(): print("The model has not been trained. No cross-validations were " "performed or training did not improve the model.") elif validation_iter is not None: network.set_state(state) perplexity = scorer.compute_perplexity(validation_iter) print("Best validation set perplexity:", perplexity)

def train(args):
    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'a', driver='core') as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
            if args.vocabulary_format == 'classes':
                print("Computing class membership probabilities from "
                      "unigram word counts.")
                sys.stdout.flush()
                vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture == 'lstm300' or args.architecture == 'lstm1500':
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(vocabulary, architecture, profile=args.profile)
        sys.stdout.flush()

        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        print("Building text scorer.")
        scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)

        validation_mmap = mmap.mmap(args.validation_file.fileno(), 0,
                                    prot=mmap.PROT_READ)
        validation_iter = LinearBatchIterator(validation_mmap, vocabulary,
                                              batch_size=32)

        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'ignore_unk': ignore_unk,
            'unk_penalty': unk_penalty
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if isinstance(option_value, list):
                value_str = ', '.join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        training_options = {
            'strategy': args.training_strategy,
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        print("Building neural network trainer.")
        sys.stdout.flush()
        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)
        trainer = create_trainer(
            training_options, optimization_options,
            network, vocabulary, scorer,
            args.training_set, args.sampling,
            validation_iter, state, args.profile)
        trainer.set_logging(args.log_interval)

        print("Training neural network.")
        sys.stdout.flush()
        trainer.run()

        if not state.keys():
            print("The model has not been trained.")
        else:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)

def train(args):
    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = "%(asctime)s %(funcName)s: %(message)s"
    if args.log_file == "-":
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = "warn"
        print("Enabled computing test values for tensor variables.")
        print("Warning: GpuArray backend will fail random number generation!")
    else:
        theano.config.compute_test_value = "off"
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, "a", driver="core") as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, "rt", encoding="utf-8") as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
            if args.vocabulary_format == "classes":
                print("Computing class membership probabilities from "
                      "unigram word counts.")
                sys.stdout.flush()
                vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        if args.num_noise_samples > vocabulary.num_classes():
            print("Number of noise samples ({}) is larger than the number of "
                  "classes. This doesn't make sense and would cause sampling "
                  "to fail.".format(args.num_noise_samples))
            sys.exit(1)

        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        training_options = {
            "batch_size": args.batch_size,
            "sequence_length": args.sequence_length,
            "validation_frequency": args.validation_frequency,
            "patience": args.patience,
            "stopping_criterion": args.stopping_criterion,
            "max_epochs": args.max_epochs,
            "min_epochs": args.min_epochs,
            "max_annealing_count": args.max_annealing_count,
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        optimization_options = {
            "method": args.optimization_method,
            "epsilon": args.numerical_stability_term,
            "gradient_decay_rate": args.gradient_decay_rate,
            "sqr_gradient_decay_rate": args.sqr_gradient_decay_rate,
            "learning_rate": args.learning_rate,
            "weights": weights,
            "momentum": args.momentum,
            "max_gradient_norm": args.gradient_normalization,
            "cost_function": args.cost,
            "num_noise_samples": args.num_noise_samples,
            "noise_sharing": args.noise_sharing,
            "ignore_unk": ignore_unk,
            "unk_penalty": unk_penalty,
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if isinstance(option_value, list):
                value_str = ", ".join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)

        print("Creating trainer.")
        sys.stdout.flush()
        trainer = Trainer(training_options, vocabulary, args.training_set,
                          args.sampling)
        trainer.set_logging(args.log_interval)

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture == "lstm300" or args.architecture == "lstm1500":
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, "rt", encoding="utf-8") as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(architecture, vocabulary,
                          trainer.class_prior_probs, args.noise_dampening,
                          default_device=args.default_device,
                          profile=args.profile)

        print("Compiling optimization function.")
        sys.stdout.flush()
        optimizer = create_optimizer(optimization_options, network,
                                     device=args.default_device,
                                     profile=args.profile)

        if args.print_graph:
            print("Cost function computation graph:")
            theano.printing.debugprint(optimizer.gradient_update_function)

        trainer.initialize(network, state, optimizer)

        if args.validation_file is not None:
            print("Building text scorer for cross-validation.")
            sys.stdout.flush()
            scorer = TextScorer(network, ignore_unk, unk_penalty,
                                args.profile)
            print("Validation text:", args.validation_file.name)
            validation_mmap = mmap.mmap(args.validation_file.fileno(), 0,
                                        prot=mmap.PROT_READ)
            validation_iter = LinearBatchIterator(
                validation_mmap, vocabulary,
                batch_size=args.batch_size,
                max_sequence_length=None)
            trainer.set_validation(validation_iter, scorer)
        else:
            print("Cross-validation will not be performed.")
            validation_iter = None

        print("Training neural network.")
        sys.stdout.flush()
        trainer.train()

        if "layers" not in state.keys():
            print("The model has not been trained. No cross-validations were "
                  "performed or training did not improve the model.")
        elif validation_iter is not None:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
def decode(args): """A function that performs the "theanolm decode" command. :type args: argparse.Namespace :param args: a collection of command line arguments """ log_file = args.log_file log_level = getattr(logging, args.log_level.upper(), None) if not isinstance(log_level, int): print("Invalid logging level requested:", args.log_level) sys.exit(1) log_format = '%(asctime)s %(funcName)s: %(message)s' if args.log_file == '-': logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level) else: logging.basicConfig(filename=log_file, format=log_format, level=log_level) if args.debug: theano.config.compute_test_value = 'warn' else: theano.config.compute_test_value = 'off' theano.config.profile = args.profile theano.config.profile_memory = args.profile network = Network.from_file(args.model_path, mode=Network.Mode(minibatch=False)) log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base) if args.wi_penalty is None: wi_penalty = None else: wi_penalty = args.wi_penalty * log_scale if args.unk_penalty is None: ignore_unk = False unk_penalty = None elif args.unk_penalty == 0: ignore_unk = True unk_penalty = None else: ignore_unk = False unk_penalty = args.unk_penalty decoding_options = { 'nnlm_weight': args.nnlm_weight, 'lm_scale': args.lm_scale, 'wi_penalty': wi_penalty, 'ignore_unk': ignore_unk, 'unk_penalty': unk_penalty, 'linear_interpolation': args.linear_interpolation, 'max_tokens_per_node': args.max_tokens_per_node, 'beam': args.beam, 'recombination_order': args.recombination_order } logging.debug("DECODING OPTIONS") for option_name, option_value in decoding_options.items(): logging.debug("%s: %s", option_name, str(option_value)) print("Building word lattice decoder.") sys.stdout.flush() decoder = LatticeDecoder(network, decoding_options) # Combine paths from command line and lattice list. lattices = args.lattices if args.lattice_list is not None: lattices.extend(args.lattice_list.readlines()) lattices = [path.strip() for path in lattices] # Ignore empty lines in the lattice list. lattices = [x for x in lattices if x] # Pick every Ith lattice, if --num-jobs is specified and > 1. if args.num_jobs < 1: print("Invalid number of jobs specified:", args.num_jobs) sys.exit(1) if (args.job < 0) or (args.job > args.num_jobs - 1): print("Invalid job specified:", args.job) sys.exit(1) lattices = lattices[args.job::args.num_jobs] file_type = TextFileType('r') for index, path in enumerate(lattices): logging.info("Reading word lattice: %s", path) lattice_file = file_type(path) lattice = SLFLattice(lattice_file) if lattice.utterance_id is not None: utterance_id = lattice.utterance_id else: utterance_id = os.path.basename(lattice_file.name) logging.info("Utterance `%s' -- %d/%d of job %d", utterance_id, index + 1, len(lattices), args.job) tokens = decoder.decode(lattice) for index in range(min(args.n_best, len(tokens))): line = format_token(tokens[index], utterance_id, network.vocabulary, log_scale, args.output) args.output_file.write(line + "\n")