Example #1: sample() restores a network from an HDF5 state file and generates sentences with TextSampler, truncating each at the first '</s>' token.
def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary,
                          architecture,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)

    print("Building text sampler.")
    sys.stdout.flush()
    sampler = TextSampler(network)

    sequences = sampler.generate(30, args.num_sentences)
    for sequence in sequences:
        try:
            eos_pos = sequence.index('</s>')
            sequence = sequence[:eos_pos + 1]
        except ValueError:
            pass
        args.output_file.write(' '.join(sequence) + '\n')
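
These snippets come from a larger module and omit their imports. A minimal header that would make them self-contained could look as follows; the exact TheanoLM module layout is an assumption and may differ between versions.

import logging
import os
import sys

import h5py
import numpy
import theano

# Assumed to be exported from the theanolm package; adjust to your version.
# Later examples additionally use LatticeDecoder, SLFLattice, TextFileType,
# get_default_device and format_token.
from theanolm import Architecture, Network, TextSampler, TextScorer, Vocabulary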
Example #2: sample() variant that builds the network with predict_next_distribution=True and writes each generated sentence on its own numbered line.
def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary, architecture,
                          predict_next_distribution=True)
        print("Restoring neural network state.")
        network.set_state(state)

    print("Building text sampler.")
    sys.stdout.flush()
    sampler = TextSampler(network)

    for i in range(args.num_sentences):
        words = sampler.generate()
        args.output_file.write('{}: {}\n'.format(
            i, ' '.join(words)))
Example #3: sample() with the same flow as Example #1, but passing the arguments as Network(architecture, vocabulary, ...).
def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)

    print("Building text sampler.")
    sys.stdout.flush()
    sampler = TextSampler(network)

    sequences = sampler.generate(30, args.num_sentences)
    for sequence in sequences:
        try:
            eos_pos = sequence.index('</s>')
            sequence = sequence[:eos_pos + 1]
        except ValueError:
            pass
        args.output_file.write(' '.join(sequence) + '\n')
Example #4: restoreModel() reads the vocabulary and architecture from an HDF5 state file and returns the restored Network.
def restoreModel(path):
    with h5py.File(path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of words in shortlist:", vocabulary.num_shortlist_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)
        return network
Example #5: the sample() function behind the "theanolm sample" command; it reports through the logging module and supports a word shortlist, a default device, a configurable sentence length and a seed sequence.
def sample(args):
    """A function that performs the "theanolm sample" command.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments
    """

    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        logging.info("Reading vocabulary from network state.")
        vocabulary = Vocabulary.from_state(state)
        logging.info("Number of words in vocabulary: %d",
                     vocabulary.num_words())
        logging.info("Number of words in shortlist: %d",
                     vocabulary.num_shortlist_words())
        logging.info("Number of word classes: %d", vocabulary.num_classes())
        logging.info("Building neural network.")
        architecture = Architecture.from_state(state)
        default_device = get_default_device(args.default_device)
        network = Network(architecture,
                          vocabulary,
                          mode=Network.Mode(minibatch=False),
                          default_device=default_device)
        logging.info("Restoring neural network state.")
        network.set_state(state)

    logging.info("Building text sampler.")
    sampler = TextSampler(network)

    sequences = sampler.generate(args.sentence_length,
                                 args.num_sentences,
                                 seed_sequence=args.seed_sequence)
    for sequence in sequences:
        try:
            eos_pos = sequence.index('</s>')
            sequence = sequence[:eos_pos + 1]
        except ValueError:
            pass
        args.output_file.write(' '.join(sequence) + '\n')
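
The sample() functions above expect an argparse.Namespace, normally produced by the theanolm command-line parser. A minimal sketch of invoking Example #5 by hand; the attribute names follow the function body, while the file name and parameter values are placeholders.

import argparse
import sys

args = argparse.Namespace(
    model_path='model.h5',     # HDF5 state file written by "theanolm train"
    random_seed=1,
    debug=False,
    default_device=None,       # assumption: None lets get_default_device() choose
    sentence_length=30,
    num_sentences=10,
    seed_sequence=None,
    output_file=sys.stdout,
)
sample(args)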
Example #6: score() builds a TextScorer from the restored network and writes perplexity, per-word scores or per-utterance scores depending on args.output.
def score(args):
    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary)
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

    print("Building text scorer.")
    sys.stdout.flush()
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    scorer = TextScorer(network, ignore_unk, unk_penalty)

    print("Scoring text.")
    if args.output == 'perplexity':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, False)
    elif args.output == 'word-scores':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, True)
    elif args.output == 'utterance-scores':
        _score_utterances(args.input_file, vocabulary, scorer, args.output_file,
                          args.log_base)
    else:
        print("Invalid output format requested:", args.output)
        sys.exit(1)
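
The score() and decode() examples repeat the same three-way branching on args.unk_penalty. For clarity, the mapping written out as a small helper; this helper is not part of TheanoLM, it only restates the branches above.

def resolve_unk_penalty(unk_penalty_arg):
    """Map the --unk-penalty value to the (ignore_unk, unk_penalty) pair
    passed to TextScorer / LatticeDecoder.

    None -> no special handling of <unk>
    0    -> <unk> tokens are ignored
    x    -> x is used as a fixed penalty for <unk>
    """
    if unk_penalty_arg is None:
        return False, None
    if unk_penalty_arg == 0:
        return True, None
    return False, unk_penalty_arg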
Example #7: a class __init__ that restores the network and builds a TextSampler as instance attributes.
    def __init__(self, model_path):
        self.model_path = model_path
        numpy.random.seed()
        theano.config.compute_test_value = 'off'

        with h5py.File(model_path, 'r') as self.state:
            print("Reading vocabulary from network state.")
            #sys.stdout.flush()
            self.vocabulary = Vocabulary.from_state(self.state)
            print("Number of words in vocabulary:", self.vocabulary.num_words())
            print("Number of words in shortlist:", self.vocabulary.num_shortlist_words())
            print("Number of word classes:", self.vocabulary.num_classes())
            print("Building neural network.")
            #sys.stdout.flush()
            self.architecture = Architecture.from_state(self.state)
            self.network = Network(self.architecture, self.vocabulary,
                                   mode=Network.Mode(minibatch=False))
            print("Restoring neural network state.")
            self.network.set_state(self.state)

        print("Building text sampler.")
        #sys.stdout.flush()
        self.sampler = TextSampler(self.network)
Example #8: score() variant of Example #6 with the Network(vocabulary, architecture) argument order and no check for an invalid output format.
def score(args):
    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary, architecture)
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

    print("Building text scorer.")
    sys.stdout.flush()
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    scorer = TextScorer(network, ignore_unk, unk_penalty)

    print("Scoring text.")
    if args.output == 'perplexity':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, False)
    elif args.output == 'word-scores':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, True)
    elif args.output == 'utterance-scores':
        _score_utterances(args.input_file, vocabulary, scorer,
                          args.output_file, args.log_base)
Example #9: decode() is the function behind the "theanolm decode" command; it configures logging, builds a LatticeDecoder, splits the lattice list across parallel jobs and writes the n-best tokens for each utterance.
def decode(args):
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

    log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base)

    if args.wi_penalty is None:
        wi_penalty = None
    else:
        wi_penalty = args.wi_penalty * log_scale
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    decoding_options = {
        'nnlm_weight': args.nnlm_weight,
        'lm_scale': args.lm_scale,
        'wi_penalty': wi_penalty,
        'ignore_unk': ignore_unk,
        'unk_penalty': unk_penalty,
        'linear_interpolation': args.linear_interpolation,
        'max_tokens_per_node': args.max_tokens_per_node,
        'beam': args.beam,
        'recombination_order': args.recombination_order
    }
    logging.debug("DECODING OPTIONS")
    for option_name, option_value in decoding_options.items():
        logging.debug("%s: %s", option_name, str(option_value))

    print("Building word lattice decoder.")
    sys.stdout.flush()
    decoder = LatticeDecoder(network, decoding_options)

    # Combine paths from command line and lattice list.
    lattices = args.lattices
    lattices.extend(args.lattice_list.readlines())
    lattices = [path.strip() for path in lattices]
    # Ignore empty lines in the lattice list.
    lattices = list(filter(None, lattices))
    # Pick every Ith lattice, if --num-jobs is specified and > 1.
    if args.num_jobs < 1:
        print("Invalid number of jobs specified:", args.num_jobs)
        sys.exit(1)
    if (args.job < 0) or (args.job > args.num_jobs - 1):
        print("Invalid job specified:", args.job)
        sys.exit(1)
    lattices = lattices[args.job::args.num_jobs]

    file_type = TextFileType('r')
    for index, path in enumerate(lattices):
        logging.info("Reading word lattice: %s", path)
        lattice_file = file_type(path)
        lattice = SLFLattice(lattice_file)

        if lattice.utterance_id is not None:
            utterance_id = lattice.utterance_id
        else:
            utterance_id = os.path.basename(lattice_file.name)
        logging.info("Utterance `%s' -- %d/%d of job %d",
                     utterance_id,
                     index + 1,
                     len(lattices),
                     args.job)
        tokens = decoder.decode(lattice)

        for index in range(min(args.n_best, len(tokens))):
            line = format_token(tokens[index],
                                utterance_id,
                                vocabulary,
                                log_scale,
                                args.output)
            args.output_file.write(line + "\n")
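
The slice lattices[args.job::args.num_jobs] is what distributes the work: job j gets every num_jobs-th lattice starting from index j, so the jobs together cover the list exactly once. A standalone illustration with made-up file names:

lattices = ['a.lat', 'b.lat', 'c.lat', 'd.lat', 'e.lat']
num_jobs = 2
for job in range(num_jobs):
    print(job, lattices[job::num_jobs])
# 0 ['a.lat', 'c.lat', 'e.lat']
# 1 ['b.lat', 'd.lat']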
Example #10: decode() variant of Example #9 that builds the network with Network.Mode.target_words.
def decode(args):
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout,
                            format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file,
                            format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary,
                          architecture,
                          mode=Network.Mode.target_words)
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

    log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base)

    if args.wi_penalty is None:
        wi_penalty = None
    else:
        wi_penalty = args.wi_penalty * log_scale
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    decoding_options = {
        'nnlm_weight': args.nnlm_weight,
        'lm_scale': args.lm_scale,
        'wi_penalty': wi_penalty,
        'ignore_unk': ignore_unk,
        'unk_penalty': unk_penalty,
        'linear_interpolation': args.linear_interpolation,
        'max_tokens_per_node': args.max_tokens_per_node,
        'beam': args.beam,
        'recombination_order': args.recombination_order
    }
    logging.debug("DECODING OPTIONS")
    for option_name, option_value in decoding_options.items():
        logging.debug("%s: %s", option_name, str(option_value))

    print("Building word lattice decoder.")
    sys.stdout.flush()
    decoder = LatticeDecoder(network, decoding_options)

    # Combine paths from command line and lattice list.
    lattices = args.lattices
    lattices.extend(args.lattice_list.readlines())
    lattices = [path.strip() for path in lattices]
    # Ignore empty lines in the lattice list.
    lattices = list(filter(None, lattices))
    # Pick every Ith lattice, if --num-jobs is specified and > 1.
    if args.num_jobs < 1:
        print("Invalid number of jobs specified:", args.num_jobs)
        sys.exit(1)
    if (args.job < 0) or (args.job > args.num_jobs - 1):
        print("Invalid job specified:", args.job)
        sys.exit(1)
    lattices = lattices[args.job::args.num_jobs]

    file_type = TextFileType('r')
    for index, path in enumerate(lattices):
        logging.info("Reading word lattice: %s", path)
        lattice_file = file_type(path)
        lattice = SLFLattice(lattice_file)

        if lattice.utterance_id is not None:
            utterance_id = lattice.utterance_id
        else:
            utterance_id = os.path.basename(lattice_file.name)
        logging.info("Utterance `%s' -- %d/%d of job %d", utterance_id,
                     index + 1, len(lattices), args.job)
        tokens = decoder.decode(lattice)

        for index in range(min(args.n_best, len(tokens))):
            line = format_token(tokens[index], utterance_id, vocabulary,
                                log_scale, args.output)
            args.output_file.write(line + "\n")