Пример #1
0
def main(unused_argv):
    """Run the routine selected by FLAGS.mode.

    Supported modes are 'eval', 'sample', 'dump_emb', 'dump_lstm_emb' and
    'predict_perp'.

    In 'predict_perp' mode each line of FLAGS.input_data is read as a
    tab-separated list of candidate sentences. Every candidate is written to
    "temp_sent.txt", loaded as an LM1BDataset and scored with _EvalModel; for
    each input line the candidate with the smallest score is written to
    FLAGS.output_data, one per line.

    Args:
      unused_argv: Ignored.

    Raises:
      Exception: If FLAGS.mode is not one of the supported modes.
    """
    vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)

    if FLAGS.mode == 'eval':
        dataset = data_utils.LM1BDataset(FLAGS.input_data, vocab)
        _EvalModel(dataset)
    elif FLAGS.mode == 'sample':
        _SampleModel(FLAGS.prefix, vocab)
    elif FLAGS.mode == 'dump_emb':
        _DumpEmb(vocab)
    elif FLAGS.mode == 'dump_lstm_emb':
        _DumpSentenceEmbedding(FLAGS.sentence, vocab)
    elif FLAGS.mode == 'predict_perp':
        sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
        current_step = t['global_step'].eval(session=sess)

        with open(FLAGS.input_data, 'r') as input_file:
            # Strip only the line terminator: slicing with [:-1] would eat a
            # real character when the final line has no trailing newline.
            sentences = [line.rstrip("\n").split("\t") for line in input_file]

        best_sentences = []
        for i, candidates in enumerate(sentences):
            print("Test sentence: " + str(i))

            perplexities = []
            for j, candidate in enumerate(candidates):
                if j % 5 == 0:
                    print("Output sentence: " + str(j))

                # The dataset loader takes a file path, so each candidate is
                # round-tripped through a scratch file.
                with open("temp_sent.txt", 'w') as temp_file:
                    temp_file.write(candidate)

                print("Loading data...")
                dataset = data_utils.LM1BDataset("temp_sent.txt", vocab)
                print("Calculating perplexity...")
                perplexities.append(_EvalModel(dataset, current_step))

            # Keep the candidate with the smallest score (first one on ties).
            ind = perplexities.index(min(perplexities))
            best_sentences.append(candidates[ind])

        ## Return best sentences
        with open(FLAGS.output_data, 'w') as output_file:
            for sent in best_sentences:
                output_file.write(sent + "\n")
    else:
        raise Exception('Mode not supported.')
Пример #2
0
def _SentencePerplexity(dataset_file, vocab):
    """Print a per-sentence perplexity line for the sentences in a dataset file.

    Loads the model from FLAGS.pbtxt / FLAGS.ckpt, iterates over batches of
    the dataset and accumulates weighted log-perplexities. Whenever the
    sentence id of the incoming batch differs from the current one, a line
    "<perplexity>\t<sentence_id>\t<log-perps joined by '_'>" is printed to
    stdout for the sentence that just ended. Progress and the running average
    perplexity are written to stderr every time a new sentence id divisible by
    5 starts.

    Args:
      dataset_file: Path handed to data_utils.LM1BDataset.
      vocab: Vocabulary object handed to data_utils.LM1BDataset.
    """
    dataset = data_utils.LM1BDataset(dataset_file, vocab)
    sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
    current_step = t['global_step'].eval(session=sess)
    sys.stderr.write('Loaded step %d.\n' % current_step)

    data_gen = dataset.get_batch(BATCH_SIZE, NUM_TIMESTEPS, forever=False)
    sys.stderr.write('Loaded data gen\n')
    # total_* accumulate over the whole run; sum_* are reset per sentence.
    total_sum_num = 0.0
    sum_num = 0.0
    total_sum_den = 0.0
    sum_den = 0.0
    sentence_id = None
    t0 = time.time()
    perps = []
    for inputs, char_inputs, sentence_ids, targets, weights in data_gen:
        next_sentence_id = sentence_ids[0][0]
        if sentence_id is None:
            sentence_id = next_sentence_id
        # We hit a new sentence. Record this one and reset counters.
        # NOTE(review): a sentence is only reported when the next one starts,
        # so the last sentence seen is never printed by this loop.
        if next_sentence_id != sentence_id:
            perplexity = np.exp(sum_num / sum_den)
            print('{}\t{}\t{}'.format(perplexity, sentence_id,
                                      '_'.join(str(p) for p in perps)))
            sum_num = sum_den = 0.0
            perps = []
            sentence_id = next_sentence_id
            if (sentence_id % 5) == 0:
                t1 = time.time()
                sys.stderr.write('Starting sentence {} (t={:.1f}s)\n'.format(
                    sentence_id, t1 - t0))
                ppx = np.exp(total_sum_num / total_sum_den)
                sys.stderr.write(
                    'Running avg. perplexity: {:.3f}\n'.format(ppx))
                t0 = t1

        input_dict = {
            t['inputs_in']: inputs,
            t['targets_in']: targets,
            t['target_weights_in']: weights
        }
        if 'char_inputs_in' in t:
            input_dict[t['char_inputs_in']] = char_inputs
        log_perp = sess.run(t['log_perplexity_out'], feed_dict=input_dict)

        if np.isnan(log_perp):
            # File objects have no .error(); the original call raised
            # AttributeError instead of logging the NaN.
            sys.stderr.write('log_perplexity is Nan.\n')
        else:
            den = weights.mean()
            num = log_perp * den
            sum_num += num
            total_sum_num += num
            sum_den += den
            total_sum_den += den
            perps.append(log_perp)

        # FLAGS.max_eval_steps is compared against the sentence id here.
        if sentence_id > FLAGS.max_eval_steps:
            break
    ppx = np.exp(total_sum_num / total_sum_den)
def _RunN400Experiment(input_data, vocab):
    """Evaluate the target words of every sentence, one sentence at a time.

    Sequence of operations:
      1. _CreateFiles(input_data) is called (presumably it produces the
         'data/sentences.txt' and 'data/targets.txt' files read below --
         TODO confirm).
      2. FLAGS.output_file is (re)created containing only the header row
         'Sentence;TargetWord;Probability;Surprisal'.
      3. Each line of 'data/sentences.txt' (newline removed) is written to
         'data/current.sentence.txt', wrapped in an LM1BDataset and passed,
         together with the integer indices from the matching line of
         'data/targets.txt', to _WordLossAndProbability.

    Args:
      input_data: Forwarded to _CreateFiles. According to the original
          documentation this is a file of newline-separated target
          sentences, with asterisks around the target word (e.g. *target*)
          and punctuation separated from words by a space.
      vocab: Vocabulary object passed to LM1BDataset and
          _WordLossAndProbability.
    """
    _CreateFiles(input_data)

    # Start the results file afresh with just the header row.
    with tf.gfile.Open(FLAGS.output_file, mode='w') as header_file:
        header_file.write('Sentence;TargetWord;Probability;Surprisal\n')

    with tf.gfile.Open('data/sentences.txt', mode='r') as sentences_file:
        raw_sentences = sentences_file.readlines()
    all_sentences = [re.sub(r'\n', '', raw) for raw in raw_sentences]

    with tf.gfile.Open('data/targets.txt', mode='r') as target_file:
        target_lines = target_file.readlines()
    # One list of integer indices per line of the targets file.
    all_targets = []
    for target_line in target_lines:
        all_targets.append([int(token) for token in target_line.split()])

    scratch_path = 'data/current.sentence.txt'
    for position, current_sentence in enumerate(all_sentences):
        # The dataset is built from a file path, so the sentence goes through
        # a scratch file that is overwritten on every iteration.
        with tf.gfile.Open(scratch_path, mode='w') as current_sentence_file:
            current_sentence_file.write(current_sentence)
        target_indices = all_targets[position]
        dataset = data_utils.LM1BDataset(scratch_path, vocab)
        _WordLossAndProbability(dataset, vocab, current_sentence,
                                target_indices)
Пример #4
0
def main(unused_argv):
    """Build the character vocabulary and run the routine named by FLAGS.mode.

    Supported modes: 'eval', 'sample', 'dump_emb', 'dump_lstm_emb'.

    Args:
      unused_argv: Ignored.

    Raises:
      Exception: If FLAGS.mode is none of the supported modes.
    """
    vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)
    mode = FLAGS.mode

    if mode == 'eval':
        _EvalModel(data_utils.LM1BDataset(FLAGS.input_data, vocab))
        return
    if mode == 'sample':
        _SampleModel(FLAGS.prefix, vocab)
        return
    if mode == 'dump_emb':
        _DumpEmb(vocab)
        return
    if mode == 'dump_lstm_emb':
        _DumpSentenceEmbedding(FLAGS.sentence, vocab)
        return

    raise Exception('Mode not supported.')
Пример #5
0
    def _EvalSentences(self, sentences):
        """Score every target word of the given sentences with the loaded model.

        For each batch yielded by the dataset, the value
        -log10(softmax[0, target] / sum(softmax)) is computed for the first
        target id of the batch and logged to stderr together with the word.

        Args:
            sentences: iterable of sentences; each one is assigned to
                dataset.sentence before batching (strings, per the original
                documentation).

        Returns:
            The pd.concat of one DataFrame per sentence, each with the columns
            'prob' (the -log10 value described above) and 'word'.
        """
        print('Evaluating sentences')
        start_time = time.time()
        current_step = self.t['global_step'].eval(session=self.sess)
        sys.stderr.write('Loaded step %d.\n' % current_step)

        # A single dataset object is reused; only its sentence changes.
        dataset = data_utils.LM1BDataset(vocab=self.vocab)

        per_sentence_frames = []
        for sentence in sentences:
            dataset.sentence = sentence
            # method='list' presumably makes get_batch read dataset.sentence
            # rather than a file -- TODO confirm against data_utils.
            batches = dataset.get_batch(self.BATCH_SIZE,
                                        self.NUM_TIMESTEPS,
                                        method='list',
                                        forever=False)
            neg_log10_probs = []
            words = []

            for inputs, char_inputs, _, targets, weights in batches:
                feed = {
                    self.t['inputs_in']: inputs,
                    self.t['targets_in']: targets,
                    self.t['target_weights_in']: weights
                }
                if 'char_inputs_in' in self.t:
                    feed[self.t['char_inputs_in']] = char_inputs

                # NOTE(review): the log-perplexity result is never used, but
                # the run is kept (and kept first) in case evaluating the
                # graph has side effects on model state -- confirm before
                # removing or merging the two runs.
                log_perp = self.sess.run(self.t['log_perplexity_out'],
                                         feed_dict=feed)
                softmax = self.sess.run(self.t['softmax_out'], feed_dict=feed)

                target_id = targets[0][0]
                target_share = softmax[0, target_id] / np.sum(softmax)
                log10_probability = -1 * np.log10(target_share)
                word = self.vocab.id_to_word(target_id)
                sys.stderr.write(word + ' ' + str(log10_probability) + '\n')

                words.append(word)
                neg_log10_probs.append(log10_probability)

            # The value logged here is the sum of the per-word -log10 values.
            sys.stderr.write('Sentence perplexity: %s\n' %
                             str(np.sum(neg_log10_probs)))
            sys.stderr.write('Elapsed: %s\n' % str(time.time() - start_time))

            per_sentence_frames.append(
                pd.DataFrame({'prob': neg_log10_probs, 'word': words}))

        return pd.concat(per_sentence_frames)
Пример #6
0
    def _EvalSentencesDicts(self, input_dict_list, base):
        """Score the utterances described by a list of dictionaries.

        Args:
            input_dict_list: iterable of dicts. Each dict is read for
                'utterance_list' (an iterable of tokens; '<s>' and '</s>'
                entries are dropped and the rest joined with spaces) and
                'utterance_id' (copied into the output rows).
            base: logarithm base passed to math.log when converting the
                per-word value stored in 'log_prob'.

        Returns:
            The pd.concat of one DataFrame per utterance with the columns
            'token_id', 'word', 'log_prob' and 'utterance_id'. Each frame
            starts with a placeholder row (token_id 0, word '<S>',
            log_prob NaN).
        """
        print('Evaluating sentences')
        start_time = time.time()
        current_step = self.t['global_step'].eval(session=self.sess)
        sys.stderr.write('Loaded step %d.\n' % current_step)

        # A single dataset object is reused; only its sentence changes.
        dataset = data_utils.LM1BDataset(vocab=self.vocab)

        utterance_dfs = []
        for input_row in input_dict_list:
            # Drop the beginning/end-of-sentence markers before evaluation.
            kept_tokens = []
            for token in input_row['utterance_list']:
                if token not in ('<s>', '</s>'):
                    kept_tokens.append(token)
            eval_utterance_string = ' '.join(kept_tokens)
            print('Evaluating:')
            print(eval_utterance_string)
            dataset.sentence = eval_utterance_string
            # method='list' presumably makes get_batch read dataset.sentence
            # rather than a file -- TODO confirm against data_utils.
            batches = dataset.get_batch(self.BATCH_SIZE,
                                        self.NUM_TIMESTEPS,
                                        method='list',
                                        forever=False)

            # Placeholder row standing in for the start of the sentence.
            rows = [{'token_id': 0, 'word': '<S>', 'log_prob': np.nan}]

            for batch_index, (inputs, char_inputs, _, targets,
                              weights) in enumerate(batches):
                feed = {
                    self.t['inputs_in']: inputs,
                    self.t['targets_in']: targets,
                    self.t['target_weights_in']: weights
                }
                if 'char_inputs_in' in self.t:
                    feed[self.t['char_inputs_in']] = char_inputs

                # NOTE(review): the log-perplexity result is never used, but
                # the run is kept (and kept first) in case evaluating the
                # graph has side effects on model state -- confirm before
                # removing or merging the two runs.
                log_perp = self.sess.run(self.t['log_perplexity_out'],
                                         feed_dict=feed)
                softmax = self.sess.run(self.t['softmax_out'], feed_dict=feed)

                target_id = targets[0][0]
                target_share = softmax[0, target_id] / np.sum(softmax)
                log10_probability = -1 * np.log10(target_share)
                word = self.vocab.id_to_word(target_id)
                sys.stderr.write(word + ' ' + str(log10_probability) + '\n')

                # Re-express the -log10 value in the requested base.
                rows.append({
                    'token_id': batch_index + 1,
                    'word': word,
                    'log_prob': math.log(10.**log10_probability, base)
                })

            utterance_df = pd.DataFrame(rows)
            utterance_df['utterance_id'] = input_row['utterance_id']

            sys.stderr.write('Sentence perplexity: %s\n' %
                             str(np.sum(utterance_df.log_prob)))

            utterance_dfs.append(utterance_df)

        sys.stderr.write('Elapsed: %s\n' % str(time.time() - start_time))
        return pd.concat(utterance_dfs)
Пример #7
0
def _EvalSentences(vocab):
    """Score every '*.txt' file in FLAGS.eval_dir and write a CSV next to it.

    The model is loaded once from FLAGS.pbtxt / FLAGS.ckpt. For each file, and
    for each batch of its dataset, the value
    -log10(softmax[0, target] / sum(softmax)) is computed for the first target
    id of the batch, logged to stderr with the word, and collected. The
    collected values are saved as a CSV with columns 'prob' and 'word' to the
    input path with its '.txt' extension replaced by '.out'.

    Args:
      vocab: Vocabulary object passed to LM1BDataset and used to map target
          ids back to words.
    """
    start_time = time.time()
    sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)

    current_step = t['global_step'].eval(session=sess)
    sys.stderr.write('Loaded step %d.\n' % current_step)

    eval_files = glob.glob(os.path.join(FLAGS.eval_dir, '*.txt'))
    for eval_file in eval_files:
        print('Evaluating ' + eval_file + '...')
        # The elapsed time reported below is per file.
        start_time = time.time()
        dataset = data_utils.LM1BDataset(eval_file, vocab)
        data_gen = dataset.get_batch(BATCH_SIZE, NUM_TIMESTEPS, forever=False)
        word_probabilities = []
        words = []

        for inputs, char_inputs, _, targets, weights in data_gen:
            input_dict = {
                t['inputs_in']: inputs,
                t['targets_in']: targets,
                t['target_weights_in']: weights
            }
            if 'char_inputs_in' in t:
                input_dict[t['char_inputs_in']] = char_inputs

            # NOTE(review): the log-perplexity result is never used, but the
            # run is kept (and kept first) in case evaluating the graph has
            # side effects on model state -- confirm before removing or
            # merging the two runs.
            log_perp = sess.run(t['log_perplexity_out'], feed_dict=input_dict)
            softmax = sess.run(t['softmax_out'], feed_dict=input_dict)

            target_id = targets[0][0]
            log10_probability = -1 * np.log10(
                softmax[0, target_id] / np.sum(softmax))
            word = vocab.id_to_word(target_id)
            sys.stderr.write(word + ' ' + str(log10_probability) + '\n')

            words.append(word)
            word_probabilities.append(log10_probability)

        # The value logged here is the sum of the per-word -log10 values.
        sys.stderr.write('Sentence perplexity: %s\n' %
                         str(np.sum(word_probabilities)))
        sys.stderr.write('Elapsed: %s\n' % str(time.time() - start_time))

        rdf = pd.DataFrame({'prob': word_probabilities, 'word': words})
        # Swap only the file extension; str.replace would also rewrite any
        # '.txt' occurring earlier in the path (e.g. in a directory name).
        rdf.to_csv(os.path.splitext(eval_file)[0] + '.out')