Example #1
0
def create_merged_ngram_dictionaries(indices, n):
    """Generate a single dictionary for the full batch.
    Args:
      indices:  List of lists of indices.
      n:  Degree of n-grams.
    Returns:
      Dictionary of hashed(n-gram tuples) to counts in the batch of indices.
    """
    ngram_dicts = []

    for ind in indices:
        ngrams = n_gram.find_all_ngrams(ind, n=n)
        ngram_counts = n_gram.construct_ngrams_dict(ngrams)
        ngram_dicts.append(ngram_counts)

    merged_gen_dict = Counter()
    for ngram_dict in ngram_dicts:
        merged_gen_dict += Counter(ngram_dict)
    return merged_gen_dict
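A minimal, self-contained usage sketch follows. The stand-ins below for the n_gram helpers are assumptions (the real module may hash the tuples, per the docstring), but they show the shape of the inputs and of the merged output:

from collections import Counter  # also needed by the function above


class n_gram(object):
  """Hypothetical stand-in for the n_gram helper module."""

  @staticmethod
  def find_all_ngrams(ind, n):
    # Slide a window of width n across the sequence of indices.
    return [tuple(ind[i:i + n]) for i in range(len(ind) - n + 1)]

  @staticmethod
  def construct_ngrams_dict(ngrams):
    # Map each n-gram tuple to its occurrence count.
    return dict(Counter(ngrams))


batch = [[1, 2, 3, 2, 3], [2, 3, 4]]
merged = create_merged_ngram_dictionaries(batch, n=2)
# (2, 3) occurs twice in the first sequence and once in the second.
assert merged[(2, 3)] == 3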
Example #2
0
def create_merged_ngram_dictionaries(indices, n):
  """Generate a single dictionary for the full batch.

  Args:
    indices:  List of lists of indices.
    n:  Degree of n-grams.

  Returns:
    Dictionary of hashed(n-gram tuples) to counts in the batch of indices.
  """
  ngram_dicts = []

  for ind in indices:
    ngrams = n_gram.find_all_ngrams(ind, n=n)
    ngram_counts = n_gram.construct_ngrams_dict(ngrams)
    ngram_dicts.append(ngram_counts)

  merged_gen_dict = Counter()
  for ngram_dict in ngram_dicts:
    merged_gen_dict += Counter(ngram_dict)
  return merged_gen_dict
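The merge loop relies on collections.Counter addition, which sums counts key-wise across dictionaries:

from collections import Counter

a = Counter({('the', 'cat'): 2, ('cat', 'sat'): 1})
b = Counter({('the', 'cat'): 1})
print(a + b)  # Counter({('the', 'cat'): 3, ('cat', 'sat'): 1})

The whole loop could equally be written as sum((Counter(d) for d in ngram_dicts), Counter()), though the explicit loop is just as clear.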
Example #3
0
def main(_):
  hparams = create_hparams()
  train_dir = FLAGS.base_directory + '/train'

  # Load data set.
  if FLAGS.data_set == 'ptb':
    raw_data = ptb_loader.ptb_raw_data(FLAGS.data_dir)
    train_data, valid_data, test_data, _ = raw_data
    valid_data_flat = valid_data
  elif FLAGS.data_set == 'imdb':
    raw_data = imdb_loader.imdb_raw_data(FLAGS.data_dir)
    # TODO(liamfedus): Get an IMDB test partition.
    train_data, valid_data = raw_data
    valid_data_flat = [word for review in valid_data for word in review]
  else:
    raise NotImplementedError

  if FLAGS.mode == MODE_TRAIN or FLAGS.mode == MODE_TRAIN_EVAL:
    data_set = train_data
  elif FLAGS.mode == MODE_VALIDATION:
    data_set = valid_data
  elif FLAGS.mode == MODE_TEST:
    data_set = test_data
  else:
    raise NotImplementedError

  # Dictionary and reverse dictionary.
  if FLAGS.data_set == 'ptb':
    word_to_id = ptb_loader.build_vocab(
        os.path.join(FLAGS.data_dir, 'ptb.train.txt'))
  elif FLAGS.data_set == 'imdb':
    word_to_id = imdb_loader.build_vocab(
        os.path.join(FLAGS.data_dir, 'vocab.txt'))
  id_to_word = {v: k for k, v in word_to_id.items()}

  # Dictionaries of validation-set n-gram counts.
  bigram_tuples = n_gram.find_all_ngrams(valid_data_flat, n=2)
  trigram_tuples = n_gram.find_all_ngrams(valid_data_flat, n=3)
  fourgram_tuples = n_gram.find_all_ngrams(valid_data_flat, n=4)

  bigram_counts = n_gram.construct_ngrams_dict(bigram_tuples)
  trigram_counts = n_gram.construct_ngrams_dict(trigram_tuples)
  fourgram_counts = n_gram.construct_ngrams_dict(fourgram_tuples)
  print('Unique %d-grams: %d' % (2, len(bigram_counts)))
  print('Unique %d-grams: %d' % (3, len(trigram_counts)))
  print('Unique %d-grams: %d' % (4, len(fourgram_counts)))

  data_ngram_counts = {
      '2': bigram_counts,
      '3': trigram_counts,
      '4': fourgram_counts
  }

  # TODO(liamfedus): This was necessary because there was a problem with our
  # originally trained IMDB models. The EOS_INDEX was off by one, which meant
  # two words were mapping to index 86933. The presence of '</s>' is going
  # to throw an out-of-vocabulary error.
  FLAGS.vocab_size = len(id_to_word)
  print('Vocab size: %d' % FLAGS.vocab_size)

  tf.gfile.MakeDirs(FLAGS.base_directory)

  if FLAGS.mode == MODE_TRAIN:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'train-log.txt'), mode='w')
  elif FLAGS.mode == MODE_VALIDATION:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'validation-log.txt'), mode='w')
  elif FLAGS.mode == MODE_TRAIN_EVAL:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'train_eval-log.txt'), mode='w')
  else:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'test-log.txt'), mode='w')

  if FLAGS.mode == MODE_TRAIN:
    train_model(hparams, data_set, train_dir, log, id_to_word,
                data_ngram_counts)
  elif FLAGS.mode == MODE_VALIDATION:
    evaluate_model(hparams, data_set, train_dir, log, id_to_word,
                   data_ngram_counts)
  elif FLAGS.mode == MODE_TRAIN_EVAL:
    evaluate_model(hparams, data_set, train_dir, log, id_to_word,
                   data_ngram_counts)
  elif FLAGS.mode == MODE_TEST:
    evaluate_model(hparams, data_set, train_dir, log, id_to_word,
                   data_ngram_counts)
  else:
    raise NotImplementedError
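main() depends on module-level flags and mode constants defined elsewhere in the file. A hedged sketch of what those definitions plausibly look like in TF1 style; the names match the code above, but the constant values, defaults, and help strings are assumptions:

import tensorflow as tf

MODE_TRAIN = 'TRAIN'
MODE_TRAIN_EVAL = 'TRAIN_EVAL'
MODE_VALIDATION = 'VALIDATION'
MODE_TEST = 'TEST'

tf.app.flags.DEFINE_string('mode', MODE_TRAIN, 'One of the MODE_* values.')
tf.app.flags.DEFINE_string('data_set', 'ptb', "Either 'ptb' or 'imdb'.")
tf.app.flags.DEFINE_string('data_dir', '/tmp/data', 'Directory with the raw data.')
tf.app.flags.DEFINE_string('base_directory', '/tmp/experiment', 'Output directory.')
tf.app.flags.DEFINE_integer('vocab_size', 0, 'Overwritten from the vocab at runtime.')

FLAGS = tf.app.flags.FLAGS

if __name__ == '__main__':
  tf.app.run()  # Parses flags, then invokes main(argv).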
Example #4
0
def main(_):
  hparams = create_hparams()
  train_dir = FLAGS.base_directory + '/train'

  # Load data set.
  if FLAGS.data_set == 'ptb':
    raw_data = ptb_loader.ptb_raw_data(FLAGS.data_dir)
    train_data, valid_data, test_data, _ = raw_data
    valid_data_flat = valid_data
  elif FLAGS.data_set == 'imdb':
    raw_data = imdb_loader.imdb_raw_data(FLAGS.data_dir)
    # TODO(liamfedus): Get an IMDB test partition.
    train_data, valid_data = raw_data
    valid_data_flat = [word for review in valid_data for word in review]
  else:
    raise NotImplementedError

  if FLAGS.mode == MODE_TRAIN or FLAGS.mode == MODE_TRAIN_EVAL:
    data_set = train_data
  elif FLAGS.mode == MODE_VALIDATION:
    data_set = valid_data
  elif FLAGS.mode == MODE_TEST:
    data_set = test_data
  else:
    raise NotImplementedError

  # Dictionary and reverse dictionary.
  if FLAGS.data_set == 'ptb':
    word_to_id = ptb_loader.build_vocab(
        os.path.join(FLAGS.data_dir, 'ptb.train.txt'))
  elif FLAGS.data_set == 'imdb':
    word_to_id = imdb_loader.build_vocab(
        os.path.join(FLAGS.data_dir, 'vocab.txt'))
  id_to_word = {v: k for k, v in word_to_id.items()}

  # Dictionaries of validation-set n-gram counts.
  bigram_tuples = n_gram.find_all_ngrams(valid_data_flat, n=2)
  trigram_tuples = n_gram.find_all_ngrams(valid_data_flat, n=3)
  fourgram_tuples = n_gram.find_all_ngrams(valid_data_flat, n=4)

  bigram_counts = n_gram.construct_ngrams_dict(bigram_tuples)
  trigram_counts = n_gram.construct_ngrams_dict(trigram_tuples)
  fourgram_counts = n_gram.construct_ngrams_dict(fourgram_tuples)
  print('Unique %d-grams: %d' % (2, len(bigram_counts)))
  print('Unique %d-grams: %d' % (3, len(trigram_counts)))
  print('Unique %d-grams: %d' % (4, len(fourgram_counts)))

  data_ngram_counts = {
      '2': bigram_counts,
      '3': trigram_counts,
      '4': fourgram_counts
  }

  # TODO(liamfedus): This was necessary because there was a problem with our
  # originally trained IMDB models. The EOS_INDEX was off by one, which meant
  # two words were mapping to index 86933. The presence of '</s>' is going
  # to throw an out-of-vocabulary error.
  FLAGS.vocab_size = len(id_to_word)
  print('Vocab size: %d' % FLAGS.vocab_size)

  tf.gfile.MakeDirs(FLAGS.base_directory)

  if FLAGS.mode == MODE_TRAIN:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'train-log.txt'), mode='w')
  elif FLAGS.mode == MODE_VALIDATION:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'validation-log.txt'), mode='w')
  elif FLAGS.mode == MODE_TRAIN_EVAL:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'train_eval-log.txt'), mode='w')
  else:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'test-log.txt'), mode='w')

  if FLAGS.mode == MODE_TRAIN:
    train_model(hparams, data_set, train_dir, log, id_to_word,
                data_ngram_counts)
  elif FLAGS.mode == MODE_VALIDATION:
    evaluate_model(hparams, data_set, train_dir, log, id_to_word,
                   data_ngram_counts)
  elif FLAGS.mode == MODE_TRAIN_EVAL:
    evaluate_model(hparams, data_set, train_dir, log, id_to_word,
                   data_ngram_counts)
  elif FLAGS.mode == MODE_TEST:
    evaluate_model(hparams, data_set, train_dir, log, id_to_word,
                   data_ngram_counts)
  else:
    raise NotImplementedError
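The four-way if/elif chains that pick the log file could also be table-driven. A small sketch of that alternative, assuming the mode constants and FLAGS sketched after Example #3:

LOG_FILENAMES = {
    MODE_TRAIN: 'train-log.txt',
    MODE_VALIDATION: 'validation-log.txt',
    MODE_TRAIN_EVAL: 'train_eval-log.txt',
    MODE_TEST: 'test-log.txt',
}

log_path = os.path.join(FLAGS.base_directory, LOG_FILENAMES[FLAGS.mode])
log = tf.gfile.GFile(log_path, mode='w')

Unlike the original else branch, an unknown mode raises KeyError here instead of silently writing to test-log.txt.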