Example #1
def test_load_dataset_from_text():
    root_dir = dirname(dirname(dirname(realpath(__file__))))
    path_txt = join(root_dir, 'data/small/train/source.txt')

    symbol_index = index_table_from_text(path_txt, sos=True, eos=True)
    dataset = load_dataset_from_text(path_txt, symbol_index)

    with open(path_txt, 'r') as f:
        num_lines = len(f.read().strip().split('\n'))
        assert num_lines == len(dataset)

    dataset_with_mark = load_dataset_from_text(path_txt, symbol_index, sos=True, eos=True)
    assert len(dataset[0]) + 2 == len(dataset_with_mark[0])
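
None of these snippets include the body of load_dataset_from_text, and the signatures clearly differ between repositories: the version tested in Example #1 returns an indexable, len()-able collection, while the versions below feed TF 1.x tf.data pipelines. A rough, hedged sketch of the latter style (assumptions: whitespace-tokenized lines, a tf.contrib.lookup vocabulary table, and '<s>'/'</s>' as placeholder start/end markers):

def load_dataset_from_text(path_txt, vocab, sos=False, eos=False):
    """Hypothetical sketch: map one whitespace-tokenized line per example to ids."""
    # One line of text per example
    dataset = tf.data.TextLineDataset(path_txt)
    # Split each line into tokens
    dataset = dataset.map(lambda string: tf.string_split([string]).values)
    # Optionally surround the tokens with start/end-of-sequence markers
    if sos:
        dataset = dataset.map(lambda tokens: tf.concat([['<s>'], tokens], axis=0))
    if eos:
        dataset = dataset.map(lambda tokens: tf.concat([tokens, ['</s>']], axis=0))
    # Convert tokens to vocabulary ids and keep the sequence length
    dataset = dataset.map(lambda tokens: (vocab.lookup(tokens), tf.size(tokens)))
    return dataset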
Example #2
def train():
    # Set the logger
    set_logger(os.path.join(params['model_dir'], 'train.log'))
    # log params
    logging.info(params)

    # Load vocabulary
    vocab = tf.contrib.lookup.index_table_from_file(vocab_path,
                                                    num_oov_buckets=1)

    # Create the input data pipeline
    logging.info('Creating the datasets...')
    train_input_words = load_dataset_from_text(data_dir, train_input_filename,
                                               vocab)
    train_context_words = load_dataset_from_text(data_dir,
                                                 train_context_filename, vocab)

    # Create the iterator over the dataset
    train_inputs = input_fn('train', train_input_words, train_context_words,
                            params)
    eval_inputs = input_fn('eval', train_input_words, train_context_words,
                           params)
    logging.info("- done")

    # Define the model
    logging.info('Creating the model...')
    train_model_spec = model_fn('train',
                                train_inputs,
                                params,
                                reuse=tf.AUTO_REUSE)
    eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True)
    logging.info('- done.')

    # Train the model
    logging.info('Starting training for {} epochs'.format(
        params['num_epochs']))
    normalized_embedding_matrix = train_and_evaluate(train_model_spec,
                                                     eval_model_spec, params)

    save_dict_to_json(params, os.path.join(params['model_dir'], 'params.json'))
    embedding_path = os.path.join(params['model_dir'], 'normalized_embedding_matrix.tsv')
    pd.DataFrame(normalized_embedding_matrix).to_csv(
        embedding_path, index=False, header=None, sep='\t')
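
The normalized embedding matrix written above can be read back later with pandas; a small usage sketch mirroring the path used in Example #2:

matrix = pd.read_csv(
    os.path.join(params['model_dir'], 'normalized_embedding_matrix.tsv'),
    sep='\t', header=None).values  # numpy array of the saved matrix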
Example #3
    # Set the logger
    set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Get paths for vocabularies and dataset
    path_vocab = os.path.join(args.data_dir, 'vocab{}'.format(params.min_freq))
    params.vocab_path = path_vocab
    path_test_queries = os.path.join(args.data_dir, 'dev/queries.txt')
    path_test_articles = os.path.join(args.data_dir, 'dev/articles.txt')
    # Load Vocabularies
    vocab = tf.contrib.lookup.index_table_from_file(
        path_vocab, num_oov_buckets=num_oov_buckets, key_column_index=0)

    # Create the input data pipeline
    logging.info("Creating the dataset...")
    test_queries = load_dataset_from_text(path_test_queries, vocab, params)
    test_articles = load_dataset_from_text(path_test_articles, vocab, params)

    # Specify other parameters for the dataset and the model
    params.eval_size = params.test_size
    params.id_pad_word = vocab.lookup(tf.constant(params.pad_word))

    # Create iterator over the test set
    inputs = input_fn('eval', test_queries, test_articles, params)
    logging.info("- done.")

    # Define the model
    logging.info("Creating the model...")
    model_spec = model_fn('eval', inputs, params, reuse=False)
    logging.info("- done.")
Example #4
    # Set the logger
    set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Get paths for vocabularies and dataset
    path_words = os.path.join(args.data_dir, 'words.txt')
    path_tags = os.path.join(args.data_dir, 'tags.txt')
    path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt')
    path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt')

    # Load Vocabularies
    words = tf.contrib.lookup.index_table_from_file(path_words, num_oov_buckets=num_oov_buckets)
    tags = tf.contrib.lookup.index_table_from_file(path_tags)

    # Create the input data pipeline
    logging.info("Creating the dataset...")
    test_sentences = load_dataset_from_text(path_eval_sentences, words)
    test_labels = load_dataset_from_text(path_eval_labels, tags)

    # Specify other parameters for the dataset and the model
    params.eval_size = params.test_size
    params.id_pad_word = words.lookup(tf.constant(params.pad_word))
    params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag))

    # Create iterator over the test set
    inputs = input_fn('eval', test_sentences, test_labels, params)
    logging.info("- done.")

    # Define the model
    logging.info("Creating the model...")
    model_spec = model_fn('eval', inputs, params, reuse=False)
    logging.info("- done.")
Example #5
# Get paths for vocabularies and dataset
# path_words = os.path.join(args.data_dir, 'words_small.txt')
path_words = os.path.join(args.data_dir, 'words{}.txt'.format(toy))
path_sentiment_tags = os.path.join(args.data_dir, 'sentiment_tags.txt')
# path_reviews = os.path.join(args.data_dir, 'reviews_small.txt')
path_reviews = os.path.join(args.data_dir, 'reviews{}.txt'.format(toy))
path_sentiments = os.path.join(args.data_dir, 'sentiments{}.txt'.format(toy))
# path_sentiments = os.path.join(args.data_dir, 'sentiments.txt')

# Load vocabularies
words = tf.contrib.lookup.index_table_from_file(path_words, num_oov_buckets=num_oov_buckets)
sentiments = tf.contrib.lookup.index_table_from_file(path_sentiment_tags)

# Create the input data pipeline
reviews = load_dataset_from_text(path_reviews, words)
review_sentiments = load_dataset_from_text(path_sentiments, sentiments, isLabels=True)

# Specify other parameters for the dataset and the model
params_sentiment.id_pad_word = words.lookup(tf.constant(params_sentiment.pad_word))
params_sentiment.id_pad_tag = sentiments.lookup(tf.constant(params_sentiment.pad_tag))


# Create the iterator over the test set
inputs_sentiment = input_fn('eval', reviews, review_sentiments, params_sentiment)

# Define the model
print('Creating sentiment and era models...')
model_spec_sentiment = model_fn('eval', inputs_sentiment, params_sentiment, reuse=False)
print('Done')
# Get paths for vocabularies and dataset
# path_words = os.path.join(args.data_dir, 'words_small.txt')
path_words = os.path.join(args.data_dir, 'words{}.txt'.format(toy))
path_era_tags = os.path.join(args.data_dir, 'era_tags.txt')
# path_reviews = os.path.join(args.data_dir, 'reviews_small.txt')
path_reviews = os.path.join(args.data_dir, 'reviews{}.txt'.format(toy))
path_eras = os.path.join(args.data_dir, 'eras{}.txt'.format(toy))
# path_eras = os.path.join(args.data_dir, 'eras.txt')

# Load vocabularies
words = tf.contrib.lookup.index_table_from_file(
    path_words, num_oov_buckets=num_oov_buckets)
eras = tf.contrib.lookup.index_table_from_file(path_era_tags)

# Create the input data pipeline
reviews = load_dataset_from_text(path_reviews, words)
review_eras = load_dataset_from_text(path_eras, eras, isLabels=True)

# Specify other parameters for the dataset and the model
params_era.id_pad_word = words.lookup(tf.constant(params_era.pad_word))
params_era.id_pad_tag = eras.lookup(tf.constant(params_era.pad_tag))

# Create the iterator over the test set
inputs_era = input_fn('eval', reviews, review_eras, params_era)

# Define the model
print('Creating era models...')
model_spec_era = model_fn('eval', inputs_era, params_era, reuse=False)
print('Done')

print(era_model_path)
Example #7
    set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Get paths for vocabularies and dataset
    path_words = os.path.join(args.data_dir, 'words.txt')
    path_tags = os.path.join(args.data_dir, 'tags.txt')
    path_eval_sentences = os.path.join(args.data_dir, 'test/sentences.txt')
    path_eval_labels = os.path.join(args.data_dir, 'test/labels.txt')

    # Load Vocabularies
    words = tf.contrib.lookup.index_table_from_file(
        path_words, num_oov_buckets=num_oov_buckets)
    tags = tf.contrib.lookup.index_table_from_file(path_tags)

    # Create the input data pipeline
    logging.info("Creating the dataset...")
    test_sentences = load_dataset_from_text(path_eval_sentences, words)
    test_labels = load_dataset_from_text(path_eval_labels, split=False)

    # Specify other parameters for the dataset and the model
    params.eval_size = params.test_size
    params.id_pad_word = words.lookup(tf.constant(params.pad_word))
    params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag))

    # Create iterator over the test set
    inputs = input_fn('eval', test_sentences, test_labels, params)
    logging.info("- done.")

    # Define the model
    logging.info("Creating the model...")
    model_spec = model_fn('eval', inputs, params, reuse=False)
    logging.info("- done.")
Example #8
    # Get paths for vocabularies and dataset
    path_words = os.path.join(args.data_dir, 'words.txt')
    path_tags = os.path.join(args.data_dir, 'tags.txt')
    path_train_sentences = os.path.join(args.data_dir, 'train/sentences.txt')
    path_train_labels = os.path.join(args.data_dir, 'train/labels.txt')
    path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt')
    path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt')

    # Load Vocabularies
    words = tf.contrib.lookup.index_table_from_file(
        path_words, num_oov_buckets=num_oov_buckets)
    tags = tf.contrib.lookup.index_table_from_file(path_tags)

    # Create the input data pipeline
    logging.info("Creating the datasets...")
    train_sentences = load_dataset_from_text(path_train_sentences, words)
    train_labels = load_dataset_from_text(path_train_labels, tags)
    eval_sentences = load_dataset_from_text(path_eval_sentences, words)
    eval_labels = load_dataset_from_text(path_eval_labels, tags)

    # Specify other parameters for the dataset and the model
    params.eval_size = params.dev_size
    params.buffer_size = params.train_size  # buffer size for shuffling
    params.id_pad_word = words.lookup(tf.constant(params.pad_word))
    params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag))

    # Create the two iterators over the two datasets
    train_inputs = input_fn('train', train_sentences, train_labels, params)
    eval_inputs = input_fn('eval', eval_sentences, eval_labels, params)
    logging.info("- done.")
Example #9
    # Get paths for vocabularies and dataset
    path_words = os.path.join(args.data_dir, 'words.txt')
    path_tags = os.path.join(args.data_dir, 'tags.txt')
    path_train_sentences = os.path.join(args.data_dir, 'train/sentences.txt')
    path_train_labels = os.path.join(args.data_dir, 'train/labels.txt')
    path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt')
    path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt')

    # Load Vocabularies
    words = tf.contrib.lookup.index_table_from_file(
        path_words, num_oov_buckets=num_oov_buckets)
    tags = tf.contrib.lookup.index_table_from_file(path_tags)

    # Create the input data pipeline
    logging.info("Creating the datasets...")
    train_sentences = load_dataset_from_text(path_train_sentences, words)
    train_labels = load_dataset_from_text(path_train_labels, split=False)

    # import numpy as np
    # with tf.Session() as sess:
    #     iterator = train_labels.make_one_shot_iterator()
    #     it = iterator.get_next()
    #     while True:
    #         inputs_eval = (sess.run(it))
    #         print(inputs_eval)

    eval_sentences = load_dataset_from_text(path_eval_sentences, words)
    eval_labels = load_dataset_from_text(path_eval_labels, split=False)

    # Specify other parameters for the dataset and the model
    params.eval_size = params.dev_size
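
The commented-out block in Example #9 hints at how to peek at the label dataset during debugging; a runnable TF 1.x variant (an addition, not part of the original script) could look like this:

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # in case the dataset uses a lookup table
    iterator = train_labels.make_initializable_iterator()
    next_element = iterator.get_next()
    sess.run(iterator.initializer)
    try:
        while True:  # iterate until the dataset is exhausted
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        pass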
Example #10
  if params.model_version == "sif": 
    vocab_freqs = np.loadtxt(params.vocab_path, usecols=1, comments=None, encoding='utf-8')
    vocab_freqs = np.append(vocab_freqs, 0.)
    vocab_freqs = tf.constant(vocab_freqs / np.sum(vocab_freqs), dtype=tf.float32) # frequency for OOV token
    params.vocab_freqs = vocab_freqs
  
  if params.model_version == "embedtf":
    with open(os.path.join(args.data_dir, "tfidfvec.pkl"), "rb") as f:
      tfidf = pickle.load(f)
    idf = tfidf.idf_.copy()
    idf = np.append(idf, idf[0])
    params.idf = idf

  # Create the input data pipeline
  logging.info("Creating the datasets...")
  train_queries = load_dataset_from_text(path_train_queries, vocab, params)
  train_articles = load_dataset_from_text(path_train_articles, vocab, params)
  eval_queries = load_dataset_from_text(path_eval_queries, vocab, params)
  eval_articles = load_dataset_from_text(path_eval_articles, vocab, params)

  # Specify other parameters for the dataset and the model
  params.eval_size = params.dev_size
  params.buffer_size = params.train_size # buffer size for shuffling
  params.id_pad_word = vocab.lookup(tf.constant(params.pad_word))

  # Create the two iterators over the two datasets
  train_inputs = input_fn('train', train_queries, train_articles, params)
  eval_inputs = input_fn('eval', eval_queries, eval_articles, params)
  logging.info("- done.")
  # Test input
  # with tf.Session() as sess:
Example #11
    # Load the dataset parameters (sizes, etc.) from dataset_params.json into params
    json_path = os.path.join(args.data_dir, 'dataset_params.json')
    assert os.path.isfile(json_path), (
        "No json file found at {}, run build.py".format(json_path))
    params.update(json_path)

    # Set the logger
    set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Get paths for vocabularies and dataset
    path_eval_x = os.path.join(args.data_dir, 'dev.x')
    path_eval_y = os.path.join(args.data_dir, 'dev.y')

    # Create the input data pipeline
    logging.info("Creating the dataset...")
    test_sentences = load_dataset_from_text(path_eval_x)
    test_labels = load_dataset_from_text(path_eval_y)

    # Specify other parameters for the dataset and the model
    params.eval_size = params.test_size

    # Create iterator over the test set
    inputs = input_fn('eval', test_sentences, test_labels, params)
    logging.info("- done.")

    # Define the model
    logging.info("Creating the model...")
    model_spec = model_fn('eval', inputs, params, reuse=False)
    logging.info("- done.")

    logging.info("Starting evaluation")