def test_load_dataset_from_text():
    root_dir = dirname(dirname(dirname(realpath(__file__))))
    path_txt = join(root_dir, 'data/small/train/source.txt')
    symbol_index = index_table_from_text(path_txt, sos=True, eos=True)
    dataset = load_dataset_from_text(path_txt, symbol_index)

    # The dataset should contain one example per non-empty line in the file
    with open(path_txt, 'r') as f:
        num_lines = len(f.read().strip().split('\n'))
    assert num_lines == len(dataset)

    # With sos/eos markers, every example gains exactly two extra symbols
    dataset_with_mark = load_dataset_from_text(path_txt, symbol_index, sos=True, eos=True)
    assert len(dataset[0]) + 2 == len(dataset_with_mark[0])

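# For reference, a minimal sketch of the kind of load_dataset_from_text helper the
# snippets below appear to assume (a tf.data pipeline: one example per line,
# whitespace tokenization, ids from a lookup table). The test above indexes its
# result like a Python list, so that project's variant may differ; the signature,
# the sos/eos markers, and the returned (ids, length) pair are assumptions for
# illustration, not the confirmed implementation.
def load_dataset_from_text_sketch(path_txt, vocab, sos=False, eos=False):
    # One example per line
    dataset = tf.data.TextLineDataset(path_txt)
    # Split each line into tokens on whitespace
    dataset = dataset.map(lambda line: tf.string_split([line]).values)
    # Optionally add start/end-of-sequence markers around each example
    if sos:
        dataset = dataset.map(lambda tokens: tf.concat([['<s>'], tokens], axis=0))
    if eos:
        dataset = dataset.map(lambda tokens: tf.concat([tokens, ['</s>']], axis=0))
    # Map tokens to ids and keep the sequence length for later padding/masking
    dataset = dataset.map(lambda tokens: (vocab.lookup(tokens), tf.size(tokens)))
    return dataset
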
def train():
    # Set the logger
    set_logger(os.path.join(params['model_dir'], 'train.log'))

    # Log the hyperparameters
    logging.info(params)

    # Load vocabulary
    vocab = tf.contrib.lookup.index_table_from_file(vocab_path, num_oov_buckets=1)

    # Create the input data pipeline
    logging.info('Creating the datasets...')
    train_input_words = load_dataset_from_text(data_dir, train_input_filename, vocab)
    train_context_words = load_dataset_from_text(data_dir, train_context_filename, vocab)

    # Create the iterators over the dataset
    train_inputs = input_fn('train', train_input_words, train_context_words, params)
    eval_inputs = input_fn('eval', train_input_words, train_context_words, params)
    logging.info('- done.')

    # Define the model
    logging.info('Creating the model...')
    train_model_spec = model_fn('train', train_inputs, params, reuse=tf.AUTO_REUSE)
    eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True)
    logging.info('- done.')

    # Train the model and save the learned embeddings
    logging.info('Starting training for {} epochs'.format(params['num_epochs']))
    normalized_embedding_matrix = train_and_evaluate(train_model_spec, eval_model_spec, params)
    save_dict_to_json(params, os.path.join(params['model_dir'], 'params.json'))
    pd.DataFrame(normalized_embedding_matrix).to_csv(
        os.path.join(params['model_dir'], 'normalized_embedding_matrix.tsv'),
        index=False, header=None, sep='\t')

# Set the logger
set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Get paths for vocabularies and dataset
path_vocab = os.path.join(args.data_dir, 'vocab{}'.format(params.min_freq))
params.vocab_path = path_vocab
path_test_queries = os.path.join(args.data_dir, 'dev/queries.txt')
path_test_articles = os.path.join(args.data_dir, 'dev/articles.txt')

# Load Vocabularies
vocab = tf.contrib.lookup.index_table_from_file(
    path_vocab, num_oov_buckets=num_oov_buckets, key_column_index=0)

# Create the input data pipeline
logging.info("Creating the dataset...")
test_queries = load_dataset_from_text(path_test_queries, vocab, params)
test_articles = load_dataset_from_text(path_test_articles, vocab, params)

# Specify other parameters for the dataset and the model
params.eval_size = params.test_size
params.id_pad_word = vocab.lookup(tf.constant(params.pad_word))

# Create iterator over the test set
inputs = input_fn('eval', test_queries, test_articles, params)
logging.info("- done.")

# Define the model
logging.info("Creating the model...")
model_spec = model_fn('eval', inputs, params, reuse=False)
logging.info("- done.")

# Set the logger
set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Get paths for vocabularies and dataset
path_words = os.path.join(args.data_dir, 'words.txt')
path_tags = os.path.join(args.data_dir, 'tags.txt')
path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt')
path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt')

# Load Vocabularies
words = tf.contrib.lookup.index_table_from_file(path_words, num_oov_buckets=num_oov_buckets)
tags = tf.contrib.lookup.index_table_from_file(path_tags)

# Create the input data pipeline
logging.info("Creating the dataset...")
test_sentences = load_dataset_from_text(path_eval_sentences, words)
test_labels = load_dataset_from_text(path_eval_labels, tags)

# Specify other parameters for the dataset and the model
params.eval_size = params.test_size
params.id_pad_word = words.lookup(tf.constant(params.pad_word))
params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag))

# Create iterator over the test set
inputs = input_fn('eval', test_sentences, test_labels, params)
logging.info("- done.")

# Define the model
logging.info("Creating the model...")
model_spec = model_fn('eval', inputs, params, reuse=False)
logging.info("- done.")

# Get paths for vocabularies and dataset
path_words = os.path.join(args.data_dir, 'words{}.txt'.format(toy))
path_sentiment_tags = os.path.join(args.data_dir, 'sentiment_tags.txt')
path_reviews = os.path.join(args.data_dir, 'reviews{}.txt'.format(toy))
path_sentiments = os.path.join(args.data_dir, 'sentiments{}.txt'.format(toy))

# Load vocabularies
words = tf.contrib.lookup.index_table_from_file(path_words, num_oov_buckets=num_oov_buckets)
sentiments = tf.contrib.lookup.index_table_from_file(path_sentiment_tags)

# Create the input data pipeline
reviews = load_dataset_from_text(path_reviews, words)
review_sentiments = load_dataset_from_text(path_sentiments, sentiments, isLabels=True)

# Specify other parameters for the dataset and the model
params_sentiment.id_pad_word = words.lookup(tf.constant(params_sentiment.pad_word))
params_sentiment.id_pad_tag = words.lookup(tf.constant(params_sentiment.pad_tag))

# Create the iterator over the test set
inputs_sentiment = input_fn('eval', reviews, review_sentiments, params_sentiment)

# Define the model
print('Creating sentiment and era models...')
model_spec_sentiment = model_fn('eval', inputs_sentiment, params_sentiment, reuse=False)
print('Done')

# Get paths for vocabularies and dataset
path_words = os.path.join(args.data_dir, 'words{}.txt'.format(toy))
path_era_tags = os.path.join(args.data_dir, 'era_tags.txt')
path_reviews = os.path.join(args.data_dir, 'reviews{}.txt'.format(toy))
path_eras = os.path.join(args.data_dir, 'eras{}.txt'.format(toy))

# Load vocabularies
words = tf.contrib.lookup.index_table_from_file(
    path_words, num_oov_buckets=num_oov_buckets)
eras = tf.contrib.lookup.index_table_from_file(path_era_tags)

# Create the input data pipeline
reviews = load_dataset_from_text(path_reviews, words)
review_eras = load_dataset_from_text(path_eras, eras, isLabels=True)

# Specify other parameters for the dataset and the model
params_era.id_pad_word = words.lookup(tf.constant(params_era.pad_word))
params_era.id_pad_tag = words.lookup(tf.constant(params_era.pad_tag))

# Create the iterator over the test set
inputs_era = input_fn('eval', reviews, review_eras, params_era)

# Define the model
print('Creating era models...')
model_spec_era = model_fn('eval', inputs_era, params_era, reuse=False)
print('Done')
print(era_model_path)

set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Get paths for vocabularies and dataset
path_words = os.path.join(args.data_dir, 'words.txt')
path_tags = os.path.join(args.data_dir, 'tags.txt')
path_eval_sentences = os.path.join(args.data_dir, 'test/sentences.txt')
path_eval_labels = os.path.join(args.data_dir, 'test/labels.txt')

# Load Vocabularies
words = tf.contrib.lookup.index_table_from_file(
    path_words, num_oov_buckets=num_oov_buckets)
tags = tf.contrib.lookup.index_table_from_file(path_tags)

# Create the input data pipeline
logging.info("Creating the dataset...")
test_sentences = load_dataset_from_text(path_eval_sentences, words)
test_labels = load_dataset_from_text(path_eval_labels, split=False)

# Specify other parameters for the dataset and the model
params.eval_size = params.test_size
params.id_pad_word = words.lookup(tf.constant(params.pad_word))
params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag))

# Create iterator over the test set
inputs = input_fn('eval', test_sentences, test_labels, params)
logging.info("- done.")

# Define the model
logging.info("Creating the model...")
model_spec = model_fn('eval', inputs, params, reuse=False)
logging.info("- done.")

# Get paths for vocabularies and dataset
path_words = os.path.join(args.data_dir, 'words.txt')
path_tags = os.path.join(args.data_dir, 'tags.txt')
path_train_sentences = os.path.join(args.data_dir, 'train/sentences.txt')
path_train_labels = os.path.join(args.data_dir, 'train/labels.txt')
path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt')
path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt')

# Load Vocabularies
words = tf.contrib.lookup.index_table_from_file(
    path_words, num_oov_buckets=num_oov_buckets)
tags = tf.contrib.lookup.index_table_from_file(path_tags)

# Create the input data pipeline
logging.info("Creating the datasets...")
train_sentences = load_dataset_from_text(path_train_sentences, words)
train_labels = load_dataset_from_text(path_train_labels, tags)
eval_sentences = load_dataset_from_text(path_eval_sentences, words)
eval_labels = load_dataset_from_text(path_eval_labels, tags)

# Specify other parameters for the dataset and the model
params.eval_size = params.dev_size
params.buffer_size = params.train_size  # buffer size for shuffling
params.id_pad_word = words.lookup(tf.constant(params.pad_word))
params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag))

# Create the two iterators over the two datasets
train_inputs = input_fn('train', train_sentences, train_labels, params)
eval_inputs = input_fn('eval', eval_sentences, eval_labels, params)
logging.info("- done.")

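# A hedged sketch of the input_fn these snippets rely on: zip sentences and labels,
# shuffle when training, and pad each batch with the ids computed above
# (params.id_pad_word / params.id_pad_tag). Field names, params attributes, and the
# returned dict are assumptions for illustration, not the project's exact code.
def input_fn_sketch(mode, sentences, labels, params):
    is_training = (mode == 'train')
    buffer_size = params.buffer_size if is_training else 1

    # Zip the sentence and label datasets so shuffling keeps them aligned
    dataset = tf.data.Dataset.zip((sentences, labels))

    # Pad variable-length sentences/labels to the longest example in each batch;
    # the scalar lengths need no padding, so their pad value (0) is unused
    padded_shapes = ((tf.TensorShape([None]), tf.TensorShape([])),
                     (tf.TensorShape([None]), tf.TensorShape([])))
    padding_values = ((params.id_pad_word, 0), (params.id_pad_tag, 0))

    dataset = (dataset
               .shuffle(buffer_size=buffer_size)
               .padded_batch(params.batch_size,
                             padded_shapes=padded_shapes,
                             padding_values=padding_values)
               .prefetch(1))

    # Initializable iterator so the dataset can be reset at every epoch
    iterator = dataset.make_initializable_iterator()
    (sentence, sentence_lengths), (tags, _) = iterator.get_next()
    return {'sentence': sentence, 'sentence_lengths': sentence_lengths,
            'labels': tags, 'iterator_init_op': iterator.initializer}
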
# Get paths for vocabularies and dataset
path_words = os.path.join(args.data_dir, 'words.txt')
path_tags = os.path.join(args.data_dir, 'tags.txt')
path_train_sentences = os.path.join(args.data_dir, 'train/sentences.txt')
path_train_labels = os.path.join(args.data_dir, 'train/labels.txt')
path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt')
path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt')

# Load Vocabularies
words = tf.contrib.lookup.index_table_from_file(
    path_words, num_oov_buckets=num_oov_buckets)
tags = tf.contrib.lookup.index_table_from_file(path_tags)

# Create the input data pipeline
logging.info("Creating the datasets...")
train_sentences = load_dataset_from_text(path_train_sentences, words)
train_labels = load_dataset_from_text(path_train_labels, split=False)
eval_sentences = load_dataset_from_text(path_eval_sentences, words)
eval_labels = load_dataset_from_text(path_eval_labels, split=False)

# Specify other parameters for the dataset and the model
params.eval_size = params.dev_size

if params.model_version == "sif": vocab_freqs = np.loadtxt(params.vocab_path, usecols=1, comments=None, encoding='utf-8') vocab_freqs = np.append(vocab_freqs, 0.) vocab_freqs = tf.constant(vocab_freqs / np.sum(vocab_freqs), dtype=tf.float32) # frequency for OOV token params.vocab_freqs = vocab_freqs if params.model_version == "embedtf": with open(os.path.join(args.data_dir, "tfidfvec.pkl"), "rb") as f: tfidf = pickle.load(f) idf = tfidf.idf_.copy() idf = np.append(idf, idf[0]) params.idf = idf # Create the input data pipeline logging.info("Creating the datasets...") train_queries = load_dataset_from_text(path_train_queries, vocab, params) train_articles = load_dataset_from_text(path_train_articles, vocab, params) eval_queries = load_dataset_from_text(path_eval_queries, vocab, params) eval_articles = load_dataset_from_text(path_eval_articles, vocab, params) # Specify other parameters for the dataset and the model params.eval_size = params.dev_size params.buffer_size = params.train_size # buffer size for shuffling params.id_pad_word = vocab.lookup(tf.constant(params.pad_word)) # Create the two iterators over the two datasets train_inputs = input_fn('train', train_queries, train_articles, params) eval_inputs = input_fn('eval', eval_queries, eval_articles, params) logging.info("- done.") # Test input # with tf.Session() as sess:
# Load the parameters from the dataset that give its size, etc. into params
json_path = os.path.join(args.data_dir, 'dataset_params.json')
assert os.path.isfile(json_path), "No json file found at {}, run build.py".format(json_path)
params.update(json_path)

# Set the logger
set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Get paths for vocabularies and dataset
path_eval_x = os.path.join(args.data_dir, 'dev.x')
path_eval_y = os.path.join(args.data_dir, 'dev.y')

# Create the input data pipeline
logging.info("Creating the dataset...")
test_sentences = load_dataset_from_text(path_eval_x)
test_labels = load_dataset_from_text(path_eval_y)

# Specify other parameters for the dataset and the model
params.eval_size = params.test_size

# Create iterator over the test set
inputs = input_fn('eval', test_sentences, test_labels, params)
logging.info("- done.")

# Define the model
logging.info("Creating the model...")
model_spec = model_fn('eval', inputs, params, reuse=False)
logging.info("- done.")

logging.info("Starting evaluation")