def wordLookup(input_word, train_path=os.path.join(FLAGS.data_path, 'ptb.train.txt')):
    """Look up the vocabulary entry for ``input_word``.

    The vocabulary is rebuilt from ``train_path`` with
    ``reader._build_vocab`` on every call.

    Args:
        input_word: the word to look up.
        train_path: path of the file the vocabulary is built from. The
            default is computed once, when the function is defined, from
            ``FLAGS.data_path``.

    Returns:
        The value stored for ``input_word`` in the vocabulary, or ``None``
        (after printing a message) when the word is not in it.
    """
    word_ids = reader._build_vocab(train_path)
    if input_word not in word_ids:
        print("Word is not present in vocab")
        return None
    return word_ids[input_word]
def get_PennTreeBank(data_dir=None):
    """Load the Penn Treebank train/valid/test splits as arrays of word ids.

    Any of the three text files that is missing from ``data_dir`` is
    downloaded first.

    Args:
        data_dir: directory holding ``ptb.train.txt``, ``ptb.valid.txt`` and
            ``ptb.test.txt``. When ``None``, ``get_dataset_path('ptb_data')``
            is used.

    Returns:
        A pair ``(data3, word_to_id)``. ``data3`` is a list of three numpy
        arrays (train, valid, test, in that order), each built from
        ``tfreader._file_to_word_ids``; ``word_to_id`` is the vocabulary
        returned by ``tfreader._build_vocab`` for the training file.
    """
    if data_dir is None:
        data_dir = get_dataset_path('ptb_data')

    # Check every split rather than only the training file: if an earlier
    # download stopped half-way, the valid/test files would otherwise never
    # be fetched and the reads below would fail.
    # Assumes each URL is stored in data_dir under the matching file name,
    # which the code already relied on for the training file.
    splits = [
        ('ptb.train.txt', TRAIN_URL),
        ('ptb.valid.txt', VALID_URL),
        ('ptb.test.txt', TEST_URL),
    ]
    for fname, url in splits:
        if not os.path.isfile(os.path.join(data_dir, fname)):
            download(url, data_dir)

    # The vocabulary always comes from the training split and is then used
    # to encode all three splits.
    word_to_id = tfreader._build_vocab(os.path.join(data_dir, 'ptb.train.txt'))
    data3 = [
        np.asarray(
            tfreader._file_to_word_ids(os.path.join(data_dir, fname), word_to_id))
        for fname, _ in splits
    ]
    return data3, word_to_id
def get_PennTreeBank(data_dir=None):
    """Load the Penn Treebank train/valid/test splits as arrays of word ids.

    Any of the three text files that is missing from ``data_dir`` is
    downloaded first.

    Args:
        data_dir: directory holding ``ptb.train.txt``, ``ptb.valid.txt`` and
            ``ptb.test.txt``. When ``None``, ``get_dataset_path('ptb_data')``
            is used.

    Returns:
        A pair ``(data3, word_to_id)``. ``data3`` is a list of three numpy
        arrays (train, valid, test, in that order), each built from
        ``tfreader._file_to_word_ids``; ``word_to_id`` is the vocabulary
        returned by ``tfreader._build_vocab`` for the training file.
    """
    if data_dir is None:
        data_dir = get_dataset_path('ptb_data')

    # Check every split rather than only the training file: if an earlier
    # download stopped half-way, the valid/test files would otherwise never
    # be fetched and the reads below would fail.
    # Assumes each URL is stored in data_dir under the matching file name,
    # which the code already relied on for the training file.
    splits = [
        ('ptb.train.txt', TRAIN_URL),
        ('ptb.valid.txt', VALID_URL),
        ('ptb.test.txt', TEST_URL),
    ]
    for fname, url in splits:
        if not os.path.isfile(os.path.join(data_dir, fname)):
            download(url, data_dir)

    # The vocabulary always comes from the training split and is then used
    # to encode all three splits.
    word_to_id = tfreader._build_vocab(os.path.join(data_dir, 'ptb.train.txt'))
    data3 = [
        np.asarray(
            tfreader._file_to_word_ids(os.path.join(data_dir, fname), word_to_id))
        for fname, _ in splits
    ]
    return data3, word_to_id
def get_tokens(self, data_path, input_type):
    """Return the set of vocabulary ids for one family of number tokens.

    The vocabulary is built from ``wiki.train.txt`` inside ``data_path``.
    Numbers whose string form is not in the vocabulary contribute id ``0``.

    Args:
        data_path: directory containing ``wiki.train.txt``.
        input_type: ``'rounds'`` (10..90 in steps of 10 and 100..900 in steps
            of 100), ``'years'`` (1000..2020) or ``'days'`` (1..31).

    Returns:
        A set of ids for the requested family, or ``None`` when
        ``input_type`` is none of the three names above.
    """
    vocab = reader._build_vocab(os.path.join(data_path, "wiki.train.txt"))

    def ids_of(numbers):
        # Unknown number strings fall back to id 0.
        return {vocab.get(str(number), 0) for number in numbers}

    if input_type == 'rounds':
        return ids_of(list(range(10, 100, 10)) + list(range(100, 1000, 100)))
    if input_type == 'years':
        return ids_of(range(1000, 2021))
    if input_type == 'days':
        return ids_of(range(1, 32))
    return None
def vizualizeTSNE(tsneEmbedding, path_to_save,
                  path_to_train=os.path.join(FLAGS.data_path, 'ptb.train.txt'),
                  samples=400):
    """
    Scatter-plot a random sample of a t-SNE embedding, annotate each point
    with its word, and save the figure.

    Args:
        tsneEmbedding: array indexed as ``tsneEmbedding[axis][point]``; rows
            0 and 1 are used as the x and y coordinates.
        path_to_save: destination passed to ``plt.savefig``.
        path_to_train: file the vocabulary is built from with
            ``reader._build_vocab``. The default is computed once, at
            definition time, from ``FLAGS.data_path``.
        samples: number of points drawn with ``np.random.choice`` (its
            default samples with replacement, so points may repeat).

    Returns:
        None. The figure is written to ``path_to_save``.
    """
    vocab = reader._build_vocab(path_to_train)
    # .items() works on Python 2 and 3; .iteritems() is Python 2 only.
    reverse_vocab = {v: k for k, v in vocab.items()}

    points = tsneEmbedding.T
    random_ix = np.random.choice(points.shape[0], samples)
    sampled = points[random_ix].T

    # list(...) is needed on Python 3, where dict.keys() is a view that
    # numpy would wrap as a 0-d object array that cannot be indexed.
    # NOTE(review): picking keys by position assumes the key order of
    # reverse_vocab lines up with the embedding's point order — confirm.
    keys = np.array(list(reverse_vocab.keys()))[random_ix]

    plt.figure(figsize=(25, 25))
    plt.scatter(sampled[0], sampled[1])
    plt.title("TSNE Word Representation from random %s words " % samples)
    for i, key in enumerate(keys):
        plt.annotate(reverse_vocab[key], (sampled[0][i], sampled[1][i]))
    plt.savefig(path_to_save)
# Script fragment: restore a saved PTBModel checkpoint and pull the
# embedding and softmax parameters out of the session as concrete values.

# Override num_steps on the evaluation config.
eval_config.num_steps = 1
# Hard-coded dimensions; size and vocab_size are the variable shapes
# requested from the restored graph below.
size = 200
vocab_size = 10000
num_layers = 2
batch_size = 20
# Disabled code, kept for reference: fetching the per-layer LSTM matrices
# and biases.
# W0 = tf.get_variable("RNN/MultiRNNCell/Cell0/BasicLSTMCell/Linear/Matrix", [400, 800])
# b0 = tf.get_variable("RNN/MultiRNNCell/Cell0/BasicLSTMCell/Linear/Bias", [800])
# W0 = session.run(W0)
# b0 = session.run(b0)
# W1 = tf.get_variable("RNN/MultiRNNCell/Cell1/BasicLSTMCell/Linear/Matrix", [400, 800])
# b1 = tf.get_variable("RNN/MultiRNNCell/Cell1/BasicLSTMCell/Linear/Bias", [800])
# W1 = session.run(W1)
# b1 = session.run(b1)
import reader
# Vocabulary built from the PTB training file.
# NOTE(review): absolute, machine-specific path.
voc = reader._build_vocab(
    '/Users/marting/scratch/tensorflow/simple-examples/data/ptb.train.txt')
# Inverse of voc: maps each value of voc back to its key.
cov = {v: k for k, v in voc.items()}
with tf.Graph().as_default(), tf.Session() as session:
    # Build the model graph so the Saver below has variables to restore.
    # NOTE(review): the model is built from `config`, not the `eval_config`
    # modified above — confirm this is intended.
    with tf.variable_scope("model", reuse=None, initializer=None):
        m = PTBModel(is_training=False, config=config)
    saver = tf.train.Saver()
    # NOTE(review): absolute, machine-specific checkpoint path.
    saver.restore(session, "/Users/marting/scratch/tensorflow/model.ckpt")
    # reuse=True: get_variable returns the variables already created under
    # the "model" scope instead of creating new ones.
    with tf.variable_scope("model", reuse=True):
        print("Model restored.")
        embedding = tf.get_variable("embedding", [vocab_size, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
    # Replace the variable handles with their evaluated values.
    softmax_w = session.run(softmax_w)
    softmax_b = session.run(softmax_b)
    embedding = session.run(embedding)
# Script fragment: restore a saved PTBModel checkpoint and pull the
# embedding and softmax parameters out of the session as concrete values.

# Override num_steps on the evaluation config.
eval_config.num_steps = 1
# Hard-coded dimensions; size and vocab_size are the variable shapes
# requested from the restored graph below.
size = 200
vocab_size = 10000
num_layers = 2
batch_size = 20
# Disabled code, kept for reference: fetching the per-layer LSTM matrices
# and biases.
# W0 = tf.get_variable("RNN/MultiRNNCell/Cell0/BasicLSTMCell/Linear/Matrix", [400, 800])
# b0 = tf.get_variable("RNN/MultiRNNCell/Cell0/BasicLSTMCell/Linear/Bias", [800])
# W0 = session.run(W0)
# b0 = session.run(b0)
# W1 = tf.get_variable("RNN/MultiRNNCell/Cell1/BasicLSTMCell/Linear/Matrix", [400, 800])
# b1 = tf.get_variable("RNN/MultiRNNCell/Cell1/BasicLSTMCell/Linear/Bias", [800])
# W1 = session.run(W1)
# b1 = session.run(b1)
import reader
# Vocabulary built from the PTB training file.
# NOTE(review): absolute, machine-specific path.
voc = reader._build_vocab('/Users/marting/scratch/tensorflow/simple-examples/data/ptb.train.txt')
# Inverse of voc: maps each value of voc back to its key.
cov = {v:k for k,v in voc.items()}
with tf.Graph().as_default(), tf.Session() as session:
    # Build the model graph so the Saver below has variables to restore.
    # NOTE(review): the model is built from `config`, not the `eval_config`
    # modified above — confirm this is intended.
    with tf.variable_scope("model", reuse=None, initializer=None):
        m = PTBModel(is_training=False, config=config)
    saver = tf.train.Saver()
    # NOTE(review): absolute, machine-specific checkpoint path.
    saver.restore(session, "/Users/marting/scratch/tensorflow/model.ckpt")
    # reuse=True: get_variable returns the variables already created under
    # the "model" scope instead of creating new ones.
    with tf.variable_scope("model", reuse=True):
        print("Model restored.")
        embedding = tf.get_variable("embedding", [vocab_size, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
    # Replace the variable handles with their evaluated values.
    softmax_w = session.run(softmax_w)
    softmax_b = session.run(softmax_b)
    embedding = session.run(embedding)