def init():
  global vocab, vocab_size
  if vocab is None:
    logging.info('vocab:{}'.format(FLAGS.vocab))
    vocab = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)
    vocab_size = vocab.size() if not FLAGS.vocab_size else min(vocab.size(), FLAGS.vocab_size)
    logging.info('vocab_size:{}'.format(vocab_size))
    assert vocab_size > NUM_RESERVED_IDS, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab

def init(vocab_path=None):
  global vocab, vocab_size
  if vocab is None:
    if vocab_path is None:
      vocab_path = FLAGS.vocab
    logging.info('vocab:{}'.format(vocab_path))
    logging.info('NUM_RESERVED_IDS:{}'.format(FLAGS.num_reserved_ids))
    vocab = Vocabulary(vocab_path, FLAGS.num_reserved_ids)
    vocab_size = vocab.size() if not FLAGS.vocab_size else min(vocab.size(), FLAGS.vocab_size)
    logging.info('vocab_size:{}'.format(vocab_size))
    assert vocab_size > FLAGS.num_reserved_ids, 'empty vocab, wrong vocab path? %s' % vocab_path
    logging.info('vocab_start:{} id:{}'.format(vocab.key(vocab.start_id()), vocab.start_id()))
    logging.info('vocab_end:{} id:{}'.format(vocab.key(vocab.end_id()), vocab.end_id()))
    logging.info('vocab_unk:{} id:{}'.format(vocab.key(vocab.unk_id()), vocab.unk_id()))

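# A minimal usage sketch, not part of the original source: it assumes FLAGS has
# already been parsed so that FLAGS.vocab points at a real vocabulary file, and
# the hypothetical helper below only exercises the module-level `vocab` and
# `vocab_size` that init() fills in (OOV behavior depends on the Vocabulary
# implementation).
def _demo_vocab_lookup(word='hello'):
  init()
  word_id = vocab.id(word)    # token -> id
  token = vocab.key(word_id)  # id -> token
  print('vocab_size:%d %s -> %d -> %s' % (vocab_size, word, word_id, token))
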
from __future__ import print_function

import sys

import numpy as np
import melt
import gezi
#import libgezi

import conf
from conf import IMAGE_FEATURE_LEN, TEXT_MAX_WORDS, NUM_RESERVED_IDS, ENCODE_UNK

segmentor = gezi.Segmentor()

from libword_counter import Vocabulary

vocabulary = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)
print('vocab:', FLAGS.vocab, file=sys.stderr)
assert vocabulary.size() > NUM_RESERVED_IDS
print('vocab size:', vocabulary.size(), file=sys.stderr)

writer = None
if FLAGS.mode != 2:
  gezi.try_mkdir(FLAGS.output_directory)
  outfile = '%s/%s_%s' % (FLAGS.output_directory, FLAGS.name, FLAGS.part)
  print('outfile:', outfile, file=sys.stderr)
  writer = melt.tfrecords.Writer(outfile)

num = 0
count = 0
for line in sys.stdin:
  if num % 1000 == 0:

class Word2Vec(object):
  """Word2Vec model (Skipgram)."""

  def __init__(self, options, session):
    self._options = options
    self._session = session
    self.build_graph()
    self.build_eval_graph()

  def build_graph(self):
    """Build the model graph."""
    opts = self._options

    self.vocab = Vocabulary(opts.vocab_path, 1)  # num reserved ids is 1, <PAD> index 0
    opts.vocab_size = self.vocab.size()
    opts.vocab_counts = [int(self.vocab.freq(i)) for i in xrange(self.vocab.size())]
    print("Data file: ", opts.train_data)
    print("Vocab size: ", self.vocab.size())

    # The training data. A text file.
    (words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram_word2vec(filename=opts.train_data,
                                                    vocab_count=opts.vocab_counts,
                                                    batch_size=opts.batch_size,
                                                    window_size=opts.window_size,
                                                    min_count=opts.min_count,
                                                    subsample=opts.subsample)
    opts.words_per_epoch = self._session.run(words_per_epoch)
    print("Words per epoch: ", opts.words_per_epoch)

    # Declare all variables we need.
    # Input words embedding: [vocab_size, emb_dim]
    w_in = tf.Variable(
        tf.random_uniform([opts.vocab_size, opts.emb_dim],
                          -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
        name="w_in")
    tf.add_to_collection('word_embedding', w_in)

    # Output words embedding: [vocab_size, emb_dim]
    w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out")

    # Global step: scalar, i.e., shape [].
    global_step = tf.Variable(0, name="global_step")

    # Linear learning rate decay.
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001, 1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

    # Training nodes.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
      train = word2vec.neg_train_word2vec(w_in,
                                          w_out,
                                          examples,
                                          labels,
                                          lr,
                                          vocab_count=opts.vocab_counts,
                                          num_negative_samples=opts.num_samples)

    self._w_in = w_in
    self._examples = examples
    self._labels = labels
    self._lr = lr
    self._train = train
    self.global_step = global_step
    self._epoch = current_epoch
    self._words = total_words_processed

  def sum_embeddings(self, nearby_word):
    nearby_emb = tf.gather(self._w_in, nearby_word)
    nearby_emb = tf.reduce_sum(nearby_emb, 0, keep_dims=True)
    nearby_emb = tf.nn.l2_normalize(nearby_emb, 1)
    return nearby_emb

  def build_eval_graph(self):
    """Build the evaluation graph."""
    # Eval graph
    opts = self._options

    # Normalized word embeddings of shape [vocab_size, emb_dim].
    nemb = tf.nn.l2_normalize(self._w_in, 1)

    # Nodes for computing neighbors for a given word according to
    # their cosine distance.
    nearby_word = tf.placeholder(dtype=tf.int32)  # word id
    nearby_emb = self.sum_embeddings(nearby_word)
    nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True)
    tf.add_to_collection('nearby_dist', nearby_dist)
    nearby_val, nearby_idx = tf.nn.top_k(nearby_dist, min(1000, opts.vocab_size))
    tf.add_to_collection('nearby_val', nearby_val)
    tf.add_to_collection('nearby_idx', nearby_idx)

    nearby_word2 = tf.placeholder(dtype=tf.int32)
    nearby_emb2 = self.sum_embeddings(nearby_word2)
    textsim = tf.matmul(nearby_emb, nearby_emb2, transpose_b=True)

    self._nearby_word = nearby_word
    self._nearby_word2 = nearby_word2
    self._nearby_val = nearby_val
    self._nearby_idx = nearby_idx
    self._textsim = textsim

    # Properly initialize all variables.
    tf.global_variables_initializer().run()

    self.saver = tf.train.Saver()

  def _train_thread_body(self):
    initial_epoch, = self._session.run([self._epoch])
    while True:
      _, epoch = self._session.run([self._train, self._epoch])
      if epoch != initial_epoch:
        break

  def train(self):
    """Train the model."""
    opts = self._options

    initial_epoch, initial_words = self._session.run([self._epoch, self._words])

    workers = []
    for _ in xrange(opts.concurrent_steps):
      t = threading.Thread(target=self._train_thread_body)
      t.start()
      workers.append(t)

    last_words, last_time = initial_words, time.time()
    while True:
      time.sleep(5)  # Reports our progress once in a while.
      (epoch, step, words, lr) = self._session.run(
          [self._epoch, self.global_step, self._words, self._lr])
      now = time.time()
      last_words, last_time, rate = words, now, (words - last_words) / (now - last_time)
      print("Epoch %4d Step %8d: lr = %5.3f words/sec = %8.0f\r" %
            (epoch, step, lr, rate), end="")
      sys.stdout.flush()
      if epoch != initial_epoch:
        break

    for t in workers:
      t.join()

  def eval(self):
    self.nearby('面膜')
    self.nearby('润霜')
    self.nearby('乳霜')
    self.nearby('水 润霜')
    self.nearby('曼秀雷敦')

  def nearby(self, words, num=50):
    """Prints out nearby words given a list of words."""
    print(words)
    words = words.split()
    ids = np.array([self.vocab.id(x) for x in words])
    vals, idx = self._session.run([self._nearby_val, self._nearby_idx],
                                  {self._nearby_word: ids})
    i = 0
    for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
      print("%s %f" % (self.vocab.key(int(neighbor)), distance), end=' ')
    print('')

  def textsim(self, text, text2):
    words = text.split()
    ids = [self.vocab.id(x) for x in words]
    words2 = text2.split()
    ids2 = [self.vocab.id(x) for x in words2]
    score = self._session.run(self._textsim,
                              {self._nearby_word: ids, self._nearby_word2: ids2})
    print(text, '|', text2, ':', score)

  def dump_embedding(self, ofile):
    embedding = self._session.run(self._w_in)
    np.save(ofile, embedding)

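# A rough driver sketch, not part of the original file: `options` is whatever
# object carries the fields this class reads (vocab_path, train_data,
# batch_size, window_size, min_count, subsample, emb_dim, learning_rate,
# epochs_to_train, concurrent_steps, num_samples); the function name and the
# output file name are hypothetical.
def _train_and_dump(options, embedding_file='w_in.npy'):
  with tf.Graph().as_default(), tf.Session() as session:
    model = Word2Vec(options, session)
    for _ in xrange(options.epochs_to_train):
      model.train()      # each call runs one epoch across concurrent_steps threads
    model.eval()         # prints nearest neighbors for a few probe words
    model.dump_embedding(embedding_file)
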
class Word2Vec(object):
  """Word2Vec model (Skipgram)."""

  def __init__(self, options, session):
    self._options = options
    self._session = session
    self.build_graph()
    self.build_eval_graph()

  def forward(self, examples, labels):
    """Build the graph for the forward pass."""
    opts = self._options

    # Declare all variables we need.
    # Embedding: [vocab_size, emb_dim]
    init_width = 0.5 / opts.emb_dim
    emb = tf.Variable(
        tf.random_uniform([opts.vocab_size, opts.emb_dim], -init_width, init_width),
        name="emb")
    self._emb = emb

    # Softmax weight: [vocab_size, emb_dim]. Transposed.
    sm_w_t = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="sm_w_t")

    # Softmax bias: [vocab_size].
    sm_b = tf.Variable(tf.zeros([opts.vocab_size]), name="sm_b")

    # Global step: scalar, i.e., shape [].
    self.global_step = tf.Variable(0, name="global_step")

    # Nodes to compute the nce loss w/ candidate sampling.
    labels_matrix = tf.reshape(tf.cast(labels, dtype=tf.int64), [opts.batch_size, 1])

    # Negative sampling.
    sampled_ids, _, _ = tf.nn.fixed_unigram_candidate_sampler(
        true_classes=labels_matrix,
        num_true=1,
        num_sampled=opts.num_samples,
        unique=True,
        range_max=opts.vocab_size,
        distortion=0.75,
        unigrams=opts.vocab_counts)

    # Embeddings for examples: [batch_size, emb_dim]
    example_emb = tf.nn.embedding_lookup(emb, examples)

    # Weights for labels: [batch_size, emb_dim]
    true_w = tf.nn.embedding_lookup(sm_w_t, labels)
    # Biases for labels: [batch_size, 1]
    true_b = tf.nn.embedding_lookup(sm_b, labels)

    # Weights for sampled ids: [num_sampled, emb_dim]
    sampled_w = tf.nn.embedding_lookup(sm_w_t, sampled_ids)
    # Biases for sampled ids: [num_sampled, 1]
    sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids)

    # True logits: [batch_size, 1]
    true_logits = tf.reduce_sum(tf.multiply(example_emb, true_w), 1) + true_b

    # Sampled logits: [batch_size, num_sampled]
    # We replicate sampled noise labels for all examples in the batch
    # using the matmul.
    sampled_b_vec = tf.reshape(sampled_b, [opts.num_samples])
    sampled_logits = tf.matmul(example_emb, sampled_w, transpose_b=True) + sampled_b_vec
    return true_logits, sampled_logits

  def nce_loss(self, true_logits, sampled_logits):
    """Build the graph for the NCE loss."""
    # cross-entropy(logits, labels)
    opts = self._options
    true_xent = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(true_logits), logits=true_logits)
    sampled_xent = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(sampled_logits), logits=sampled_logits)

    # NCE-loss is the sum of the true and noise (sampled words)
    # contributions, averaged over the batch.
    nce_loss_tensor = (tf.reduce_sum(true_xent) +
                       tf.reduce_sum(sampled_xent)) / opts.batch_size
    return nce_loss_tensor

  def optimize(self, loss):
    """Build the graph to optimize the loss function."""
    # Optimizer nodes.
    # Linear learning rate decay.
    opts = self._options
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001, 1.0 - tf.cast(self._words, tf.float32) / words_to_train)
    self._lr = lr
    optimizer = tf.train.GradientDescentOptimizer(lr)
    train = optimizer.minimize(loss,
                               global_step=self.global_step,
                               gate_gradients=optimizer.GATE_NONE)
    self._train = train

  def build_eval_graph(self):
    """Build the eval graph."""
    # Eval graph
    opts = self._options

    # Normalized word embeddings of shape [vocab_size, emb_dim].
    nemb = tf.nn.l2_normalize(self._emb, 1)

    # Nodes for computing neighbors for a given word according to
    # their cosine distance.
    nearby_word = tf.placeholder(dtype=tf.int32)  # word id
    nearby_emb = tf.gather(self._emb, nearby_word)
    nearby_emb = tf.reduce_sum(nearby_emb, 0, keep_dims=True)
    nearby_emb = tf.nn.l2_normalize(nearby_emb, 1)
    nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True)
    tf.add_to_collection('nearby_dist', nearby_dist)
    nearby_val, nearby_idx = tf.nn.top_k(nearby_dist, min(1000, opts.vocab_size))
    tf.add_to_collection('nearby_val', nearby_val)
    tf.add_to_collection('nearby_idx', nearby_idx)

    self._nearby_word = nearby_word
    self._nearby_val = nearby_val
    self._nearby_idx = nearby_idx

  def build_graph(self):
    """Build the graph for the full model."""
    opts = self._options

    self.vocab = Vocabulary(opts.vocab_path, 1)  # num reserved ids is 1, <PAD> index 0
    opts.vocab_size = self.vocab.size()
    opts.vocab_counts = [int(self.vocab.freq(i)) for i in xrange(self.vocab.size())]
    print("Data file: ", opts.train_data)
    print("Vocab size: ", self.vocab.size())

    # The training data. A text file.
    (words_per_epoch, self._epoch, self._words,
     examples, labels) = word2vec.skipgram_word2vec(filename=opts.train_data,
                                                    vocab_count=opts.vocab_counts,
                                                    batch_size=opts.batch_size,
                                                    window_size=opts.window_size,
                                                    min_count=opts.min_count,
                                                    subsample=opts.subsample)
    opts.words_per_epoch = self._session.run(words_per_epoch)
    print("Words per epoch: ", opts.words_per_epoch)
    self._examples = examples
    self._labels = labels

    true_logits, sampled_logits = self.forward(examples, labels)
    loss = self.nce_loss(true_logits, sampled_logits)
    tf.summary.scalar("loss", loss)
    self._loss = loss
    self.optimize(loss)

    ##TODO not work for eval
    # (_, _, _, eval_examples, eval_labels) = word2vec.skipgram_word2vec(filename=opts.eval_data,
    #                                                                    vocab_count=opts.vocab_counts,
    #                                                                    batch_size=opts.batch_size,  #TODO must be same size as train right now
    #                                                                    window_size=opts.window_size,
    #                                                                    min_count=opts.min_count,
    #                                                                    subsample=0)
    # eval_true_logits, eval_sampled_logits = self.forward(eval_examples, eval_labels)
    # eval_loss = self.nce_loss(eval_true_logits, eval_sampled_logits)
    # tf.summary.scalar("eval loss", loss)
    # self._eval_loss = eval_loss

    # Properly initialize all variables.
    tf.global_variables_initializer().run()

    self.saver = tf.train.Saver()

  def _train_thread_body(self):
    initial_epoch, = self._session.run([self._epoch])
    while True:
      _, epoch = self._session.run([self._train, self._epoch])
      if epoch != initial_epoch:
        break

  def train(self):
    """Train the model."""
    opts = self._options

    initial_epoch, initial_words = self._session.run([self._epoch, self._words])

    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(opts.save_path, self._session.graph)
    workers = []
    for _ in xrange(opts.concurrent_steps):
      t = threading.Thread(target=self._train_thread_body)
      t.start()
      workers.append(t)

    last_words, last_time, last_summary_time = initial_words, time.time(), 0
    last_checkpoint_time = 0
    while True:
      time.sleep(opts.statistics_interval)  # Reports our progress once in a while.
      (epoch, step, loss, words, lr) = self._session.run(
          [self._epoch, self.global_step, self._loss, self._words, self._lr])
      # (epoch, step, loss, eval_loss, words, lr) = self._session.run(
      #     [self._epoch, self.global_step, self._loss, self._eval_loss, self._words, self._lr])
      now = time.time()
      last_words, last_time, rate = words, now, (words - last_words) / (now - last_time)
      # print("Epoch %4d Step %8d: lr = %5.3f loss = %6.2f eval_loss = %6.2f words/sec = %8.0f\r" %
      #       (epoch, step, lr, loss, eval_loss, rate), end="")
      print("Epoch %4d Step %8d: lr = %5.3f loss = %6.2f words/sec = %8.0f\r" %
            (epoch, step, lr, loss, rate), end="")
      sys.stdout.flush()
      if now - last_summary_time > opts.summary_interval:
        summary_str = self._session.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        last_summary_time = now
      if now - last_checkpoint_time > opts.checkpoint_interval:
        self.saver.save(self._session,
                        os.path.join(opts.save_path, "model.ckpt"),
                        global_step=step.astype(int))
        last_checkpoint_time = now
      if epoch != initial_epoch:
        break

    for t in workers:
      t.join()
    return epoch

  def eval(self):
    self.nearby('nike')
    self.nearby('墨镜')
    self.nearby('手表')
    self.nearby('高 铁')
    self.nearby('我 的 家乡 惠州 越来 越 热 , 找 一款 喜欢 的 墨镜 很 重要')

  def nearby(self, words, num=50):
    """Prints out nearby words given a list of words."""
    print(words)
    words = words.split()
    ids = np.array([self.vocab.id(x) for x in words])
    vals, idx = self._session.run([self._nearby_val, self._nearby_idx],
                                  {self._nearby_word: ids})
    i = 0
    for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
      print("%s %f" % (self.vocab.key(int(neighbor)), distance), end=' ')
    print('')

  def dump_embedding(self, ofile):
    embedding = self._session.run(self._emb)
    np.save(ofile, embedding)

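# A small offline sketch, not from the original source, of how the matrix saved
# by dump_embedding() could be consumed: the file name 'emb.npy' and the helper
# name are hypothetical; it recomputes cosine nearest neighbors with plain
# numpy instead of the in-graph eval nodes.
def _nearest_from_npy(vocab, word, npy_file='emb.npy', num=10):
  emb = np.load(npy_file)                                            # [vocab_size, emb_dim]
  emb = emb / (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12)   # l2-normalize rows
  query = emb[vocab.id(word)]
  scores = emb.dot(query)                                            # cosine similarity to every word
  for i in np.argsort(-scores)[:num]:
    print('%s %f' % (vocab.key(int(i)), scores[i]))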