Example #1
# Lazily build the module-level vocab / vocab_size globals from FLAGS on first call.
def init():
    global vocab, vocab_size
    if vocab is None:
        logging.info('vocab:{}'.format(FLAGS.vocab))
        vocab = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)
        vocab_size = vocab.size() if not FLAGS.vocab_size else min(
            vocab.size(), FLAGS.vocab_size)
        logging.info('vocab_size:{}'.format(vocab_size))
        assert vocab_size > NUM_RESERVED_IDS, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab
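The init() above reads FLAGS.vocab and FLAGS.vocab_size, which must be defined before it runs. A minimal sketch of matching flag definitions (the flag module is an assumption; the TF-1.x style of the surrounding code suggests tf.app.flags, but the original repository may define these elsewhere):

import tensorflow as tf

flags = tf.app.flags
FLAGS = flags.FLAGS
# Hypothetical flag definitions mirroring what init() reads; defaults are placeholders.
flags.DEFINE_string('vocab', None, 'path to the vocabulary file')
flags.DEFINE_integer('vocab_size', 0, '0 means use the full vocabulary')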
Example #2
# Initialize the module-level vocab, optionally from an explicit vocab_path.
def init(vocab_path=None):
    global vocab, vocab_size
    if vocab is None:
        if vocab_path is None:
            vocab_path = FLAGS.vocab
        logging.info('vocab:{}'.format(vocab_path))
        logging.info('NUM_RESERVED_IDS:{}'.format(FLAGS.num_reserved_ids))
        vocab = Vocabulary(vocab_path, FLAGS.num_reserved_ids)
        vocab_size = vocab.size() if not FLAGS.vocab_size else min(
            vocab.size(), FLAGS.vocab_size)
        logging.info('vocab_size:{}'.format(vocab_size))
        assert vocab_size > FLAGS.num_reserved_ids, 'empty vocab, wrong vocab path? %s' % vocab_path
        logging.info('vocab_start:{} id:{}'.format(vocab.key(vocab.start_id()),
                                                   vocab.start_id()))
        logging.info('vocab_end:{} id:{}'.format(vocab.key(vocab.end_id()),
                                                 vocab.end_id()))
        logging.info('vocab_unk:{} id:{}'.format(vocab.key(vocab.unk_id()),
                                                 vocab.unk_id()))
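A minimal usage sketch, assuming init() has been called and using only the accessors the example already logs (start_id, end_id, id); the helper name is hypothetical:

# Hypothetical helper (not part of the example): wrap a whitespace-tokenized
# caption with the vocabulary's start/end markers.
def caption_to_ids(caption):
    init()
    return ([vocab.start_id()]
            + [vocab.id(w) for w in caption.split()]
            + [vocab.end_id()])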
Example #3
from __future__ import print_function

import sys

import numpy as np
import melt
import gezi

#import libgezi

import conf
from conf import IMAGE_FEATURE_LEN, TEXT_MAX_WORDS, NUM_RESERVED_IDS, ENCODE_UNK

segmentor = gezi.Segmentor()

from libword_counter import Vocabulary

# FLAGS is assumed to be defined by the application's flag parser (not shown in this snippet).
vocabulary = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)

print('vocab:', FLAGS.vocab, file=sys.stderr)
assert vocabulary.size() > NUM_RESERVED_IDS
print('vocab size:', vocabulary.size(), file=sys.stderr)

writer = None
if FLAGS.mode != 2:
    gezi.try_mkdir(FLAGS.output_directory)

    outfile = '%s/%s_%s' % (FLAGS.output_directory, FLAGS.name, FLAGS.part)
    print('outfile:', outfile, file=sys.stderr)

    writer = melt.tfrecords.Writer(outfile)

num = 0
count = 0
for line in sys.stdin:
    if num % 1000 == 0:
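The example is cut off mid-loop in the source, so the loop body is not reproduced here. For orientation only, a hedged sketch of the id-mapping step such preprocessing typically performs with this Vocabulary (the helper name and the TEXT_MAX_WORDS truncation are assumptions; the TFRecord writing step is omitted because the Writer API is not shown in this snippet):

# Hypothetical sketch, not the original loop body: map a pre-segmented,
# whitespace-separated line to at most TEXT_MAX_WORDS vocabulary ids.
def line_to_word_ids(line):
    words = line.rstrip('\n').split()
    return [vocabulary.id(w) for w in words[:TEXT_MAX_WORDS]]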
Example #4
class Word2Vec(object):
    """Word2Vec model (Skipgram)."""
    def __init__(self, options, session):
        self._options = options
        self._session = session
        self.build_graph()
        self.build_eval_graph()

    def build_graph(self):
        """Build the model graph."""
        opts = self._options

        self.vocab = Vocabulary(opts.vocab_path,
                                1)  # num reserved ids is 1; <PAD> is index 0
        opts.vocab_size = self.vocab.size()
        opts.vocab_counts = [
            int(self.vocab.freq(i)) for i in xrange(self.vocab.size())
        ]
        print("Data file: ", opts.train_data)
        print("Vocab size: ", self.vocab.size())

        # The training data. A text file.
        (words_per_epoch, current_epoch, total_words_processed, examples,
         labels) = word2vec.skipgram_word2vec(filename=opts.train_data,
                                              vocab_count=opts.vocab_counts,
                                              batch_size=opts.batch_size,
                                              window_size=opts.window_size,
                                              min_count=opts.min_count,
                                              subsample=opts.subsample)
        opts.words_per_epoch = self._session.run(words_per_epoch)

        print("Words per epoch: ", opts.words_per_epoch)

        # Declare all variables we need.
        # Input words embedding: [vocab_size, emb_dim]
        w_in = tf.Variable(tf.random_uniform([opts.vocab_size, opts.emb_dim],
                                             -0.5 / opts.emb_dim,
                                             0.5 / opts.emb_dim),
                           name="w_in")

        tf.add_to_collection('word_embedding', w_in)

        # Output (softmax) weights: [vocab_size, emb_dim].
        w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]),
                            name="w_out")

        # Global step: []
        global_step = tf.Variable(0, name="global_step")

        # Linear learning rate decay.
        words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
        lr = opts.learning_rate * tf.maximum(
            0.0001,
            1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

        # Training nodes.
        inc = global_step.assign_add(1)
        with tf.control_dependencies([inc]):
            train = word2vec.neg_train_word2vec(
                w_in,
                w_out,
                examples,
                labels,
                lr,
                vocab_count=opts.vocab_counts,
                num_negative_samples=opts.num_samples)

        self._w_in = w_in
        self._examples = examples
        self._labels = labels
        self._lr = lr
        self._train = train
        self.global_step = global_step
        self._epoch = current_epoch
        self._words = total_words_processed

    def sum_embeddings(self, nearby_word):
        nearby_emb = tf.gather(self._w_in, nearby_word)
        nearby_emb = tf.reduce_sum(nearby_emb, 0, keep_dims=True)
        nearby_emb = tf.nn.l2_normalize(nearby_emb, 1)
        return nearby_emb

    def build_eval_graph(self):
        """Build the evaluation graph."""
        # Eval graph
        opts = self._options

        # Normalized word embeddings of shape [vocab_size, emb_dim].
        nemb = tf.nn.l2_normalize(self._w_in, 1)

        # Nodes for computing neighbors for a given word according to
        # their cosine distance.
        nearby_word = tf.placeholder(dtype=tf.int32)  # word id
        nearby_emb = self.sum_embeddings(nearby_word)
        nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True)
        tf.add_to_collection('nearby_dist', nearby_dist)
        nearby_val, nearby_idx = tf.nn.top_k(nearby_dist,
                                             min(1000, opts.vocab_size))
        tf.add_to_collection('nearby_val', nearby_val)
        tf.add_to_collection('nearby_idx', nearby_idx)

        nearby_word2 = tf.placeholder(dtype=tf.int32)
        nearby_emb2 = self.sum_embeddings(nearby_word2)
        textsim = tf.matmul(nearby_emb, nearby_emb2, transpose_b=True)

        self._nearby_word = nearby_word
        self._nearby_word2 = nearby_word2
        self._nearby_val = nearby_val
        self._nearby_idx = nearby_idx
        self._textsim = textsim

        # Properly initialize all variables.
        tf.global_variables_initializer().run()

        self.saver = tf.train.Saver()

    def _train_thread_body(self):
        initial_epoch, = self._session.run([self._epoch])
        while True:
            _, epoch = self._session.run([self._train, self._epoch])
            if epoch != initial_epoch:
                break

    def train(self):
        """Train the model."""
        opts = self._options

        initial_epoch, initial_words = self._session.run(
            [self._epoch, self._words])

        workers = []
        for _ in xrange(opts.concurrent_steps):
            t = threading.Thread(target=self._train_thread_body)
            t.start()
            workers.append(t)

        last_words, last_time = initial_words, time.time()
        while True:
            time.sleep(5)  # Report our progress once in a while.
            (epoch, step, words, lr) = self._session.run(
                [self._epoch, self.global_step, self._words, self._lr])
            now = time.time()
            last_words, last_time, rate = words, now, (words - last_words) / (
                now - last_time)
            print("Epoch %4d Step %8d: lr = %5.3f words/sec = %8.0f\r" %
                  (epoch, step, lr, rate),
                  end="")
            sys.stdout.flush()
            if epoch != initial_epoch:
                break

        for t in workers:
            t.join()

    def eval(self):
        self.nearby('面膜')
        self.nearby('润霜')
        self.nearby('乳霜')
        self.nearby('水 润霜')
        self.nearby('曼秀雷敦')

    def nearby(self, words, num=50):
        """Prints out nearby words given a list of words."""
        print(words)
        words = words.split()
        ids = np.array([self.vocab.id(x) for x in words])
        vals, idx = self._session.run([self._nearby_val, self._nearby_idx],
                                      {self._nearby_word: ids})
        i = 0
        for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
            print("%s %f" % (self.vocab.key(int(neighbor)), distance), end=' ')
        print('')

    def textsim(self, text, text2):
        words = text.split()
        ids = [self.vocab.id(x) for x in words]
        words2 = text2.split()
        ids2 = [self.vocab.id(x) for x in words2]
        score = self._session.run(self._textsim, {
            self._nearby_word: ids,
            self._nearby_word2: ids2
        })
        print(text, '|', text2, ':', score)

    def dump_embedding(self, ofile):
        embedding = self._session.run(self._w_in)
        np.save(ofile, embedding)
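A hedged driver sketch for this class. The Options fields below are assumptions inferred from the attributes the class reads, the values are placeholders, and the example's implicit imports (tensorflow as tf, numpy as np, threading, the custom word2vec ops, and Vocabulary) are assumed to be in scope:

# Hypothetical driver, not part of the example.
class Options(object):
    vocab_path = 'vocab.txt'        # placeholder paths
    train_data = 'train.txt'
    emb_dim = 256
    batch_size = 256
    window_size = 5
    min_count = 5
    subsample = 1e-3
    learning_rate = 0.2
    epochs_to_train = 15
    num_samples = 100
    concurrent_steps = 12

opts = Options()
with tf.Graph().as_default(), tf.Session() as session:
    model = Word2Vec(opts, session)
    for _ in xrange(opts.epochs_to_train):
        model.train()                       # train() runs a single epoch per call
    model.eval()                            # prints nearby words for the built-in queries
    model.dump_embedding('word_embedding.npy')  # placeholder output file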
Example #5
class Word2Vec(object):
  """Word2Vec model (Skipgram)."""

  def __init__(self, options, session):
    self._options = options
    self._session = session
    self.build_graph()
    self.build_eval_graph()

  def forward(self, examples, labels):
    """Build the graph for the forward pass."""
    opts = self._options

    # Declare all variables we need.
    # Embedding: [vocab_size, emb_dim]
    init_width = 0.5 / opts.emb_dim
    emb = tf.Variable(
        tf.random_uniform(
            [opts.vocab_size, opts.emb_dim], -init_width, init_width),
        name="emb")
    self._emb = emb

    # Softmax weight: [vocab_size, emb_dim]. Transposed.
    sm_w_t = tf.Variable(
        tf.zeros([opts.vocab_size, opts.emb_dim]),
        name="sm_w_t")

    # Softmax bias: [vocab_size].
    sm_b = tf.Variable(tf.zeros([opts.vocab_size]), name="sm_b")

    # Global step: scalar, i.e., shape [].
    self.global_step = tf.Variable(0, name="global_step")

    # Nodes to compute the nce loss w/ candidate sampling.
    labels_matrix = tf.reshape(
        tf.cast(labels,
                dtype=tf.int64),
        [opts.batch_size, 1])

    # Negative sampling.
    sampled_ids, _, _ = (tf.nn.fixed_unigram_candidate_sampler(
        true_classes=labels_matrix,
        num_true=1,
        num_sampled=opts.num_samples,
        unique=True,
        range_max=opts.vocab_size,
        distortion=0.75,
        unigrams=opts.vocab_counts))

    # Embeddings for examples: [batch_size, emb_dim]
    example_emb = tf.nn.embedding_lookup(emb, examples)

    # Weights for labels: [batch_size, emb_dim]
    true_w = tf.nn.embedding_lookup(sm_w_t, labels)
    # Biases for labels: [batch_size, 1]
    true_b = tf.nn.embedding_lookup(sm_b, labels)

    # Weights for sampled ids: [num_sampled, emb_dim]
    sampled_w = tf.nn.embedding_lookup(sm_w_t, sampled_ids)
    # Biases for sampled ids: [num_sampled, 1]
    sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids)

    # True logits: [batch_size, 1]
    true_logits = tf.reduce_sum(tf.multiply(example_emb, true_w), 1) + true_b

    # Sampled logits: [batch_size, num_sampled]
    # We replicate sampled noise labels for all examples in the batch
    # using the matmul.
    sampled_b_vec = tf.reshape(sampled_b, [opts.num_samples])
    sampled_logits = tf.matmul(example_emb,
                               sampled_w,
                               transpose_b=True) + sampled_b_vec
    return true_logits, sampled_logits

  def nce_loss(self, true_logits, sampled_logits):
    """Build the graph for the NCE loss."""

    # cross-entropy(logits, labels)
    opts = self._options
    true_xent = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(true_logits), logits=true_logits)
    sampled_xent = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(sampled_logits), logits=sampled_logits)

    # NCE-loss is the sum of the true and noise (sampled words)
    # contributions, averaged over the batch.
    nce_loss_tensor = (tf.reduce_sum(true_xent) +
                       tf.reduce_sum(sampled_xent)) / opts.batch_size
    return nce_loss_tensor

  def optimize(self, loss):
    """Build the graph to optimize the loss function."""

    # Optimizer nodes.
    # Linear learning rate decay.
    opts = self._options
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001, 1.0 - tf.cast(self._words, tf.float32) / words_to_train)
    self._lr = lr
    optimizer = tf.train.GradientDescentOptimizer(lr)
    train = optimizer.minimize(loss,
                               global_step=self.global_step,
                               gate_gradients=optimizer.GATE_NONE)
    self._train = train

  def build_eval_graph(self):
    """Build the eval graph."""
    # Eval graph
    opts = self._options
    # Normalized word embeddings of shape [vocab_size, emb_dim].
    nemb = tf.nn.l2_normalize(self._emb, 1)

    # Nodes for computing neighbors for a given word according to
    # their cosine distance.
    nearby_word = tf.placeholder(dtype=tf.int32)  # word id
    nearby_emb = tf.gather(self._emb, nearby_word)
    nearby_emb = tf.reduce_sum(nearby_emb, 0, keep_dims=True)
    nearby_emb = tf.nn.l2_normalize(nearby_emb, 1)
    nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True)
    tf.add_to_collection('nearby_dist', nearby_dist)
    nearby_val, nearby_idx = tf.nn.top_k(nearby_dist,
                                         min(1000, opts.vocab_size))
    tf.add_to_collection('nearby_val', nearby_val)
    tf.add_to_collection('nearby_idx', nearby_idx)


    self._nearby_word = nearby_word
    self._nearby_val = nearby_val
    self._nearby_idx = nearby_idx

  def build_graph(self):
    """Build the graph for the full model."""
    opts = self._options

    self.vocab = Vocabulary(opts.vocab_path, 1)  # num reserved ids is 1; <PAD> is index 0
    opts.vocab_size = self.vocab.size()
    opts.vocab_counts = [int(self.vocab.freq(i)) for i in xrange(self.vocab.size())]
    print("Data file: ", opts.train_data)
    print("Vocab size: ", self.vocab.size())

    # The training data. A text file.
    (words_per_epoch, self._epoch, self._words, examples,
     labels) = word2vec.skipgram_word2vec(filename=opts.train_data,
                                          vocab_count=opts.vocab_counts,
                                          batch_size=opts.batch_size,
                                          window_size=opts.window_size,
                                          min_count=opts.min_count,
                                          subsample=opts.subsample)
    
    opts.words_per_epoch = self._session.run(words_per_epoch)
    print("Words per epoch: ", opts.words_per_epoch)
    self._examples = examples
    self._labels = labels

    true_logits, sampled_logits = self.forward(examples, labels)
    loss = self.nce_loss(true_logits, sampled_logits)
    tf.summary.scalar("loss", loss)
    self._loss = loss
    self.optimize(loss)

    ## TODO: this does not work for eval yet
    # (_, _, _, eval_examples, eval_labels) = word2vec.skipgram_word2vec(filename=opts.eval_data,
    #                                         vocab_count=opts.vocab_counts,
    #                                         batch_size=opts.batch_size, #TODO must be same size as train right now
    #                                         window_size=opts.window_size,
    #                                         min_count=opts.min_count,
    #                                         subsample=0)
    # eval_true_logits, eval_sampled_logits = self.forward(eval_examples, eval_labels)
    # eval_loss = self.nce_loss(eval_true_logits, eval_sampled_logits)
    # tf.summary.scalar("eval loss", loss)
    # self._eval_loss = eval_loss

    # Properly initialize all variables.
    tf.global_variables_initializer().run()

    self.saver = tf.train.Saver()


  def _train_thread_body(self):
    initial_epoch, = self._session.run([self._epoch])
    while True:
      _, epoch = self._session.run([self._train, self._epoch])
      if epoch != initial_epoch:
        break

  def train(self):
    """Train the model."""
    opts = self._options

    initial_epoch, initial_words = self._session.run([self._epoch, self._words])

    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(opts.save_path, self._session.graph)
    workers = []
    for _ in xrange(opts.concurrent_steps):
      t = threading.Thread(target=self._train_thread_body)
      t.start()
      workers.append(t)

    last_words, last_time, last_summary_time = initial_words, time.time(), 0
    last_checkpoint_time = 0
    while True:
      time.sleep(opts.statistics_interval)  # Report our progress once in a while.
      (epoch, step, loss, words, lr) = self._session.run(
          [self._epoch, self.global_step, self._loss, self._words, self._lr])
      # (epoch, step, loss, eval_loss, words, lr) = self._session.run(
      #     [self._epoch, self.global_step, self._loss, self._eval_loss, self._words, self._lr])
      now = time.time()
      last_words, last_time, rate = words, now, (words - last_words) / (
          now - last_time)
      # print("Epoch %4d Step %8d: lr = %5.3f loss = %6.2f eval_loss = %6.2f words/sec = %8.0f\r" %
      #       (epoch, step, lr, loss, eval_loss, rate), end="")
      print("Epoch %4d Step %8d: lr = %5.3f loss = %6.2f words/sec = %8.0f\r" %
            (epoch, step, lr, loss, eval_loss, rate), end="")
      sys.stdout.flush()
      if now - last_summary_time > opts.summary_interval:
        summary_str = self._session.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        last_summary_time = now
      if now - last_checkpoint_time > opts.checkpoint_interval:
        self.saver.save(self._session,
                        os.path.join(opts.save_path, "model.ckpt"),
                        global_step=step.astype(int))
        last_checkpoint_time = now
      if epoch != initial_epoch:
        break

    for t in workers:
      t.join()

    return epoch


  def eval(self):
    self.nearby('nike')
    self.nearby('墨镜')
    self.nearby('手表')
    self.nearby('高 铁')
    self.nearby('我 的 家乡 惠州 越来 越 热 , 找 一款 喜欢 的 墨镜 很 重要')

  def nearby(self, words, num=50):
    """Prints out nearby words given a list of words."""
    print(words)
    words = words.split()
    ids = np.array([self.vocab.id(x) for x in words])
    vals, idx = self._session.run(
        [self._nearby_val, self._nearby_idx], {self._nearby_word: ids})
    i = 0
    for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
      print("%s %f" % (self.vocab.key(int(neighbor)), distance), end=' ')
    print('')

  def dump_embedding(self, ofile):
    embedding = self._session.run(self._emb)
    np.save(ofile, embedding)
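In this version train() returns the epoch counter and writes summaries and periodic checkpoints itself, so a typical outer loop looks like the hedged sketch below (Options is the same kind of placeholder object as above, extended with save_path, statistics_interval, summary_interval and checkpoint_interval; os, tensorflow, numpy, threading and the word2vec ops are assumed in scope, as in the example; file names are assumptions):

# Hypothetical outer loop, not part of the example.
opts = Options()
with tf.Graph().as_default(), tf.Session() as session:
  model = Word2Vec(opts, session)
  for _ in xrange(opts.epochs_to_train):
    model.train()                    # one epoch per call; summaries/checkpoints written inside
  model.eval()                       # prints nearby words for the built-in queries
  model.saver.save(session, os.path.join(opts.save_path, "model.ckpt"))
  model.dump_embedding(os.path.join(opts.save_path, "word_embedding.npy"))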