Example #1
def init():
    """Lazily build the global vocabulary from FLAGS.vocab on first call."""
    global vocab, vocab_size
    if vocab is None:
        logging.info('vocab:{}'.format(FLAGS.vocab))
        vocab = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)
        vocab_size = vocab.size() if not FLAGS.vocab_size else min(
            vocab.size(), FLAGS.vocab_size)
        logging.info('vocab_size:{}'.format(vocab_size))
        assert vocab_size > NUM_RESERVED_IDS, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab
Example #2
  def build_graph(self):
    """Build the graph for the full model."""
    opts = self._options

    self.vocab = Vocabulary(opts.vocab_path, 1)  # num reserved ids is 1; <PAD> takes index 0
    opts.vocab_size = self.vocab.size()
    opts.vocab_counts = [int(self.vocab.freq(i)) for i in xrange(self.vocab.size())]
    print("Data file: ", opts.train_data)
    print("Vocab size: ", self.vocab.size())

    # The training data. A text file.
    (words_per_epoch, self._epoch, self._words, examples,
     labels) = word2vec.skipgram_word2vec(filename=opts.train_data,
                                          vocab_count=opts.vocab_counts,
                                          batch_size=opts.batch_size,
                                          window_size=opts.window_size,
                                          min_count=opts.min_count,
                                          subsample=opts.subsample)
    
    opts.words_per_epoch = self._session.run(words_per_epoch)
    print("Words per epoch: ", opts.words_per_epoch)
    self._examples = examples
    self._labels = labels

    true_logits, sampled_logits = self.forward(examples, labels)
    loss = self.nce_loss(true_logits, sampled_logits)
    tf.summary.scalar("loss", loss)
    self._loss = loss
    self.optimize(loss)

    ## TODO: does not work for eval yet
    # (_, _, _, eval_examples, eval_labels) = word2vec.skipgram_word2vec(filename=opts.eval_data,
    #                                         vocab_count=opts.vocab_counts,
    #                                         batch_size=opts.batch_size, #TODO must be same size as train right now
    #                                         window_size=opts.window_size,
    #                                         min_count=opts.min_count,
    #                                         subsample=0)
    # eval_true_logits, eval_sampled_logits = self.forward(eval_examples, eval_labels)
    # eval_loss = self.nce_loss(eval_true_logits, eval_sampled_logits)
    # tf.summary.scalar("eval loss", loss)
    # self._eval_loss = eval_loss

    # Properly initialize all variables.
    tf.global_variables_initializer().run()

    self.saver = tf.train.Saver()
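
Example #2 calls `self.forward` and `self.nce_loss`, which are not shown in the snippet. The sketch below is an assumption of what they typically look like, modeled on the TensorFlow word2vec tutorial that this code appears to follow; the option names (`opts.emb_dim`, `opts.num_samples`, `opts.batch_size`, `opts.vocab_counts`) are taken from Examples #2 and #8, everything else is illustrative rather than the author's actual implementation.

  def forward(self, examples, labels):
    """Sketch of the skip-gram forward pass (assumed, following the TF word2vec tutorial)."""
    opts = self._options

    # Input embedding and output (softmax) weights: [vocab_size, emb_dim].
    init_width = 0.5 / opts.emb_dim
    emb = tf.Variable(
        tf.random_uniform([opts.vocab_size, opts.emb_dim], -init_width, init_width),
        name="emb")
    sm_w_t = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="sm_w_t")
    sm_b = tf.Variable(tf.zeros([opts.vocab_size]), name="sm_b")
    self._emb = emb

    # Draw negative samples from the unigram distribution raised to the 0.75 power.
    labels_matrix = tf.reshape(tf.cast(labels, dtype=tf.int64), [opts.batch_size, 1])
    sampled_ids, _, _ = tf.nn.fixed_unigram_candidate_sampler(
        true_classes=labels_matrix,
        num_true=1,
        num_sampled=opts.num_samples,
        unique=True,
        range_max=opts.vocab_size,
        distortion=0.75,
        unigrams=opts.vocab_counts)

    # Logits for the true (example, label) pairs and for the sampled negatives.
    example_emb = tf.nn.embedding_lookup(emb, examples)
    true_w = tf.nn.embedding_lookup(sm_w_t, labels)
    true_b = tf.nn.embedding_lookup(sm_b, labels)
    true_logits = tf.reduce_sum(tf.multiply(example_emb, true_w), 1) + true_b

    sampled_w = tf.nn.embedding_lookup(sm_w_t, sampled_ids)
    sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids)
    sampled_logits = tf.matmul(example_emb, sampled_w, transpose_b=True) + sampled_b
    return true_logits, sampled_logits

  def nce_loss(self, true_logits, sampled_logits):
    """NCE loss: true pairs are pushed toward 1, sampled negatives toward 0."""
    opts = self._options
    true_xent = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(true_logits), logits=true_logits)
    sampled_xent = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(sampled_logits), logits=sampled_logits)
    return (tf.reduce_sum(true_xent) + tf.reduce_sum(sampled_xent)) / opts.batch_size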
Example #3
def init(vocab_path=None):
    """Lazily build the global vocabulary, defaulting to FLAGS.vocab when no path is given."""
    global vocab, vocab_size
    if vocab is None:
        if vocab_path is None:
            vocab_path = FLAGS.vocab
        logging.info('vocab:{}'.format(vocab_path))
        logging.info('NUM_RESERVED_IDS:{}'.format(FLAGS.num_reserved_ids))
        vocab = Vocabulary(vocab_path, FLAGS.num_reserved_ids)
        vocab_size = vocab.size() if not FLAGS.vocab_size else min(
            vocab.size(), FLAGS.vocab_size)
        logging.info('vocab_size:{}'.format(vocab_size))
        assert vocab_size > FLAGS.num_reserved_ids, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab
        logging.info('vocab_start:{} id:{}'.format(vocab.key(vocab.start_id()),
                                                   vocab.start_id()))
        logging.info('vocab_end:{} id:{}'.format(vocab.key(vocab.end_id()),
                                                 vocab.end_id()))
        logging.info('vocab_unk:{} id:{}'.format(vocab.key(vocab.unk_id()),
                                                 vocab.unk_id()))
Example #4
import sys, os
import multiprocessing
from multiprocessing import Process, Manager, Value

import numpy as np
import melt
import gezi

import nowarning
#from libsegment import *
#need ./data ./conf
#Segmentor.Init()
segmentor = gezi.Segmentor()

from libword_counter import Vocabulary
vocabulary = Vocabulary(FLAGS.vocab)

import conf
from conf import TEXT_MAX_WORDS, IMAGE_FEATURE_LEN

texts = []
text_strs = []

manager = Manager()
texts_dict = manager.dict()
text_strs_dict = manager.dict()
# One independent list per thread ([[]] * n would make every slot alias the same list).
gtexts = [[] for _ in range(FLAGS.threads)]
gtext_strs = [[] for _ in range(FLAGS.threads)]

#how many records generated
counter = Value('i', 0)
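
Example #4 only sets up the shared state: a `Manager` dict per output, a shared `Value` counter, and per-thread lists. The sketch below is a hedged guess at how a worker process might use them; `deal_file` and the shard arguments are hypothetical, and the `Vocabulary`/`Segmentor` calls are only the ones seen in Examples #7 and #9.

def deal_file(index, lines):
    """Hypothetical worker: segment lines, map words to ids, record results (illustrative only)."""
    local_texts, local_text_strs = [], []
    for line in lines:
        text = line.strip()
        words = segmentor.Segment(text)                       # gezi.Segmentor usage as in Example #7
        word_ids = [vocabulary.id(w) for w in words if vocabulary.has(w)]
        local_texts.append(word_ids[:TEXT_MAX_WORDS])
        local_text_strs.append(text)
        with counter.get_lock():                              # Value('i', 0) exposes get_lock()
            counter.value += 1
    texts_dict[index] = local_texts                           # Manager dicts are process-safe
    text_strs_dict[index] = local_text_strs

# e.g. one process per input shard (shards is a hypothetical list of line lists):
# processes = [Process(target=deal_file, args=(i, shard)) for i, shard in enumerate(shards)]
# for p in processes: p.start()
# for p in processes: p.join()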
Example #5
"""

import sys, os
import numpy as np
import melt
import gezi

#import libgezi

import conf
from conf import IMAGE_FEATURE_LEN, TEXT_MAX_WORDS, NUM_RESERVED_IDS, ENCODE_UNK

segmentor = gezi.Segmentor()

from libword_counter import Vocabulary
vocabulary = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)

print('vocab:', FLAGS.vocab, file=sys.stderr)
assert vocabulary.size() > NUM_RESERVED_IDS
print('vocab size:', vocabulary.size(), file=sys.stderr)

writer = None
if FLAGS.mode != 2:
    gezi.try_mkdir(FLAGS.output_directory)

    outfile = '%s/%s_%s' % (FLAGS.output_directory, FLAGS.name, FLAGS.part)
    print('outfile:', outfile, file=sys.stderr)

    writer = melt.tfrecords.Writer(outfile)

num = 0
Example #6

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys, os
import tensorflow as tf
import melt

from libword_counter import Vocabulary

dir = '/home/gezi/new/temp/makeup/title2name/tfrecord/seq-basic/'

vocab = Vocabulary(os.path.join(dir, 'vocab.txt'), 1)

embsim = melt.EmbeddingSim(os.path.join(dir, 'word2vec'), name='w_in')

wid = vocab.id('曼秀雷敦')

wid_ = tf.placeholder(dtype=tf.int32, shape=[None, 1])
nids_ = embsim.nearby(wid_)

sess = embsim._sess

#nids = sess.run(nids_, {wid_: wid})
values, indices = sess.run(nids_, {wid_: [[wid]]})

for index, value in zip(indices[0], values[0]):
    print(vocab.key(int(index)), value)
Example #7
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

import gezi
import libsegment

seg = gezi.Segmentor()

print('\t'.join(seg.Segment('美女一定要支持')))
print('\x01'.join(seg.Segment('Oh q the same thing to me')))
print('\x01'.join(seg.Segment('Oh q the same thing to me', 'phrase_single')))
print('\x01'.join(seg.Segment('Oh q the same thing to me', 'phrase')))
print('\t'.join(seg.Segment('绿鹭')))
print('\t'.join(seg.segment('绿鹭')))
print('\t'.join(seg.segment_phrase('绿鹭')))
print('\t'.join(gezi.seg.Segment('绿鹭', libsegment.SEG_NEWWORD)))
print('\t'.join(gezi.seg.Segment('绿鹭')))

print('|'.join(gezi.segment_char('a baby is looking at 我的小伙伴oh 我不no no没关系 是不是   tian, that not ')))


from libword_counter import Vocabulary

v = Vocabulary('/home/gezi/temp/textsum/tfrecord/seq-basic.10w/train/vocab.txt', 2)
print(v.id('美女'))
print(v.key(v.id('美女')))
Example #8
    def build_graph(self):
        """Build the model graph."""
        opts = self._options

        self.vocab = Vocabulary(opts.vocab_path,
                                1)  # num reserved ids is 1; <PAD> takes index 0
        opts.vocab_size = self.vocab.size()
        opts.vocab_counts = [
            int(self.vocab.freq(i)) for i in xrange(self.vocab.size())
        ]
        print("Data file: ", opts.train_data)
        print("Vocab size: ", self.vocab.size())

        # The training data. A text file.
        (words_per_epoch, current_epoch, total_words_processed, examples,
         labels) = word2vec.skipgram_word2vec(filename=opts.train_data,
                                              vocab_count=opts.vocab_counts,
                                              batch_size=opts.batch_size,
                                              window_size=opts.window_size,
                                              min_count=opts.min_count,
                                              subsample=opts.subsample)
        opts.words_per_epoch = self._session.run(words_per_epoch)

        print("Words per epoch: ", opts.words_per_epoch)

        # Declare all variables we need.
        # Input words embedding: [vocab_size, emb_dim]
        w_in = tf.Variable(tf.random_uniform([opts.vocab_size, opts.emb_dim],
                                             -0.5 / opts.emb_dim,
                                             0.5 / opts.emb_dim),
                           name="w_in")

        tf.add_to_collection('word_embedding', w_in)

        # Output (softmax) weights: [vocab_size, emb_dim].
        w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]),
                            name="w_out")

        # Global step: []
        global_step = tf.Variable(0, name="global_step")

        # Linear learning rate decay.
        words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
        lr = opts.learning_rate * tf.maximum(
            0.0001,
            1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

        # Training nodes.
        inc = global_step.assign_add(1)
        with tf.control_dependencies([inc]):
            train = word2vec.neg_train_word2vec(
                w_in,
                w_out,
                examples,
                labels,
                lr,
                vocab_count=opts.vocab_counts,
                num_negative_samples=opts.num_samples)

        self._w_in = w_in
        self._examples = examples
        self._labels = labels
        self._lr = lr
        self._train = train
        self.global_step = global_step
        self._epoch = current_epoch
        self._words = total_words_processed
Example #9
# usage: python predict.py text_file model_dir vocab
# sys.stdin will be the image feature file

import sys

import numpy as np
from libword_counter import Vocabulary
import gezi

from deepiu.image_caption.algos import algos_factory

WORDS_SEP = ' '
TEXT_MAX_WORDS = 80
NUM_RESERVED_IDS = 1
ENCODE_UNK = 0
IMAGE_FEATURE_LEN = 1000

vocabulary = Vocabulary(sys.argv[3], NUM_RESERVED_IDS)

algo = 'bow'
predictor = algos_factory.gen_predictor(algo)
predictor.init_predict(TEXT_MAX_WORDS)
predictor.load(sys.argv[2])

ids_list = []
for line in open(sys.argv[1]):
    text = line.strip().split('\t')[-1]
    ids = [
        vocabulary.id(word) for word in text.split(WORDS_SEP)
        if vocabulary.has(word) or ENCODE_UNK
    ]
    ids = gezi.pad(ids, TEXT_MAX_WORDS)
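
Across these examples the `libword_counter.Vocabulary` surface that gets exercised is small: the constructor takes a vocab file path plus a number of reserved ids, and lookups go both ways between words and ids. A short recap, using only the calls that appear above ('vocab.txt' is a placeholder path):

from libword_counter import Vocabulary

vocab = Vocabulary('vocab.txt', 1)   # placeholder path; 1 reserved id (<PAD> at index 0)

print(vocab.size())                  # number of entries, reserved ids included
word = '美女'
if vocab.has(word):
    wid = vocab.id(word)             # word -> id
    print(vocab.key(wid))            # id -> word
    print(vocab.freq(wid))           # corpus count for this entry
print(vocab.start_id(), vocab.end_id(), vocab.unk_id())   # special token ids (see Example #3)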