def init():
  global vocab, vocab_size
  if vocab is None:
    logging.info('vocab:{}'.format(FLAGS.vocab))
    vocab = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)
    vocab_size = vocab.size() if not FLAGS.vocab_size else min(vocab.size(), FLAGS.vocab_size)
    logging.info('vocab_size:{}'.format(vocab_size))
    assert vocab_size > NUM_RESERVED_IDS, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab
def build_graph(self): """Build the graph for the full model.""" opts = self._options self.vocab = Vocabulary(opts.vocab_path, 1) #num resevered ids is 1, <PAD> index 0 opts.vocab_size = self.vocab.size() opts.vocab_counts = [int(self.vocab.freq(i)) for i in xrange(self.vocab.size())] print("Data file: ", opts.train_data) print("Vocab size: ", self.vocab.size()) # The training data. A text file. (words_per_epoch, self._epoch, self._words, examples, labels) = word2vec.skipgram_word2vec(filename=opts.train_data, vocab_count=opts.vocab_counts, batch_size=opts.batch_size, window_size=opts.window_size, min_count=opts.min_count, subsample=opts.subsample) opts.words_per_epoch = self._session.run(words_per_epoch) print("Words per epoch: ", opts.words_per_epoch) self._examples = examples self._labels = labels true_logits, sampled_logits = self.forward(examples, labels) loss = self.nce_loss(true_logits, sampled_logits) tf.summary.scalar("loss", loss) self._loss = loss self.optimize(loss) ##TODO not work for eval # (_, _, _, eval_examples, eval_labels) = word2vec.skipgram_word2vec(filename=opts.eval_data, # vocab_count=opts.vocab_counts, # batch_size=opts.batch_size, #TODO must be same size as train right now # window_size=opts.window_size, # min_count=opts.min_count, # subsample=0) # eval_true_logits, eval_sampled_logits = self.forward(eval_examples, eval_labels) # eval_loss = self.nce_loss(eval_true_logits, eval_sampled_logits) # tf.summary.scalar("eval loss", loss) # self._eval_loss = eval_loss # Properly initialize all variables. tf.global_variables_initializer().run() self.saver = tf.train.Saver()
def init(vocab_path=None):
  global vocab, vocab_size
  if vocab is None:
    if vocab_path is None:
      vocab_path = FLAGS.vocab
    logging.info('vocab:{}'.format(vocab_path))
    logging.info('NUM_RESERVED_IDS:{}'.format(FLAGS.num_reserved_ids))
    vocab = Vocabulary(vocab_path, FLAGS.num_reserved_ids)
    vocab_size = vocab.size() if not FLAGS.vocab_size else min(vocab.size(), FLAGS.vocab_size)
    logging.info('vocab_size:{}'.format(vocab_size))
    assert vocab_size > FLAGS.num_reserved_ids, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab
    logging.info('vocab_start:{} id:{}'.format(vocab.key(vocab.start_id()), vocab.start_id()))
    logging.info('vocab_end:{} id:{}'.format(vocab.key(vocab.end_id()), vocab.end_id()))
    logging.info('vocab_unk:{} id:{}'.format(vocab.key(vocab.unk_id()), vocab.unk_id()))
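# Usage sketch (hypothetical vocab path): ids 0..num_reserved_ids-1 are reserved for
# special tokens such as <PAD>, so real words map to ids >= FLAGS.num_reserved_ids,
# and the `if vocab is None` guard makes repeated calls no-ops.
def _demo_init():
  init('/path/to/vocab.txt')  # hypothetical path
  print(vocab.key(vocab.start_id()))  # special tokens exposed by Vocabulary
  print(vocab.id('美女'))  # a real word maps to an id >= FLAGS.num_reserved_ids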
import sys, os
import multiprocessing
from multiprocessing import Process, Manager, Value

import numpy as np

import melt
import gezi

import nowarning
#from libsegment import *
#need ./data ./conf
#Segmentor.Init()
segmentor = gezi.Segmentor()

from libword_counter import Vocabulary
vocabulary = Vocabulary(FLAGS.vocab)

import conf
from conf import TEXT_MAX_WORDS, IMAGE_FEATURE_LEN

texts = []
text_strs = []

manager = Manager()
texts_dict = manager.dict()
text_strs_dict = manager.dict()

#one independent list per worker; [[]] * n would alias a single shared list
gtexts = [[] for _ in xrange(FLAGS.threads)]
gtext_strs = [[] for _ in xrange(FLAGS.threads)]

#how many records generated
counter = Value('i', 0)
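# Sketch of how the shared state above is meant to be touched from worker processes
# (the worker body itself is not part of this snippet): Value('i', 0) is a
# process-shared int, so increments should hold its lock.
def _count_one_record():
  with counter.get_lock():
    counter.value += 1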
""" import sys, os import numpy as np import melt import gezi #import libgezi import conf from conf import IMAGE_FEATURE_LEN, TEXT_MAX_WORDS, NUM_RESERVED_IDS, ENCODE_UNK segmentor = gezi.Segmentor() from libword_counter import Vocabulary vocabulary = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS) print('vocab:', FLAGS.vocab, file=sys.stderr) assert vocabulary.size() > NUM_RESERVED_IDS print('vocab size:', vocabulary.size(), file=sys.stderr) writer = None if FLAGS.mode != 2: gezi.try_mkdir(FLAGS.output_directory) outfile = '%s/%s_%s' % (FLAGS.output_directory, FLAGS.name, FLAGS.part) print('outfile:', outfile, file=sys.stderr) writer = melt.tfrecords.Writer(outfile) num = 0
# \Description
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys, os
import tensorflow as tf

import melt
from libword_counter import Vocabulary

dir = '/home/gezi/new/temp/makeup/title2name/tfrecord/seq-basic/'
vocab = Vocabulary(os.path.join(dir, 'vocab.txt'), 1)
embsim = melt.EmbeddingSim(os.path.join(dir, 'word2vec'), name='w_in')

wid = vocab.id('曼秀雷敦')
wid_ = tf.placeholder(dtype=tf.int32, shape=[None, 1])
nids_ = embsim.nearby(wid_)

sess = embsim._sess
#nids = sess.run(nids_, {wid_: wid})
values, indices = sess.run(nids_, {wid_: [[wid]]})
for index, value in zip(indices[0], values[0]):
  print(vocab.key(int(index)), value)
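# melt.EmbeddingSim's internals are not shown here; a common way to build such a
# `nearby` op (a sketch, assuming cosine similarity over L2-normalized embeddings):
def nearby_sketch(emb, word_ids, k=10):
  norm_emb = tf.nn.l2_normalize(emb, 1)  # [vocab_size, emb_dim], unit-length rows
  query = tf.gather(norm_emb, tf.reshape(word_ids, [-1]))  # [batch, emb_dim]
  sim = tf.matmul(query, norm_emb, transpose_b=True)  # cosine similarity to all words
  return tf.nn.top_k(sim, k)  # (values, indices), as unpacked by sess.run above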
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

import gezi
import libsegment

seg = gezi.Segmentor()

print('\t'.join(seg.Segment('美女一定要支持')))
print('\x01'.join(seg.Segment('Oh q the same thing to me')))
print('\x01'.join(seg.Segment('Oh q the same thing to me', 'phrase_single')))
print('\x01'.join(seg.Segment('Oh q the same thing to me', 'phrase')))
print('\t'.join(seg.Segment('绿鹭')))
print('\t'.join(seg.segment('绿鹭')))
print('\t'.join(seg.segment_phrase('绿鹭')))
print('\t'.join(gezi.seg.Segment('绿鹭', libsegment.SEG_NEWWORD)))
print('\t'.join(gezi.seg.Segment('绿鹭')))
print('|'.join(gezi.segment_char('a baby is looking at 我的小伙伴oh 我不no no没关系 是不是 tian, that not ')))

from libword_counter import Vocabulary
v = Vocabulary('/home/gezi/temp/textsum/tfrecord/seq-basic.10w/train/vocab.txt', 2)
print(v.id('美女'))
print(v.key(v.id('美女')))
def build_graph(self): """Build the model graph.""" opts = self._options self.vocab = Vocabulary(opts.vocab_path, 1) #num resevered ids is 1, <PAD> index 0 opts.vocab_size = self.vocab.size() opts.vocab_counts = [ int(self.vocab.freq(i)) for i in xrange(self.vocab.size()) ] print("Data file: ", opts.train_data) print("Vocab size: ", self.vocab.size()) # The training data. A text file. (words_per_epoch, current_epoch, total_words_processed, examples, labels) = word2vec.skipgram_word2vec(filename=opts.train_data, vocab_count=opts.vocab_counts, batch_size=opts.batch_size, window_size=opts.window_size, min_count=opts.min_count, subsample=opts.subsample) opts.words_per_epoch = self._session.run(words_per_epoch) print("Words per epoch: ", opts.words_per_epoch) # Declare all variables we need. # Input words embedding: [vocab_size, emb_dim] w_in = tf.Variable(tf.random_uniform([opts.vocab_size, opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim), name="w_in") tf.add_to_collection('word_embedding', w_in) # Global step: scalar, i.e., shape []. w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out") # Global step: [] global_step = tf.Variable(0, name="global_step") # Linear learning rate decay. words_to_train = float(opts.words_per_epoch * opts.epochs_to_train) lr = opts.learning_rate * tf.maximum( 0.0001, 1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train) # Training nodes. inc = global_step.assign_add(1) with tf.control_dependencies([inc]): train = word2vec.neg_train_word2vec( w_in, w_out, examples, labels, lr, vocab_count=opts.vocab_counts, num_negative_samples=opts.num_samples) self._w_in = w_in self._examples = examples self._labels = labels self._lr = lr self._train = train self.global_step = global_step self._epoch = current_epoch self._words = total_words_processed
#python predict.py text_file model_dir vocab
#sys.stdin will be image feature file
import sys

import numpy as np

from libword_counter import Vocabulary

import gezi
from deepiu.image_caption.algos import algos_factory

WORDS_SEP = ' '
TEXT_MAX_WORDS = 80
NUM_RESERVED_IDS = 1
ENCODE_UNK = 0
IMAGE_FEATURE_LEN = 1000

vocabulary = Vocabulary(sys.argv[3], NUM_RESERVED_IDS)

algo = 'bow'
predictor = algos_factory.gen_predictor(algo)
predictor.init_predict(TEXT_MAX_WORDS)
predictor.load(sys.argv[2])

ids_list = []
for line in open(sys.argv[1]):
  line = line.strip().split('\t')[-1]
  words = line.split(WORDS_SEP)
  ids = [vocabulary.id(word) for word in words
         if vocabulary.has(word) or ENCODE_UNK]
  ids = gezi.pad(ids, TEXT_MAX_WORDS)
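# gezi.pad is called as pad(ids, max_len); assuming the usual truncate-or-pad
# semantics with 0 as the reserved <PAD> id, it behaves like:
def pad_sketch(ids, max_len, pad_id=0):
  ids = ids[:max_len]
  return ids + [pad_id] * (max_len - len(ids))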