def predict(predictor, image_path):
    """Print the ``text`` and ``beam_text`` outputs of the model for one image.

    The image at ``image_path`` is converted to a feature with
    ``image_model.process_one_image`` and fed to the graph twice: first
    fetching ``text``/``text_score``, then ``beam_text``/``beam_text_score``.
    Everything is printed; nothing is returned.
    """
    feature_tensor = 'show_and_tell/model_init_1/image_feature:0'

    # This first timer also covers the image feature extraction.
    decode_timer = gezi.Timer()
    image_feature = image_model.process_one_image(image_path)
    decoded, _ = predictor.inference(
        ['text', 'text_score'],
        feed_dict={feature_tensor: image_feature})
    for ids in decoded:
        print(ids, text2ids.ids2text(ids),
              'decode time(ms):', decode_timer.elapsed_ms())

    beam_timer = gezi.Timer()
    beam_texts, beam_scores = predictor.inference(
        ['beam_text', 'beam_text_score'],
        feed_dict={feature_tensor: image_feature})
    # Only the first entry of each fetched result is inspected.
    first_texts = beam_texts[0]
    first_scores = beam_scores[0]
    for ids, ids_score in zip(first_texts, first_scores):
        print(ids, text2ids.ids2text(ids), ids_score)
    print('beam_search using time(ms):', beam_timer.elapsed_ms())
def predict(predictor, input_text):
    """Print the ``text`` and ``beam_text`` outputs of the model for one text.

    ``input_text`` is converted to word ids with ``_text2ids`` and fed to the
    graph twice: first fetching ``text``/``text_score``, then
    ``beam_text``/``beam_text_score``. Everything is printed; nothing is
    returned.
    """
    input_tensor = 'seq2seq/model_init_1/input_text:0'

    word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
    print('word_ids', word_ids, 'len:', len(word_ids))
    print(text2ids.ids2text(word_ids))

    # Debugging note kept from the original author: running this under tfdbg
    # failed with "ValueError: Causality violated in timing relations of debug
    # dumps" on the tf.while_loop nodes of the decoder. See
    # https://github.com/tensorflow/tensorflow/issues/8337 - the suggestions
    # there are to set parallel_iterations=1 on the while_loop, and it may be
    # GPU specific (reported fine with CUDA_VISIBLE_DEVICES=-1).
    decode_timer = gezi.Timer()
    decoded, _ = predictor.inference(
        ['text', 'text_score'],
        feed_dict={input_tensor: [word_ids]})
    for ids in decoded:
        print(ids, text2ids.ids2text(ids),
              'decode time(ms):', decode_timer.elapsed_ms())

    beam_timer = gezi.Timer()
    beam_texts, beam_scores = predictor.inference(
        ['beam_text', 'beam_text_score'],
        feed_dict={input_tensor: [word_ids]})
    # Only the first entry of each fetched result is inspected.
    first_texts = beam_texts[0]
    first_scores = beam_scores[0]
    for ids, ids_score in zip(first_texts, first_scores):
        print(ids, text2ids.ids2text(ids), ids_score)
    print('beam_search using time(ms):', beam_timer.elapsed_ms())
def predict(predictor, input_text):
    """Run the first two beam-search steps by hand and print what they return.

    Step one feeds the word ids of ``input_text`` and fetches the initial
    state, ids and log-probabilities. Step two feeds those ids, plus the
    first initial state repeated once per id, into the step tensors.
    Nothing is returned; this is a debugging/inspection helper.
    """
    word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
    print('word_ids', word_ids, 'len:', len(word_ids))
    print(text2ids.ids2text(word_ids))

    # ---- first step: encode the input and get the initial beam
    timer = gezi.Timer()
    initial_state, ids, logprobs = predictor.inference([
        'beam_search_initial_state',
        'beam_search_initial_ids',
        'beam_search_initial_logprobs'
        ], feed_dict= {
            tf.get_collection('input_text_feed')[0] : [word_ids]
        })

    print('inital_state_shape', np.shape(initial_state))
    # Original note: [1, beam_size] - presumably the shape of ids/logprobs
    # before the [0] below; TODO confirm.
    ids = ids[0]
    logprobs = logprobs[0]
    print(ids, text2ids.ids2text(ids))
    print('logprob', logprobs)
    print('prob', [math.exp(x) for x in logprobs])
    print('inital_state', initial_state[0])
    print('first step using time(ms):', timer.elapsed_ms())

    # ---- second step: one decoder step for every id from step one
    timer = gezi.Timer()
    input_feed = np.array(ids)
    # Every candidate starts from the same (first) initial state.
    state_feed = np.array([initial_state[0]] * len(ids))
    print('input_feed_shape', np.shape(input_feed))
    print('state_feed_shape', np.shape(state_feed))
    #state_feed = np.array(initial_state)
    state, ids, logprobs = predictor.inference([
        'beam_search_state',
        'beam_search_ids',
        'beam_search_logprobs'
        ], feed_dict= {
            tf.get_collection('beam_search_input_feed')[0] : input_feed,
            tf.get_collection('beam_search_state_feed')[0] : state_feed
        })

    #print(state)
    print(ids)
    print(logprobs)

    # Only the first row of the step output is decoded and shown in detail.
    ids = ids[0]
    logprobs = logprobs[0]
    print(ids, text2ids.ids2text(ids))
    print('logprob', logprobs)
    print('prob', [math.exp(x) for x in logprobs])
    print('state', state[0])
    print('second step using time(ms):', timer.elapsed_ms())
def load_constant(data_npy, sess=None, trainable=False, dtype=None, shape=None, name=None):
    """Create a TF variable initialised from a (possibly large) numpy array.

    ``tf.constant`` is only practical for small data, so the array is loaded
    through ``tf.get_variable`` with a ``tf.constant_initializer`` instead
    (see https://stackoverflow.com/questions/35687678/using-a-pre-trained-word-embedding-word2vec-or-glove-in-tensorflow).

    Results are cached per ``name`` on the function object, so a second call
    with the same name returns the first variable and ignores every other
    argument.

    Args:
      data_npy: a numpy array, or a path string that is passed to ``np.load``.
      sess: unused by the current implementation (kept for compatibility; it
        was only needed by a disabled placeholder-based workaround).
      trainable: forwarded to ``tf.get_variable``.
      dtype: resolved via ``npdtype2tfdtype`` when None.
        NOTE(review): the resolved value is not forwarded to
        ``tf.get_variable`` - confirm whether that is intended.
      shape: variable shape; defaults to ``data_npy.shape``.
      name: variable name and cache key; defaults to ``'constant_data'``.

    Returns:
      The variable created by ``tf.get_variable`` (or the cached one).
    """
    name = name or 'constant_data'

    if not hasattr(load_constant, 'constants'):
        load_constant.constants = {}
    cache = load_constant.constants
    if name in cache:
        return cache[name]

    # isinstance (rather than an exact type check) also accepts str subclasses.
    if isinstance(data_npy, str):
        timer = gezi.Timer('np load %s' % data_npy)
        data_npy = np.load(data_npy)
        timer.print_elapsed()

    if dtype is None:
        dtype = npdtype2tfdtype(data_npy)
    if shape is None:
        shape = data_npy.shape

    # A placeholder-initialised variable (run data.initializer with a feed
    # dict) was tried as a faster alternative and abandoned: kept out of the
    # global collections it is not saved by tf_train_flow, and added to
    # GLOBAL_VARIABLES it makes sess.run(init_op) require the feed.
    # TODO: constant_initializer is sometimes slow, and tf_train_flow's
    # sess.run(init_op) runs it again.
    timer = gezi.Timer('constant_initializer')
    data = tf.get_variable(name,
                           shape=shape,
                           initializer=tf.constant_initializer(data_npy),
                           trainable=trainable)
    cache[name] = data
    timer.print_elapsed()
    return data
def evaluate_score():
    """Build two predictors, run ``predicts`` in batches and print hit ratios.

    A predictor for ``FLAGS.algo2`` and one for ``FLAGS.algo`` are each built
    under a variable scope named after their algo and loaded from their model
    directory. Only the ``FLAGS.algo`` predictor is passed to ``predicts``.
    The module-level ``hit`` and ``total_hit`` counters are read afterwards
    to compute the printed ratios.
    """
    text_max_words = evaluator.all_distinct_texts.shape[1]
    print('text_max_words:', text_max_words)

    with tf.variable_scope(FLAGS.algo2):
        predictor2 = algos.algos_factory.gen_predictor(FLAGS.algo2)
        predictor2.init_predict(text_max_words)
    predictor2.load(FLAGS.model2_dir)

    with tf.variable_scope(FLAGS.algo):
        predictor = algos.algos_factory.gen_predictor(FLAGS.algo)
        predictor.init_predict(text_max_words)
    predictor.load(FLAGS.model_dir)

    timer = gezi.Timer()
    batch_start = 0
    while batch_start < FLAGS.num_examples:
        # The last batch is clipped to the number of examples.
        batch_end = batch_start + FLAGS.batch_size
        if batch_end > FLAGS.num_examples:
            batch_end = FLAGS.num_examples
        print('predicts start:', batch_start, 'end:', batch_end, file=sys.stderr)
        predicts(predictor, batch_start, batch_end)
        batch_start = batch_end
    print('using time:', timer.elapsed())

    hit_ratio = hit / FLAGS.num_examples
    total_hit_ratio = total_hit / (FLAGS.num_examples * FLAGS.topn)
    for label, value in (('num_hits:', hit),
                         ('num_total_hits:', total_hit),
                         ('hit_ratio:', hit_ratio),
                         ('total_hit_ratio:', total_hit_ratio)):
        print(label, value)
def run(input, count=1):
    """Tokenize the ``comment_text`` column of a CSV and count words and chars.

    The CSV is read into the global ``df``; the global ``context_tokens_list``
    is created as a manager list with one slot per comment. ``tokenize`` is
    then mapped over ``range(FLAGS.threads)`` in a process pool (presumably
    each worker fills its share of ``context_tokens_list`` - verify against
    ``tokenize``). Finally every comment adds ``count`` to ``counter`` for
    START_WORD, END_WORD and each distinct token, and to ``char_counter`` for
    each character of those tokens.
    """
    global df, context_tokens_list
    df = pd.read_csv(input)
    num_comments = len(df['comment_text'])
    context_tokens_list = man.list([None] * num_comments)

    tokenize_timer = gezi.Timer('tokenize')
    workers = mp.Pool()
    workers.map(tokenize, range(FLAGS.threads))
    workers.close()
    workers.join()
    tokenize_timer.print_elapsed()

    for tokens in context_tokens_list:
        counter.add(START_WORD, count)
        # A token is counted once per comment, however often it repeats.
        for word in set(tokens):
            counter.add(word, count)
            for char in word:
                char_counter.add(char, count)
        counter.add(END_WORD, count)
def run():
    """Rank all candidate texts for every image in ``FLAGS.image_file``.

    Image features are read from the tab-separated files matching
    ``FLAGS.image_feature_pattern`` (first column: image name, last column:
    feature values joined by ``\\x01``). For each image an HTML snippet and the
    top 50 texts by ``predicts`` score are printed to stdout, and one
    progress line with the image index and elapsed time goes to stderr.

    The process exits as soon as an image without a feature is met.

    Changes from the earlier version:
      * input files are closed deterministically;
      * the stderr line reports the image index (the ranking loop used to
        overwrite it with the last rank);
      * the candidate texts are loaded once instead of once per image, so
        the reported time no longer includes loading them.
    """
    image2feature = {}
    for path in glob.glob(FLAGS.image_feature_pattern):
        with open(path) as feature_file:
            for line in feature_file:
                fields = line.strip().split('\t')
                image2feature[fields[0]] = fields[-1]

    # Loop invariant: the candidate texts do not depend on the image.
    word_ids_list = np.load(FLAGS.all_texts)
    all_text_strs = np.load(FLAGS.all_text_strs)

    topn = 50
    with open(FLAGS.image_file) as image_file:
        for image_index, line in enumerate(image_file):
            image = line.strip()
            if image not in image2feature:
                print('image not find in ', FLAGS.image_feature_pattern)
                exit(0)

            image_feature = [float(x) for x in image2feature[image].split('\x01')]

            timer = gezi.Timer()
            scores = predicts([image_feature], word_ids_list)
            print(img_html.format(image))

            # Negate so that argsort yields the highest scores first.
            indexes = (-scores).argsort()[:topn]
            for rank, index in enumerate(indexes):
                print(rank, all_text_strs[index], scores[index])
                print('<br>')

            print(image_index, image, timer.elapsed(), file=sys.stderr)
def run():
    """Rank vocabulary entries for every image in ``FLAGS.image_file``.

    Image features are read from the tab-separated files matching
    ``FLAGS.image_feature_pattern`` (first column: image name, last column:
    feature values joined by ``\\x01``). Images without a feature are skipped.
    For each remaining image an HTML snippet and the best-scoring vocabulary
    entries are printed to stdout, and one progress line with the image
    index and elapsed time goes to stderr.

    Among the top 50 scores, indexes above 20000 and one hard-coded
    vocabulary key are dropped; printed ranks count only the kept entries.

    Changes from the earlier version:
      * input files are closed deterministically;
      * the stderr line reports the image index (the ranking loop used to
        overwrite it);
      * the candidate ids are loaded once instead of once per image, so the
        reported time no longer includes loading them;
      * the unused load of ``FLAGS.all_text_strs`` is gone.
    """
    image2feature = {}
    for path in glob.glob(FLAGS.image_feature_pattern):
        with open(path) as feature_file:
            for line in feature_file:
                fields = line.strip().split('\t')
                image2feature[fields[0]] = fields[-1]

    # Loop invariant: the candidate ids do not depend on the image.
    word_ids_list = np.load(FLAGS.all_texts)
    vocab = text2ids.vocab

    topn = 50
    with open(FLAGS.image_file) as image_file:
        for image_index, line in enumerate(image_file):
            image = line.strip()
            if image not in image2feature:
                continue

            image_feature = [float(x) for x in image2feature[image].split('\x01')]

            timer = gezi.Timer()
            scores = predicts([image_feature], word_ids_list)
            print(img_html.format(image))

            # Negate so that argsort yields the highest scores first.
            indexes = (-scores).argsort()[:topn]
            rank = 0
            for index in indexes:
                if index > 20000:
                    continue
                word = vocab.key(int(index))
                if word == '±ûÏ©Ëá':
                    continue
                print(rank, word, scores[index])
                print('<br>')
                rank += 1

            print(image_index, image, timer.elapsed(), file=sys.stderr)
def gen_input(self, train_only=False):
    """Build the train (and optionally validation) inputs.

    Returns a dict keyed by ``self.input_train_name``,
    ``self.input_valid_name`` and ``self.fixed_input_valid_name``. Entries
    that are not generated stay None. Validation inputs are only built when
    ``train_only`` is false and ``FLAGS.valid_input`` is set; in that case
    ``self.train_with_validation`` is also updated.
    """
    timer = gezi.Timer('gen input')

    assert not (FLAGS.feed_dict and FLAGS.dynamic_batch_length), \
        'if use feed dict then must use fixed batch length, or use buket mode(@TODO)'

    # Every known input starts out as None.
    input_results = dict.fromkeys([
        self.input_train_name,
        self.input_valid_name,
        self.fixed_input_valid_name,
    ])

    inputs, decode = input.get_decodes(FLAGS.shuffle_then_decode,
                                       FLAGS.dynamic_batch_length)

    train_input, _trainset = self.gen_train_input(inputs, decode)
    input_results[self.input_train_name] = train_input

    if not train_only:
        # ---------------------- valid
        train_with_validation = bool(FLAGS.valid_input)
        self.train_with_validation = train_with_validation
        print('train_with_validation:', train_with_validation)
        if train_with_validation:
            valid_input, fixed_valid_input, _eval_batch_size = \
                self.gen_valid_input(inputs, decode)
            input_results[self.input_valid_name] = valid_input
            input_results[self.fixed_input_valid_name] = fixed_valid_input

    print_input_results(input_results)
    timer.print()
    return input_results
def load_graph(self, frozen_graph_file, frozen_graph_name='prefix', frozen_map_file=None):
    """Import a frozen GraphDef into the session graph.

    Args:
      frozen_graph_file: path of the serialized ``GraphDef`` protobuf.
      frozen_graph_name: name prefix given to every imported node.
      frozen_map_file: optional tab-separated file with one
        ``collection_name<TAB>tensor_key`` pair per line. Each tensor is
        looked up under ``frozen_graph_name`` and added to the named
        collection. A key without a ``:0``/``:1``/``:2`` suffix gets ``:0``.
        Ignored when None or when the path does not exist.

    Returns:
      The session graph the definition was imported into.
    """
    timer = gezi.Timer('load frozen graph from %s with mapfile %s' %
                       (frozen_graph_file, frozen_map_file))

    # Load the protobuf file from disk and parse it into a graph_def.
    with tf.gfile.GFile(frozen_graph_file, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    # Import the graph_def into the session's graph.
    with self.sess.graph.as_default() as graph:
        tf.import_graph_def(
            graph_def,
            input_map=None,
            return_elements=None,
            name=frozen_graph_name,
            producer_op_list=None
        )

        if frozen_map_file is not None and os.path.exists(frozen_map_file):
            # `with` closes the map file; iterating a bare open() leaked it.
            with open(frozen_map_file) as map_file:
                for line in map_file:
                    line = line.strip()
                    if not line:
                        # A blank (e.g. trailing) line used to crash the unpack.
                        continue
                    cname, key = line.split('\t')
                    if not key.endswith((':0', ':1', ':2')):
                        key = '%s:0' % key
                    tensor = graph.get_tensor_by_name(
                        '%s/%s' % (frozen_graph_name, key))
                    graph.add_to_collection(cname, tensor)

    timer.print_elapsed()
    return graph
def restore_fn(sess):
    """Restore the image model variables into ``sess`` and report the time.

    Uses ``saver``, ``image_model_name`` and ``image_checkpoint_file`` from
    the enclosing scope.
    """
    timer_name = 'restore image var from %s %s' % (image_model_name,
                                                    image_checkpoint_file)
    timer = gezi.Timer(timer_name)
    logging.info("Restoring image variables from checkpoint file %s",
                 image_checkpoint_file)
    saver.restore(sess, image_checkpoint_file)
    timer.print()
def predicts(imgs, img_features, predictor, rank_metrics):
    """Score images against candidate texts and record ranking labels.

    The candidates come from the module-level ``all_distinct_texts``. When
    ``FLAGS.max_texts`` is positive and smaller than the number of texts, a
    random subset of that size is drawn without replacement. Scoring is done
    with ``predictor.bulk_predict`` in chunks of at most
    ``FLAGS.metric_eval_texts_size`` texts (when that flag is positive), and
    the chunk scores are concatenated along axis 1.

    For every image the texts are ordered by descending score and a boolean
    label per position (is this text a hit for the image?) is passed to
    ``rank_metrics.add``.

    Args:
      imgs: image keys used to look up hits in the image-to-text label map.
      img_features: features passed to ``predictor.bulk_predict``.
      predictor: object providing ``bulk_predict(img_features, texts)``.
      rank_metrics: accumulator providing ``add(labels)``.
    """
    timer = gezi.Timer('preidctor.bulk_predict')
    # TODO gpu outofmem predict for showandtell

    # Named so it does not shadow the `random` module name.
    use_random_subset = True
    sampled_index = None
    if FLAGS.max_texts > 0 and len(all_distinct_texts) > FLAGS.max_texts:
        if use_random_subset:
            sampled_index = np.random.choice(len(all_distinct_texts),
                                             FLAGS.max_texts,
                                             replace=False)
            texts = all_distinct_texts[sampled_index]
        else:
            texts = all_distinct_texts[:FLAGS.max_texts]
    else:
        texts = all_distinct_texts

    # Chunk size for bulk_predict; defaults to everything in one go.
    step = len(texts)
    if 0 < FLAGS.metric_eval_texts_size < step:
        step = FLAGS.metric_eval_texts_size

    start = 0
    scores = []
    while start < len(texts):
        end = min(start + step, len(texts))
        print('predicts texts start:', start, 'end:', end, end='\r', file=sys.stderr)
        scores.append(predictor.bulk_predict(img_features, texts[start:end]))
        start = end
    score = np.concatenate(scores, 1)
    print('image_feature_shape:', img_features.shape,
          'text_feature_shape:', texts.shape,
          'score_shape:', score.shape)
    timer.print()

    img2text = get_bidrectional_lable_map()
    # All positions are labelled: a truncated list only works for recall@ or
    # precision@, ndcg@ needs every position.
    num_positions = texts.shape[0]

    for i, img in enumerate(imgs):
        # Negate so that argsort yields the highest scores first.
        indexes = (-score[i]).argsort()
        hits = img2text[img]
        # `range` instead of the Python-2-only `xrange`.
        if sampled_index is None:
            labels = [indexes[j] in hits for j in range(num_positions)]
        else:
            # Map positions in the sampled subset back to original text ids.
            labels = [sampled_index[indexes[j]] in hits
                      for j in range(num_positions)]
        rank_metrics.add(labels)
def init_spacy_full():
    """Load the spaCy model into the global ``full_nlp`` if it is still None."""
    import spacy
    global full_nlp
    if full_nlp is not None:
        return

    model_dir = '/usr/local/lib/python3.5/dist-packages/spacy/data/en_core_web_md-2.0.0/'
    load_timer = gezi.Timer('load spacy model')
    full_nlp = spacy.load(model_dir)
    load_timer.print_elapsed()
def get_or_restore_embedding(name='emb', embedding_file=None, trainable=None, height=None, emb_dim=None, type='word'):
    """Return an embedding that is either freshly created or loaded from file.

    ``type`` ('word', 'char', 'ngram' or 'pinyin') selects which
    ``FLAGS.<type>_embedding_file`` / ``FLAGS.finetune_<type>_embedding``
    pair supplies the defaults for ``embedding_file`` and ``trainable`` when
    those arguments are None. Any other ``type`` raises ``ValueError``.

    The embedding is created with ``get_embedding`` when no embedding file
    is configured or when a model already exists in ``FLAGS.model_dir``.
    Otherwise it is loaded from the file with ``melt.load_constant``.
    """
    # (type, FLAGS attribute with the file, FLAGS attribute with the finetune switch)
    flag_names = (
        ('word', 'word_embedding_file', 'finetune_word_embedding'),
        ('char', 'char_embedding_file', 'finetune_char_embedding'),
        ('ngram', 'ngram_embedding_file', 'finetune_ngram_embedding'),
        ('pinyin', 'pinyin_embedding_file', 'finetune_pinyin_embedding'),
    )
    for kind, file_flag, finetune_flag in flag_names:
        if type == kind:
            # Only the flags of the selected type are read.
            default_file = getattr(FLAGS, file_flag)
            default_trainable = getattr(FLAGS, finetune_flag)
            break
    else:
        raise ValueError(type)

    # Explicit arguments win over the flag defaults.
    if embedding_file is None:
        embedding_file = default_file
    if trainable is None:
        trainable = default_trainable

    # Original author's assumption (marked TODO verify): a variable that is
    # in the checkpoint is restored later and overrides the initial value.
    if (not embedding_file) or melt.exists_model(FLAGS.model_dir):
        logging.info(
            '{} random init or from model_dir and trainable=:{}'.format(
                name, trainable))
        return get_embedding(name=name,
                             trainable=trainable,
                             height=height,
                             emb_dim=emb_dim)

    # https://github.com/tensorflow/tensorflow/issues/1570
    # Original notes: adagrad still has to run on cpu. Open question whether
    # a finetune restart should still use the embedding file.
    # TODO to be safe, re-run with word_embedding_file set to None or ''.
    logging.info('Loading {} from:{} and trainable=:{}'.format(
        name, embedding_file, trainable))
    load_timer = gezi.Timer('load constat')
    emb = melt.load_constant(embedding_file, name=name, trainable=trainable)
    load_timer.print_elapsed()
    return emb
def predicts(predictor, input_texts, texts):
    """Print scores, exact scores and exact probabilities for text batches.

    Both text lists are converted to word ids and fed through the tensors
    named by ``FLAGS.input_text_name`` and ``FLAGS.text_name``. Three
    separate inference calls are timed and their results printed.
    """
    input_word_ids_list = [
        _text2ids(one_text, INPUT_TEXT_MAX_WORDS) for one_text in input_texts
    ]
    word_ids_list = [
        _text2ids(one_text, INPUT_TEXT_MAX_WORDS) for one_text in texts
    ]

    print(input_word_ids_list)
    print(word_ids_list)

    def infer(fetches):
        # Every call uses the same pair of feeds.
        return predictor.inference(fetches, feed_dict={
            FLAGS.input_text_name: input_word_ids_list,
            FLAGS.text_name: word_ids_list,
        })

    timer = gezi.Timer()
    score = infer(['score'])
    print('score:', score)
    print('calc score time(ms):', timer.elapsed_ms())

    # TODO FIXME (original note): not work... Incompatible shapes: [8] vs. [2,4]
    timer = gezi.Timer()
    exact_score = infer(['exact_score'])
    print('exact_score:', exact_score)
    print('calc score time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    exact_prob, logprobs = infer(['exact_prob', 'seq2seq_logprobs'])
    print(exact_prob)
    print(logprobs)
    print('calc prob time(ms):', timer.elapsed_ms())
def predicts(predictor, input_texts, texts):
    """Print scores, exact scores and exact probabilities for text batches.

    Both text lists are converted to word ids and fed through the last
    tensors of the ``lfeed`` and ``rfeed`` graph collections. Three separate
    inference calls are timed and their results printed.
    """
    input_word_ids_list = [
        _text2ids(one_text, INPUT_TEXT_MAX_WORDS) for one_text in input_texts
    ]
    word_ids_list = [
        _text2ids(one_text, INPUT_TEXT_MAX_WORDS) for one_text in texts
    ]

    def infer(fetches):
        # The feed tensors are looked up again on every call.
        return predictor.inference(fetches, feed_dict={
            tf.get_collection('lfeed')[-1]: input_word_ids_list,
            tf.get_collection('rfeed')[-1]: word_ids_list,
        })

    timer = gezi.Timer()
    print(tf.get_collection('score'))
    score = infer('score')
    print('score:', score)
    print('calc score time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    exact_score = infer('exact_score')
    print('exact_score:', exact_score)
    print('calc score time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    exact_prob, logprobs = infer(['exact_prob', 'seq2seq_logprobs'])
    print(exact_prob)
    print(logprobs)
    print('calc prob time(ms):', timer.elapsed_ms())
def predict(predictor, input_text, text):
    """Print score, exact score and exact probability for one text pair.

    Both texts are converted to word ids and fed, each as a batch of one,
    through the tensors named by ``FLAGS.input_text_name`` and
    ``FLAGS.text_name``. Three separate inference calls are timed.
    """
    input_word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
    print('input_word_ids', input_word_ids, 'len:', len(input_word_ids))
    print(text2ids.ids2text(input_word_ids))

    word_ids = _text2ids(text, INPUT_TEXT_MAX_WORDS)
    print('word_ids', word_ids, 'len:', len(word_ids))
    print(text2ids.ids2text(word_ids))

    def infer(fetches):
        # Every call uses the same pair of single-example feeds.
        return predictor.inference(fetches, feed_dict={
            FLAGS.input_text_name: [input_word_ids],
            FLAGS.text_name: [word_ids],
        })

    timer = gezi.Timer()
    score = infer(['score'])
    print('score:', score)
    print('calc score time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    exact_score = infer(['exact_score'])
    print('exact_score:', exact_score)
    print('calc score time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    batch_prob, batch_logprobs = infer(['exact_prob', 'seq2seq_logprobs'])
    # Unwrap the single example of the batch.
    exact_prob = batch_prob[0]
    logprobs = batch_logprobs[0]
    print('exact_prob:', exact_prob, 'ecact_logprob:', math.log(exact_prob))
    print('logprobs:', logprobs)
    print('sum_logprobs:', gezi.gen_sum_list(logprobs))
    print('calc prob time(ms):', timer.elapsed_ms())
def load(self, model_dir, var_list=None, model_name=None, sess = None):
    """Restore variables from a checkpoint into the session.

    Only variables are loaded, so the graph has to exist before this is
    called. When ``sess`` is given it replaces ``self.sess``. The resolved
    checkpoint path is stored in ``self.model_path``.

    Returns:
      The session the variables were restored into.
    """
    if sess is not None:
        self.sess = sess

    model_path = melt.get_model_path(model_dir, model_name)
    self.model_path = model_path

    load_timer = gezi.Timer('load model ok %s' % model_path)
    melt.restore_from_path(self.sess, model_path, var_list)
    load_timer.print()
    return self.sess
def get_image_names_and_features():
    """Return the distinct image names and features, loading them on first use.

    The arrays are read from ``distinct_image_names.npy`` and
    ``distinct_image_features.npy`` in ``FLAGS.valid_resource_dir``, passed
    through ``hack_image_features`` and kept in module-level globals so that
    later calls return them directly.
    """
    global image_names, image_features
    if image_names is not None:
        return image_names, image_features

    resource_dir = FLAGS.valid_resource_dir
    image_feature_bin = os.path.join(resource_dir, 'distinct_image_features.npy')
    image_name_bin = os.path.join(resource_dir, 'distinct_image_names.npy')

    load_timer = gezi.Timer('get_image_names_and_features')
    image_names = np.load(image_name_bin)
    image_features = hack_image_features(np.load(image_feature_bin))
    print('all_distinct_images len:', len(image_features), file=sys.stderr)
    load_timer.print()

    return image_names, image_features
def main(_):
    """Read every record of the selected input files and check id uniqueness.

    The input file list is built from ``FLAGS.input`` ('valid' is replaced by
    'test' in the path when ``FLAGS.type`` is 'test'), minus the files whose
    name ends with ``'<fold>.record'`` when ``FLAGS.fold`` is set. The
    batches are iterated once; the total and the distinct number of ids are
    printed at the end.
    """
    base = FLAGS.base
    logging.set_logging_path('./mount/tmp/')
    # vocab.txt is looked up two directory levels above FLAGS.input.
    vocab_path = os.path.join(os.path.dirname(os.path.dirname(FLAGS.input)), 'vocab.txt')
    ids2text.init(vocab_path)
    FLAGS.vocab = f'{base}/vocab.txt'

    tf.set_random_seed(FLAGS.random_seed)

    # FLAGS.length_index = 2
    # FLAGS.buckets = '100,400'
    # FLAGS.batch_sizes = '64,64,32'

    input_ = FLAGS.input
    if FLAGS.type == 'test':
        input_ = input_.replace('valid', 'test')

    inputs = gezi.list_files(input_)
    inputs.sort()
    if FLAGS.fold is not None:
        # Drop the files that belong to the held-out fold.
        inputs = [
            x for x in inputs if not x.endswith('%d.record' % FLAGS.fold)
        ]

    print('type', FLAGS.type, 'inputs', inputs, file=sys.stderr)

    #dataset = Dataset('valid')
    dataset = Dataset('train')

    # balance pos neg tested ok
    dataset = dataset.make_batch(FLAGS.batch_size_, inputs, repeat=False)

    print('dataset', dataset)

    ids = []

    # NOTE(review): this timer is created but its elapsed time is never printed.
    timer = gezi.Timer('read record')
    for i, (x, y) in enumerate(dataset):
        # Disabled debug output (every 10th batch): the first passage and
        # candidate_pos / candidate_neg decoded with ids2text.ids2text(...,
        # sep='|'), the raw tensors, and a check that ids are bytes.
        for id in x['id'].numpy():
            ids.append(id)
        print(i, x['type'].numpy())

    # Equal numbers mean that no id occurs twice.
    print(len(ids), len(set(ids)))
def predicts(predictor, input_texts):
    """Print every ``beam_text`` result, with its score, for a batch of texts.

    The texts are converted to word ids and fed through the last tensor of
    the ``lfeed`` graph collection. For each result the ids, their text
    form, the score and ``math.log`` of the score are printed.
    """
    word_ids_list = [
        _text2ids(one_text, INPUT_TEXT_MAX_WORDS) for one_text in input_texts
    ]

    timer = gezi.Timer()
    input_feed = tf.get_collection('lfeed')[-1]
    texts_list, scores_list = predictor.inference(
        ['beam_text', 'beam_text_score'],
        feed_dict={input_feed: word_ids_list})

    for beam_texts, beam_scores in zip(texts_list, scores_list):
        for ids, ids_score in zip(beam_texts, beam_scores):
            print(ids, text2ids.ids2text(ids), ids_score, math.log(ids_score))

    print('beam_search using time(ms):', timer.elapsed_ms())
def run():
    """Run ``predicts`` over ``FLAGS.num_images`` images in batches.

    A ``melt.Predictor`` is created from ``FLAGS.model_dir``; the batch
    bounds go to stderr and the total elapsed time is printed at the end.
    """
    predictor = melt.Predictor(FLAGS.model_dir)
    logging.info('model:%s' % predictor.model_path)

    batch_start = 0
    timer = gezi.Timer()
    while batch_start < FLAGS.num_images:
        # The last batch is clipped to the number of images.
        batch_end = min(FLAGS.num_images, batch_start + FLAGS.batch_size)
        print('predicts start:', batch_start, 'end:', batch_end, file=sys.stderr)
        predicts(predictor, batch_start, batch_end)
        batch_start = batch_end
    print('time:', timer.elapsed())
def main(_):
    """Build the prediction graph, load the model and decode sample inputs.

    The predictor for ``FLAGS.algo`` is created under an optional global
    variable scope and ``FLAGS.main_scope``. Each hard-coded input text is
    then decoded twice with the global session: once through the
    ``text``/``score`` tensors and once through the beam tensors. Results and
    timings are printed.
    """
    text2ids.init()
    # The global scope defaults to the algo name when none is configured.
    global_scope = ''
    if FLAGS.add_global_scope:
        global_scope = FLAGS.global_scope if FLAGS.global_scope else FLAGS.algo

    global sess
    sess = melt.get_session(log_device_placement=FLAGS.log_device_placement)
    with tf.variable_scope(global_scope):
        predictor = algos_factory.gen_predictor(FLAGS.algo)
        with tf.variable_scope(FLAGS.main_scope) as scope:
            text, score, beam_text, beam_score = gen_predict_graph(predictor, scope)

    predictor.load(FLAGS.model_dir)
    # NOTE(review): the sample inputs below contain undecodable bytes in this
    # copy of the file - presumably text in a legacy encoding; confirm the
    # file encoding before editing them.
    #input_text = "������������_��������ǰ��Ա���Ƭ"
    input_texts = ['���������һ�Ը�Ů�ڿ�����ջ�����˿¶�δ���������ڿ�Ů��-�Ա���', '����̫����ô����', '����������ʵ��С��ô��,����������ʵ��С���δ�ʩ', '����ף�Ŀǰ4����1�굶']

    for input_text in input_texts:
        word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)

        print(word_ids)
        print(text2ids.ids2text(word_ids))

        # Decode through the text/score tensors, as a batch of one.
        timer = gezi.Timer()
        text_, score_ = sess.run([text, score], {predictor.input_text_place : [word_ids]})
        print(text_[0], text2ids.ids2text(text_[0]), score_[0], 'time(ms):', timer.elapsed_ms())

        # Decode through the beam tensors; only the first entry is inspected.
        timer = gezi.Timer()
        texts, scores = sess.run([beam_text, beam_score], {predictor.input_text_place : [word_ids]})

        texts = texts[0]
        scores = scores[0]
        for text_, score_ in zip(texts, scores):
            print(text_, text2ids.ids2text(text_), score_)

        print('beam_search using time(ms):', timer.elapsed_ms())
def predict(predictor, input_text, text):
    """Print score, exact score and exact probability for one text pair.

    Both texts are converted to word ids and fed, each as a batch of one,
    through the last tensors of the ``lfeed`` and ``rfeed`` graph
    collections. Three separate inference calls are timed.
    """
    input_word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
    word_ids = _text2ids(text, INPUT_TEXT_MAX_WORDS)

    def infer(fetch):
        # The feed tensors are looked up again on every call.
        return predictor.inference(fetch, feed_dict={
            tf.get_collection('lfeed')[-1]: [input_word_ids],
            tf.get_collection('rfeed')[-1]: [word_ids],
        })

    timer = gezi.Timer()
    score = infer('score')
    print('score:', score)
    print('calc score time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    exact_score = infer('exact_score')
    print('exact_score:', exact_score)
    print('calc score time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    exact_prob = infer('exact_prob')
    print('exact_prob:', exact_prob)
    print('calc score time(ms):', timer.elapsed_ms())
def get_image_names_and_features():
    """Return the image names and features, loading them on first use.

    When both ``FLAGS.image_name_bin`` and ``FLAGS.image_feature_bin`` are
    set, the arrays are read with ``np.load``. Otherwise
    ``FLAGS.image_feature_file`` is parsed as tab-separated text: the first
    column is the image name and the next ``IMAGE_FEATURE_LEN`` columns are
    the float features.

    The result is kept in the module-level globals ``image_names`` and
    ``image_features``. Both are assigned only after loading has fully
    succeeded, so a failed load cannot leave names cached without features.

    Returns:
      Tuple ``(image_names, image_features)`` of numpy arrays.
    """
    global image_names, image_features
    if image_names is None:
        timer = gezi.Timer('get_image_names_and_features')
        if FLAGS.image_name_bin and FLAGS.image_feature_bin:
            names = np.load(FLAGS.image_name_bin)
            features = np.load(FLAGS.image_feature_bin)
        else:
            name_list = []
            feature_list = []
            # `with` closes the file; each line is split once, not twice.
            with open(FLAGS.image_feature_file) as feature_file:
                for line in feature_file:
                    fields = line.split('\t')
                    name_list.append(fields[0])
                    feature_list.append(
                        [float(x) for x in fields[1:1 + IMAGE_FEATURE_LEN]])
            names = np.array(name_list)
            features = np.array(feature_list)
        # Publish both globals together, only after everything succeeded.
        image_names, image_features = names, features
        timer.print()
    return image_names, image_features
def predict(predictor, input_text):
    """Decode one text with ``melt.seq2seq.beam_search`` and print the beams.

    The initial beam tensors are fetched once from the graph. A step function
    running one more inference call is then handed to the beam search, which
    is limited to ``FLAGS.decode_max_words`` words (``TEXT_MAX_WORDS`` when
    the flag is not set).
    """
    word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
    print('word_ids', word_ids, 'len:', len(word_ids))
    print(text2ids.ids2text(word_ids))

    timer = gezi.Timer()

    initial_fetches = [
        'beam_search_beam_size',
        'beam_search_initial_state',
        'beam_search_initial_ids',
        'beam_search_initial_logprobs',
        'beam_search_initial_alignments',
    ]
    init_states = predictor.inference(initial_fetches, feed_dict={
        tf.get_collection('input_text_feed')[0]: [word_ids],
    })

    step_fetches = [
        'beam_search_state',
        'beam_search_ids',
        'beam_search_logprobs',
        # 'beam_search_alignments' must be used here, not 'attention_alignments'.
        'beam_search_alignments',
    ]

    def step_func(input_feed, state_feed):
        # TODO (original note): attention still needs the input_text feed, see
        # rnn_decoder.py beam_search_step. It should not hurt performance
        # much since the encoder is fast; without attention the input_text
        # feed would not be needed.
        return predictor.inference(step_fetches, feed_dict={
            tf.get_collection('input_text_feed')[0]: [word_ids],
            tf.get_collection('beam_search_input_feed')[0]: input_feed,
            tf.get_collection('beam_search_state_feed')[0]: state_feed,
        })

    max_words = FLAGS.decode_max_words or TEXT_MAX_WORDS
    beams = melt.seq2seq.beam_search(init_states,
                                     step_func,
                                     end_id=text2ids.end_id(),
                                     max_words=max_words,
                                     length_normalization_factor=0.)

    for rank, beam in enumerate(beams):
        print(rank, beam.words, text2ids.ids2text(beam.words),
              math.exp(beam.logprob), beam.logprob, beam.score, beam.logprobs)
        print(beam.alignments_list)

    print('beam search using time(ms):', timer.elapsed_ms())
def gen_input(self, train_only=False):
    """Build the train / validation inputs, including negative samples.

    Returns a dict keyed by the five input names of this object (train,
    train-neg, valid, fixed-valid, valid-neg). Entries that are not
    generated stay None. Negative inputs are only built when
    ``input.get_decodes`` returns a negative decode function. Validation
    inputs are only built when ``train_only`` is false and
    ``FLAGS.valid_input`` is set; in that case ``self.train_with_validation``
    is also updated.
    """
    timer = gezi.Timer('gen input')

    assert not (FLAGS.feed_dict and FLAGS.dynamic_batch_length), \
        'if use feed dict then must use fixed batch length, or use buket mode(@TODO)'

    # Every known input starts out as None.
    input_results = dict.fromkeys([
        self.input_train_name,
        self.input_train_neg_name,
        self.input_valid_name,
        self.fixed_input_valid_name,
        self.input_valid_neg_name,
    ])

    assert FLAGS.shuffle_then_decode, "since use sparse data for text, must shuffle then decode"
    inputs, decode_fn, decode_neg_fn = \
        input.get_decodes(use_neg=(FLAGS.num_negs > 0))
    with_negs = decode_neg_fn is not None

    train_input, trainset = self.gen_train_input(inputs, decode_fn)
    input_results[self.input_train_name] = train_input
    if with_negs:
        train_neg_input = self.gen_train_neg_input(inputs, decode_neg_fn, trainset)
        input_results[self.input_train_neg_name] = train_neg_input

    if not train_only:
        # ---------------------- valid
        train_with_validation = bool(FLAGS.valid_input)
        self.train_with_validation = train_with_validation
        print('train_with_validation:', train_with_validation)
        if train_with_validation:
            valid_input, fixed_valid_input, eval_batch_size = \
                self.gen_valid_input(inputs, decode_fn)
            input_results[self.input_valid_name] = valid_input
            input_results[self.fixed_input_valid_name] = fixed_valid_input
            if with_negs:
                valid_neg_input = self.gen_valid_neg_input(
                    inputs, decode_neg_fn, trainset, eval_batch_size)
                input_results[self.input_valid_neg_name] = valid_neg_input

    print_input_results(input_results)
    timer.print()
    return input_results
def doc(text):
    """Run the global spaCy pipeline on ``text`` and return the result.

    The model is loaded into the global ``full_nlp`` on first use. Under
    Python 2 the text is decoded from UTF-8 before it is processed.
    """
    import spacy
    global full_nlp
    if full_nlp is None:
        # TODO FIXME (original note): spacy.load("en") was used here before.
        model_dir = '/usr/local/lib/python3.5/dist-packages/spacy/data/en_core_web_md-2.0.0/'
        load_timer = gezi.Timer('load spacy model')
        full_nlp = spacy.load(model_dir)
        load_timer.print_elapsed()

    if six.PY2:
        text = text.decode('utf-8')
    return full_nlp(text)
def evaluate_score():
    """Run ``predicts`` over all examples in batches and print rank metrics.

    A ``melt.Predictor`` is created from ``FLAGS.model_dir``. After the last
    batch the metrics accumulated in the module-level ``rank_metrics`` are
    printed, followed by the total elapsed time.
    """
    text_max_words = evaluator.all_distinct_texts.shape[1]
    print('text_max_words:', text_max_words)

    predictor = melt.Predictor(FLAGS.model_dir)

    timer = gezi.Timer()
    batch_start = 0
    while batch_start < FLAGS.num_examples:
        # The last batch is clipped to the number of examples.
        batch_end = batch_start + FLAGS.batch_size
        if batch_end > FLAGS.num_examples:
            batch_end = FLAGS.num_examples
        print('predicts start:', batch_start, 'end:', batch_end, file=sys.stderr)
        predicts(predictor, batch_start, batch_end)
        batch_start = batch_end

    metric_values = rank_metrics.get_metrics()
    metric_names = rank_metrics.get_names()
    melt.print_results(metric_values, metric_names)
    print('predict using time:', timer.elapsed())
def predicts(predictor, input_texts):
    """Print, for each input text, the first beam that does not start with unk.

    The texts are converted to word ids and decoded through the
    ``beam_text`` fetch. Within each input's results, entries whose first id
    equals the vocabulary's unk id are skipped; the first remaining one is
    printed as joined words and the rest are ignored.
    """
    word_ids_list = [
        _text2ids(one_text, INPUT_TEXT_MAX_WORDS) for one_text in input_texts
    ]

    timer = gezi.Timer()
    texts_list, scores_list = predictor.inference(
        ['beam_text', 'beam_text_score'],
        feed_dict={'seq2seq/model_init_1/input_text:0': word_ids_list})

    for beam_texts, beam_scores in zip(texts_list, scores_list):
        for ids, _ in zip(beam_texts, beam_scores):
            if ids[0] != text2ids.vocab.unk_id():
                print(''.join(text2ids.ids2words(ids, print_end=False)))
                break

    print('beam_search using time(ms):', timer.elapsed_ms())