Пример #1
0
def predict(predictor, image_path):
    """Caption one image twice and print the decoded results with timings.

    The image feature comes from ``image_model.process_one_image``. It is fed
    first to the 'text'/'text_score' outputs and then to the
    'beam_text'/'beam_text_score' outputs (presumably greedy vs. beam-search
    decoding -- confirm against the model graph). Nothing is returned.
    """
    feed_name = 'show_and_tell/model_init_1/image_feature:0'

    # The first timer also covers feature extraction.
    decode_timer = gezi.Timer()
    image_feature = image_model.process_one_image(image_path)
    decoded, _ = predictor.inference(['text', 'text_score'],
                                     feed_dict={feed_name: image_feature})
    for ids in decoded:
        print(ids, text2ids.ids2text(ids), 'decode time(ms):',
              decode_timer.elapsed_ms())

    beam_timer = gezi.Timer()
    beam_texts, beam_scores = predictor.inference(
        ['beam_text', 'beam_text_score'],
        feed_dict={feed_name: image_feature})

    # Only the first batch entry is printed.
    first_texts = beam_texts[0]
    first_scores = beam_scores[0]
    for ids, beam_score in zip(first_texts, first_scores):
        print(ids, text2ids.ids2text(ids), beam_score)

    print('beam_search using time(ms):', beam_timer.elapsed_ms())
Пример #2
0
def predict(predictor, input_text):
  """Decode one input text twice and print the results with timings.

  The text is converted to ids with ``_text2ids`` and fed to the
  'text'/'text_score' outputs, then to the 'beam_text'/'beam_text_score'
  outputs. Nothing is returned.
  """
  word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
  print('word_ids', word_ids, 'len:', len(word_ids))
  print(text2ids.ids2text(word_ids))

  # NOTE: running this under tfdbg can fail on tf.while_loop graphs with
  # "ValueError: Causality violated in timing relations of debug dumps"
  # (https://github.com/tensorflow/tensorflow/issues/8337). Suggestions from
  # that issue: set the loop's parallel_iterations to 1, since tfdbg may
  # mishandle while_loops with parallel_iterations > 1; it may also be GPU
  # specific, as the minimal example there passes with CUDA_VISIBLE_DEVICES=-1.
  feed_name = 'seq2seq/model_init_1/input_text:0'

  decode_timer = gezi.Timer()
  decoded, _ = predictor.inference(['text', 'text_score'],
                                   feed_dict={feed_name: [word_ids]})
  for ids in decoded:
    print(ids, text2ids.ids2text(ids), 'decode time(ms):', decode_timer.elapsed_ms())

  beam_timer = gezi.Timer()
  beam_texts, beam_scores = predictor.inference(['beam_text', 'beam_text_score'],
                                                feed_dict={feed_name: [word_ids]})

  # Only the first batch entry is printed (a single text was fed).
  first_texts = beam_texts[0]
  first_scores = beam_scores[0]
  for ids, beam_score in zip(first_texts, first_scores):
    print(ids, text2ids.ids2text(ids), beam_score)

  print('beam_search using time(ms):', beam_timer.elapsed_ms())
Пример #3
0
def predict(predictor, input_text):
  """Run the first two beam-search steps by hand and print the intermediates.

  Step one fetches the 'beam_search_initial_*' outputs for the input text.
  Step two feeds the ids from step one, plus the first initial state repeated
  once per id, into the 'beam_search_*' outputs. Nothing is returned.
  """
  def _show(step_ids, step_logprobs):
    # Ids with their text, then log-probabilities and plain probabilities.
    print(step_ids, text2ids.ids2text(step_ids))
    print('logprob', step_logprobs)
    print('prob', [math.exp(x) for x in step_logprobs])

  word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
  print('word_ids', word_ids, 'len:', len(word_ids))
  print(text2ids.ids2text(word_ids))

  first_timer = gezi.Timer()
  initial_state, ids, logprobs = predictor.inference(
      [
          'beam_search_initial_state',
          'beam_search_initial_ids',
          'beam_search_initial_logprobs',
      ],
      feed_dict={tf.get_collection('input_text_feed')[0]: [word_ids]})

  print('inital_state_shape', np.shape(initial_state))
  # Keep the first batch entry only (original note: [1, beam_size]).
  ids, logprobs = ids[0], logprobs[0]
  _show(ids, logprobs)
  print('inital_state', initial_state[0])
  print('first step using time(ms):', first_timer.elapsed_ms())

  second_timer = gezi.Timer()

  # One state row per candidate id, all copies of the first initial state.
  input_feed = np.array(ids)
  state_feed = np.array([initial_state[0]] * len(ids))
  print('input_feed_shape', np.shape(input_feed))
  print('state_feed_shape', np.shape(state_feed))

  state, ids, logprobs = predictor.inference(
      [
          'beam_search_state',
          'beam_search_ids',
          'beam_search_logprobs',
      ],
      feed_dict={
          tf.get_collection('beam_search_input_feed')[0]: input_feed,
          tf.get_collection('beam_search_state_feed')[0]: state_feed,
      })

  print(ids)
  print(logprobs)

  ids, logprobs = ids[0], logprobs[0]
  _show(ids, logprobs)
  print('state', state[0])
  print('second step using time(ms):', second_timer.elapsed_ms())
Пример #4
0
def load_constant(data_npy, sess=None, trainable=False, 
                  dtype=None, shape=None, name=None):
  """Create, once per name, a variable initialised from a numpy array.

  tf.constant is only practical for small data, so this builds a variable with
  a constant initializer instead. See
  https://stackoverflow.com/questions/35687678/using-a-pre-trained-word-embedding-word2vec-or-glove-in-tensorflow

  Args:
    data_npy: a numpy array, or a path string that is read with np.load.
    sess: accepted for interface compatibility; not used by the current code.
    trainable: forwarded to tf.get_variable.
    dtype: resolved from data_npy when None, but see the NOTE(review) below.
    shape: variable shape; defaults to data_npy.shape.
    name: variable name and cache key; defaults to 'constant_data'.

  Returns:
    The variable for `name`. Repeated calls with the same name return the
    cached variable without touching data_npy.
  """
  name = name or 'constant_data'

  # Cache kept as a function attribute so one variable exists per name.
  if not hasattr(load_constant, 'constants'):
    load_constant.constants = {}

  if name in load_constant.constants:
    return load_constant.constants[name]

  # isinstance (rather than an exact type check) also accepts str subclasses.
  if isinstance(data_npy, str):
    timer = gezi.Timer('np load %s' % data_npy)
    data_npy = np.load(data_npy)
    timer.print_elapsed()

  # NOTE(review): dtype is resolved here but never passed to tf.get_variable
  # below, so the variable uses get_variable's default dtype. Forwarding it
  # could change the dtype seen by existing callers -- confirm before changing.
  if dtype is None:
    dtype = npdtype2tfdtype(data_npy)

  if shape is None:
    shape = data_npy.shape

  # A placeholder-fed initializer was tried as a faster alternative, but the
  # variable was then not in a saved collection (so the embedding was not
  # checkpointed), or the global init op needed the feed again.
  # TODO: constant_initializer is sometimes slow, and tf_train_flow's
  # sess.run(init_op) runs it a second time.
  timer = gezi.Timer('constant_initializer')
  data = tf.get_variable(name, shape=shape, initializer=tf.constant_initializer(data_npy), trainable=trainable)
  load_constant.constants[name] = data
  timer.print_elapsed()

  return data
Пример #5
0
def evaluate_score():
    """Build and load two predictors, run batched prediction, print hit ratios.

    Only the FLAGS.algo predictor is passed to ``predicts``; the FLAGS.algo2
    predictor is built and loaded but not used further in this function. The
    counters ``hit`` and ``total_hit`` are read from module scope (presumably
    updated by ``predicts`` -- confirm).
    """
    text_max_words = evaluator.all_distinct_texts.shape[1]
    print('text_max_words:', text_max_words)

    with tf.variable_scope(FLAGS.algo2):
        predictor2 = algos.algos_factory.gen_predictor(FLAGS.algo2)
        predictor2.init_predict(text_max_words)
        predictor2.load(FLAGS.model2_dir)

    with tf.variable_scope(FLAGS.algo):
        predictor = algos.algos_factory.gen_predictor(FLAGS.algo)
        predictor.init_predict(text_max_words)
        predictor.load(FLAGS.model_dir)

    clock = gezi.Timer()
    begin = 0
    while begin < FLAGS.num_examples:
        # The last batch is clipped to the number of examples.
        stop = min(begin + FLAGS.batch_size, FLAGS.num_examples)
        print('predicts start:', begin, 'end:', stop, file=sys.stderr)
        predicts(predictor, begin, stop)
        begin = stop

    print('using time:', clock.elapsed())

    # Both ratios are computed before any of the summary lines is printed.
    per_example = hit / FLAGS.num_examples
    per_slot = total_hit / (FLAGS.num_examples * FLAGS.topn)
    print('num_hits:', hit)
    print('num_total_hits:', total_hit)
    print('hit_ratio:', per_example)
    print('total_hit_ratio:', per_slot)
Пример #6
0
def run(input, count=1):
    """Tokenize the 'comment_text' column of a CSV and count words and chars.

    The CSV is read into the global ``df`` and a manager list with one slot
    per comment is stored in the global ``context_tokens_list``. ``tokenize``
    is mapped over ``range(FLAGS.threads)`` in a process pool (presumably each
    worker fills its share of the list -- confirm against ``tokenize``).
    Every resulting token list is then added to ``counter`` and
    ``char_counter`` with weight ``count``.
    """
    global df, context_tokens_list
    df = pd.read_csv(input)
    context_tokens_list = man.list([None] * len(df['comment_text']))

    tokenize_timer = gezi.Timer('tokenize')
    workers = mp.Pool()
    workers.map(tokenize, range(FLAGS.threads))
    workers.close()
    workers.join()
    tokenize_timer.print_elapsed()

    for tokens in context_tokens_list:
        counter.add(START_WORD, count)
        # A token is counted once per comment, however often it occurs there.
        for word in set(tokens):
            counter.add(word, count)
            for char in word:
                char_counter.add(char, count)
        counter.add(END_WORD, count)
def run():
    """Score every listed image against all texts and print the top matches.

    Image features are read from the tab-separated files matching
    FLAGS.image_feature_pattern (first column: image key, last column:
    '\\x01'-joined floats). For each line of FLAGS.image_file the 50
    best-scoring texts are printed as HTML, and one line with the image index
    and elapsed time goes to stderr. The process exits as soon as an image has
    no feature.
    """
    image2feature = {}
    for path in glob.glob(FLAGS.image_feature_pattern):
        # Context manager so every feature file is closed after reading.
        with open(path) as feature_file:
            for line in feature_file:
                fields = line.strip().split('\t')
                image2feature[fields[0]] = fields[-1]

    # Loop-invariant inputs: loaded once instead of once per image.
    word_ids_list = np.load(FLAGS.all_texts)
    all_text_strs = np.load(FLAGS.all_text_strs)
    topn = 50

    with open(FLAGS.image_file) as image_file:
        for image_index, line in enumerate(image_file):
            image = line.strip()
            if image not in image2feature:
                print('image not find in ', FLAGS.image_feature_pattern)
                exit(0)
            image_feature = [
                float(x) for x in image2feature[image].split('\x01')
            ]
            timer = gezi.Timer()
            scores = predicts([image_feature], word_ids_list)
            print(img_html.format(image))
            indexes = (-scores).argsort()[:topn]
            # The rank gets its own name so that the image index reported on
            # stderr below is not overwritten by this inner loop.
            for rank, index in enumerate(indexes):
                print(rank, all_text_strs[index], scores[index])
                print('<br>')

            print(image_index, image, timer.elapsed(), file=sys.stderr)
def run():
  """Score every listed image against all words and print the top matches.

  Image features are read from the tab-separated files matching
  FLAGS.image_feature_pattern (first column: image key, last column:
  '\\x01'-joined floats). Images without a feature are skipped. Among the 50
  best-scoring indexes, those above 20000 and one specific vocabulary entry
  are filtered out; the rest are printed as HTML with a running rank. One line
  with the image index and elapsed time goes to stderr per image.
  """
  image2feature = {}
  for path in glob.glob(FLAGS.image_feature_pattern):
    # Context manager so every feature file is closed after reading.
    with open(path) as feature_file:
      for line in feature_file:
        fields = line.strip().split('\t')
        image2feature[fields[0]] = fields[-1]

  # Loop-invariant input: loaded once instead of once per image. The unused
  # load of FLAGS.all_text_strs was dropped; output here comes from the vocab.
  word_ids_list = np.load(FLAGS.all_texts)
  topn = 50

  with open(FLAGS.image_file) as image_file:
    for image_index, line in enumerate(image_file):
      image = line.strip()
      if image not in image2feature:
        continue
      image_feature = [float(x) for x in image2feature[image].split('\x01')]
      timer = gezi.Timer()
      scores = predicts([image_feature], word_ids_list)
      print(img_html.format(image))
      vocab = text2ids.vocab
      indexes = (-scores).argsort()[:topn]
      shown = 0
      # No reuse of the outer loop variable here, so the image index reported
      # on stderr below stays correct.
      for index in indexes:
        if index > 20000:
          continue
        word = vocab.key(int(index))
        if word == '±ûÏ©Ëá':
          continue
        print(shown, word, scores[index])
        print('<br>')
        shown += 1

      print(image_index, image, timer.elapsed(), file=sys.stderr)
Пример #9
0
  def gen_input(self, train_only=False):
    """Build the train input and, unless train_only, the validation inputs.

    Returns a dict keyed by self.input_train_name, self.input_valid_name and
    self.fixed_input_valid_name; entries that were not built stay None. Also
    sets self.train_with_validation when train_only is False.
    """
    timer = gezi.Timer('gen input')

    assert not FLAGS.feed_dict or not FLAGS.dynamic_batch_length, \
          'if use feed dict then must use fixed batch length, or use buket mode(@TODO)'

    # Every known input name starts out as None.
    input_results = dict.fromkeys([
        self.input_train_name,
        self.input_valid_name,
        self.fixed_input_valid_name,
    ])

    inputs, decode = input.get_decodes(FLAGS.shuffle_then_decode, FLAGS.dynamic_batch_length)

    # The second return value (the train set) is not needed here.
    train_input, _ = self.gen_train_input(inputs, decode)
    input_results[self.input_train_name] = train_input

    if not train_only:
      with_validation = bool(FLAGS.valid_input)
      self.train_with_validation = with_validation
      print('train_with_validation:', with_validation)
      if with_validation:
        # The third return value (eval batch size) is not needed here.
        valid_input, fixed_valid_input, _ = self.gen_valid_input(inputs, decode)
        input_results[self.input_valid_name] = valid_input
        input_results[self.fixed_input_valid_name] = fixed_valid_input

    print_input_results(input_results)

    timer.print()

    return input_results
Пример #10
0
  def load_graph(self, frozen_graph_file, frozen_graph_name='prefix', frozen_map_file=None):
    """Import a frozen GraphDef into self.sess.graph and rebuild collections.

    Args:
      frozen_graph_file: path of the serialized GraphDef protobuf.
      frozen_graph_name: name scope under which the graph is imported.
      frozen_map_file: optional text file with one
        "<collection name>\\t<tensor or op name>" pair per line. Each named
        tensor is added to that collection. A name without an output index
        gets ':0' appended. Blank lines are ignored.

    Returns:
      The graph the definition was imported into (self.sess.graph).
    """
    timer = gezi.Timer('load frozen graph from %s with mapfile %s' % (frozen_graph_file, frozen_map_file))
    # Read the protobuf from disk and parse it into a GraphDef.
    with tf.gfile.GFile(frozen_graph_file, "rb") as f:
      graph_def = tf.GraphDef()
      graph_def.ParseFromString(f.read())

    with self.sess.graph.as_default() as graph:
      tf.import_graph_def(
        graph_def,
        input_map=None,
        return_elements=None,
        name=frozen_graph_name,
        producer_op_list=None
      )

      if frozen_map_file is not None and os.path.exists(frozen_map_file):
        # Context manager so the map file is closed after reading.
        with open(frozen_map_file) as map_file:
          for line in map_file:
            line = line.strip()
            if not line:
              # A blank line would otherwise break the two-field unpack.
              continue
            cname, key = line.split('\t')
            # Accept any numeric output index, not only :0, :1 and :2.
            _, sep, output_index = key.rpartition(':')
            if not (sep and output_index.isdigit()):
              key = '%s:0' % key
            tensor = graph.get_tensor_by_name('%s/%s' % (frozen_graph_name, key))
            graph.add_to_collection(cname, tensor)

    timer.print_elapsed()
    return graph
Пример #11
0
 def restore_fn(sess):
     """Restore the image variables into `sess` from image_checkpoint_file.

     Uses `saver`, `image_model_name` and `image_checkpoint_file` from the
     enclosing scope, and logs and times the restore.
     """
     label = 'restore image var from %s %s' % (image_model_name,
                                               image_checkpoint_file)
     restore_timer = gezi.Timer(label)
     logging.info("Restoring image variables from checkpoint file %s",
                  image_checkpoint_file)
     saver.restore(sess, image_checkpoint_file)
     restore_timer.print()
Пример #12
0
def predicts(imgs, img_features, predictor, rank_metrics):
    """Score images against candidate texts and feed rank labels to metrics.

    The candidates are ``all_distinct_texts``, or a random subset of
    FLAGS.max_texts of them when that limit is positive and exceeded. Scoring
    runs in chunks of at most FLAGS.metric_eval_texts_size texts. For each
    image, texts are ranked by descending score and one boolean per position
    (is the text a known match for the image) is added to ``rank_metrics``.
    """
    timer = gezi.Timer('preidctor.bulk_predict')
    # TODO gpu outofmem predict for showandtell

    # Hard-wired switch: sample the subset randomly rather than take a prefix.
    sample_randomly = True
    # Stays None unless a random subset was drawn. When set, it maps a
    # position in `texts` back to an index into all_distinct_texts.
    sampled_index = None
    over_limit = (FLAGS.max_texts > 0
                  and len(all_distinct_texts) > FLAGS.max_texts)
    if not over_limit:
        texts = all_distinct_texts
    elif sample_randomly:
        sampled_index = np.random.choice(len(all_distinct_texts),
                                         FLAGS.max_texts,
                                         replace=False)
        texts = all_distinct_texts[sampled_index]
    else:
        texts = all_distinct_texts[:FLAGS.max_texts]

    step = len(texts)
    if 0 < FLAGS.metric_eval_texts_size < step:
        step = FLAGS.metric_eval_texts_size

    score_chunks = []
    start = 0
    while start < len(texts):
        end = min(start + step, len(texts))
        print('predicts texts start:',
              start,
              'end:',
              end,
              end='\r',
              file=sys.stderr)
        score_chunks.append(
            predictor.bulk_predict(img_features, texts[start:end]))
        start = end
    # Chunks are joined along axis 1.
    score = np.concatenate(score_chunks, 1)
    print('image_feature_shape:', img_features.shape, 'text_feature_shape:',
          texts.shape, 'score_shape:', score.shape)
    timer.print()
    img2text = get_bidrectional_lable_map()
    num_texts = texts.shape[0]

    for row, img in enumerate(imgs):
        ranked = (-score[row]).argsort()
        hits = img2text[img]

        # All positions are used. A top-n cut would only be valid for
        # recall@/precision@ style metrics, not for ndcg@.
        num_positions = num_texts

        if sampled_index is None:
            labels = [ranked[pos] in hits for pos in xrange(num_positions)]
        else:
            labels = [
                sampled_index[ranked[pos]] in hits
                for pos in xrange(num_positions)
            ]

        rank_metrics.add(labels)
Пример #13
0
def init_spacy_full():
    """Load the spaCy model at a fixed path into the global ``full_nlp``.

    Does nothing when ``full_nlp`` is already set. The load is timed.
    """
    import spacy
    global full_nlp
    if full_nlp is not None:
        return
    load_timer = gezi.Timer('load spacy model')
    model_dir = '/usr/local/lib/python3.5/dist-packages/spacy/data/en_core_web_md-2.0.0/'
    full_nlp = spacy.load(model_dir)
    load_timer.print_elapsed()
Пример #14
0
def get_or_restore_embedding(name='emb',
                             embedding_file=None,
                             trainable=None,
                             height=None,
                             emb_dim=None,
                             type='word'):
    """Return an embedding, loaded from a file or created via get_embedding.

    ``embedding_file`` and ``trainable`` default to the FLAGS entries for
    ``type`` ('word', 'char', 'ngram' or 'pinyin'). The file is loaded with
    ``melt.load_constant`` only when a file is configured and no model exists
    yet in FLAGS.model_dir; otherwise ``get_embedding`` is used.

    Raises:
        ValueError: if ``type`` is not one of the four known kinds.
    """
    # (kind, FLAGS attribute with the file, FLAGS attribute with the trainable
    # switch). Only the flags of the matching kind are read.
    flag_table = (
        ('word', 'word_embedding_file', 'finetune_word_embedding'),
        ('char', 'char_embedding_file', 'finetune_char_embedding'),
        ('ngram', 'ngram_embedding_file', 'finetune_ngram_embedding'),
        ('pinyin', 'pinyin_embedding_file', 'finetune_pinyin_embedding'),
    )
    for kind, file_flag, finetune_flag in flag_table:
        if type == kind:
            default_file = getattr(FLAGS, file_flag)
            default_trainable = getattr(FLAGS, finetune_flag)
            break
    else:
        raise ValueError(type)

    if embedding_file is None:
        embedding_file = default_file
    if trainable is None:
        trainable = default_trainable

    if embedding_file and not melt.exists_model(FLAGS.model_dir):
        # See https://github.com/tensorflow/tensorflow/issues/1570 (adagrad
        # still has to run on cpu).
        # TODO: when a run is restarted for finetuning, a checkpoint should
        # win over the file. To be safe, rerun with the embedding file flag
        # set to None or ''.
        logging.info('Loading {} from:{} and trainable=:{}'.format(
            name, embedding_file, trainable))
        load_timer = gezi.Timer('load constat')
        emb = melt.load_constant(embedding_file,
                                 name=name,
                                 trainable=trainable)
        load_timer.print_elapsed()
        return emb

    # No file configured, or a model already exists. Presumably a checkpoint
    # restore later overwrites the initial value -- TODO verify.
    logging.info(
        '{} random init or from model_dir and trainable=:{}'.format(
            name, trainable))
    return get_embedding(name=name,
                         trainable=trainable,
                         height=height,
                         emb_dim=emb_dim)
Пример #15
0
def predicts(predictor, input_texts, texts):
  """Fetch and print score, exact score and exact prob for batches of texts.

  Both text lists are converted to id lists with ``_text2ids`` and fed through
  FLAGS.input_text_name and FLAGS.text_name. Three timed inference calls are
  made: 'score', 'exact_score', and 'exact_prob' with 'seq2seq_logprobs'.
  Nothing is returned.
  """
  input_word_ids_list = [_text2ids(item, INPUT_TEXT_MAX_WORDS) for item in input_texts]
  word_ids_list = [_text2ids(item, INPUT_TEXT_MAX_WORDS) for item in texts]

  print(input_word_ids_list)
  print(word_ids_list)

  def _feed():
    # A fresh feed dict per inference call.
    return {
        FLAGS.input_text_name: input_word_ids_list,
        FLAGS.text_name: word_ids_list,
    }

  score_timer = gezi.Timer()
  score = predictor.inference(['score'], feed_dict=_feed())
  print('score:', score)
  print('calc score time(ms):', score_timer.elapsed_ms())

  # TODO FIXME (original note): not working -- "Incompatible shapes: [8] vs. [2,4]"
  exact_timer = gezi.Timer()
  exact_score = predictor.inference(['exact_score'], feed_dict=_feed())
  print('exact_score:', exact_score)
  print('calc score time(ms):', exact_timer.elapsed_ms())

  prob_timer = gezi.Timer()
  exact_prob, logprobs = predictor.inference(['exact_prob', 'seq2seq_logprobs'],
                                             feed_dict=_feed())
  print(exact_prob)
  print(logprobs)
  print('calc prob time(ms):', prob_timer.elapsed_ms())
Пример #16
0
def predicts(predictor, input_texts, texts):
  """Fetch and print score, exact score and exact prob for batches of texts.

  Both text lists are converted to id lists with ``_text2ids`` and fed through
  the last entries of the 'lfeed' and 'rfeed' graph collections. Three timed
  inference calls are made: 'score', 'exact_score', and 'exact_prob' with
  'seq2seq_logprobs'. Nothing is returned.
  """
  input_word_ids_list = [_text2ids(item, INPUT_TEXT_MAX_WORDS) for item in input_texts]
  word_ids_list = [_text2ids(item, INPUT_TEXT_MAX_WORDS) for item in texts]

  def _feed():
    # The collections are looked up again for every inference call.
    return {
        tf.get_collection('lfeed')[-1]: input_word_ids_list,
        tf.get_collection('rfeed')[-1]: word_ids_list,
    }

  score_timer = gezi.Timer()
  print(tf.get_collection('score'))
  score = predictor.inference('score', feed_dict=_feed())
  print('score:', score)
  print('calc score time(ms):', score_timer.elapsed_ms())

  exact_timer = gezi.Timer()
  exact_score = predictor.inference('exact_score', feed_dict=_feed())
  print('exact_score:', exact_score)
  print('calc score time(ms):', exact_timer.elapsed_ms())

  prob_timer = gezi.Timer()
  exact_prob, logprobs = predictor.inference(['exact_prob', 'seq2seq_logprobs'],
                                             feed_dict=_feed())
  print(exact_prob)
  print(logprobs)
  print('calc prob time(ms):', prob_timer.elapsed_ms())
Пример #17
0
def predict(predictor, input_text, text):
  """Fetch and print score, exact score and exact prob for one text pair.

  Both texts are converted to ids with ``_text2ids`` and fed as a batch of
  one through FLAGS.input_text_name and FLAGS.text_name. Three timed
  inference calls are made: 'score', 'exact_score', and 'exact_prob' with
  'seq2seq_logprobs'. Nothing is returned.
  """
  input_word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
  print('input_word_ids', input_word_ids, 'len:', len(input_word_ids))
  print(text2ids.ids2text(input_word_ids))
  word_ids = _text2ids(text, INPUT_TEXT_MAX_WORDS)
  print('word_ids', word_ids, 'len:', len(word_ids))
  print(text2ids.ids2text(word_ids))

  def _feed():
    # A fresh feed dict per inference call.
    return {
        FLAGS.input_text_name: [input_word_ids],
        FLAGS.text_name: [word_ids],
    }

  score_timer = gezi.Timer()
  score = predictor.inference(['score'], feed_dict=_feed())
  print('score:', score)
  print('calc score time(ms):', score_timer.elapsed_ms())

  exact_timer = gezi.Timer()
  exact_score = predictor.inference(['exact_score'], feed_dict=_feed())
  print('exact_score:', exact_score)
  print('calc score time(ms):', exact_timer.elapsed_ms())

  prob_timer = gezi.Timer()
  batch_prob, batch_logprobs = predictor.inference(['exact_prob', 'seq2seq_logprobs'],
                                                   feed_dict=_feed())

  # A single pair was fed, so only the first batch entry is reported.
  exact_prob = batch_prob[0]
  logprobs = batch_logprobs[0]
  print('exact_prob:', exact_prob, 'ecact_logprob:', math.log(exact_prob))
  print('logprobs:', logprobs)
  print('sum_logprobs:', gezi.gen_sum_list(logprobs))
  print('calc prob time(ms):', prob_timer.elapsed_ms())
 def load(self, model_dir, var_list=None, model_name=None, sess = None):
   """
   Restore variables from a checkpoint into the session and return it.

   Only variables are loaded, so the graph must be built before this is
   called. A given `sess` replaces self.sess first. The resolved checkpoint
   path is stored in self.model_path.
   """
   if sess is not None:
     self.sess = sess
   self.model_path = melt.get_model_path(model_dir, model_name)
   load_timer = gezi.Timer('load model ok %s' % self.model_path)
   # The saver returned by restore_from_path is not needed here.
   melt.restore_from_path(self.sess, self.model_path, var_list)
   load_timer.print()
   return self.sess
Пример #19
0
def get_image_names_and_features():
  """Return (image_names, image_features), loading them on the first call.

  Both arrays are read from .npy files in FLAGS.valid_resource_dir and kept in
  module globals; the features go through ``hack_image_features`` once.
  """
  global image_names, image_features
  if image_names is not None:
    return image_names, image_features

  image_feature_bin = os.path.join(FLAGS.valid_resource_dir, 'distinct_image_features.npy')
  image_name_bin = os.path.join(FLAGS.valid_resource_dir, 'distinct_image_names.npy')
  load_timer = gezi.Timer('get_image_names_and_features')
  image_names = np.load(image_name_bin)
  image_features = hack_image_features(np.load(image_feature_bin))
  print('all_distinct_images len:', len(image_features), file=sys.stderr)
  load_timer.print()
  return image_names, image_features
Пример #20
0
def main(_):
    """Read every batch of the record files and report id uniqueness.

    Input files come from FLAGS.input ('valid' replaced by 'test' when
    FLAGS.type is 'test'), minus the file ending in '<fold>.record' when
    FLAGS.fold is set. For each batch the index and the 'type' feature are
    printed; at the end the number of ids and of distinct ids is printed.
    """
    base = FLAGS.base
    logging.set_logging_path('./mount/tmp/')
    # vocab.txt is looked up two directory levels above FLAGS.input.
    vocab_dir = os.path.dirname(os.path.dirname(FLAGS.input))
    ids2text.init(os.path.join(vocab_dir, 'vocab.txt'))
    FLAGS.vocab = f'{base}/vocab.txt'

    tf.set_random_seed(FLAGS.random_seed)

    input_ = FLAGS.input
    if FLAGS.type == 'test':
        input_ = input_.replace('valid', 'test')

    inputs = gezi.list_files(input_)
    inputs.sort()
    if FLAGS.fold is not None:
        inputs = [
            path for path in inputs
            if not path.endswith('%d.record' % FLAGS.fold)
        ]

    print('type', FLAGS.type, 'inputs', inputs, file=sys.stderr)

    # The dataset is built in 'train' mode, without repeating.
    dataset = Dataset('train').make_batch(FLAGS.batch_size_,
                                          inputs,
                                          repeat=False)

    print('dataset', dataset)

    ids = []

    # NOTE: this timer is created but its elapsed time is never printed.
    timer = gezi.Timer('read record')
    for i, (x, y) in enumerate(dataset):
        ids.extend(x['id'].numpy())
        print(i, x['type'].numpy())

    print(len(ids), len(set(ids)))
Пример #21
0
def predicts(predictor, input_texts):
  """Fetch 'beam_text'/'beam_text_score' for a batch of texts and print them.

  Texts are converted to id lists with ``_text2ids`` and fed through the last
  entry of the 'lfeed' graph collection. Every candidate is printed with its
  decoded text, its score and the natural log of the score.
  """
  word_ids_list = [_text2ids(item, INPUT_TEXT_MAX_WORDS) for item in input_texts]
  beam_timer = gezi.Timer()
  feed = {tf.get_collection('lfeed')[-1]: word_ids_list}
  texts_list, scores_list = predictor.inference(['beam_text', 'beam_text_score'],
                                                feed_dict=feed)

  for candidates, candidate_scores in zip(texts_list, scores_list):
    for ids, value in zip(candidates, candidate_scores):
      print(ids, text2ids.ids2text(ids), value, math.log(value))

  print('beam_search using time(ms):', beam_timer.elapsed_ms())
def run():
  """Run ``predicts`` over FLAGS.num_images images in batches and time it.

  A melt.Predictor is created from FLAGS.model_dir. Each batch covers
  [start, end), with the last one clipped to FLAGS.num_images.
  """
  predictor = melt.Predictor(FLAGS.model_dir)

  logging.info('model:%s'%predictor.model_path)
  total_timer = gezi.Timer()
  begin = 0
  while begin < FLAGS.num_images:
    stop = min(FLAGS.num_images, begin + FLAGS.batch_size)
    print('predicts start:', begin, 'end:', stop, file=sys.stderr)
    predicts(predictor, begin, stop)
    begin = stop
  print('time:', total_timer.elapsed())
Пример #23
0
def main(_):
  """Build the prediction graph, load the model and decode sample texts.

  For each hard-coded input text, the 'text'/'score' tensors and then the
  'beam_text'/'beam_score' tensors returned by ``gen_predict_graph`` are run
  in the global session, and the decoded ids, texts, scores and timings are
  printed.
  """
  text2ids.init()
  # Optional outer variable scope: FLAGS.global_scope if set, else FLAGS.algo.
  global_scope = ''
  if FLAGS.add_global_scope:
    global_scope = FLAGS.global_scope if FLAGS.global_scope else FLAGS.algo
 
  global sess
  sess = melt.get_session(log_device_placement=FLAGS.log_device_placement)
  with tf.variable_scope(global_scope):
    predictor =  algos_factory.gen_predictor(FLAGS.algo)
    with tf.variable_scope(FLAGS.main_scope) as scope:
      text, score, beam_text, beam_score = gen_predict_graph(predictor, scope)

  # Variables are restored only after the graph has been built.
  predictor.load(FLAGS.model_dir) 
  # NOTE(review): the literals below look mis-encoded (presumably text saved
  # in a non-UTF-8 encoding) -- confirm against the original file.
  input_texts = ['���������һ�Ը�Ů�ڿ�����ջ�͸����˿¶�δ���������ڿ�Ů��-�Ա���',
                 '����̫����ô����',
                 '����������ʵ��С��ô��,����������ʵ��С���δ�ʩ',
                 '����ף�Ŀǰ4����1�굶']

  for input_text in input_texts:
    word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)

    print(word_ids)
    print(text2ids.ids2text(word_ids))

    # First pass: the 'text'/'score' tensors, fed a batch of one.
    timer = gezi.Timer()
    text_, score_ = sess.run([text, score], {predictor.input_text_place : [word_ids]})
    print(text_[0], text2ids.ids2text(text_[0]), score_[0], 'time(ms):', timer.elapsed_ms())

    # Second pass: the beam tensors, timed separately.
    timer = gezi.Timer()
    texts, scores = sess.run([beam_text, beam_score], {predictor.input_text_place : [word_ids]})

    # Only the first batch entry is printed (a single text was fed).
    texts = texts[0]
    scores = scores[0]
    for text_, score_ in zip(texts, scores):
      print(text_, text2ids.ids2text(text_), score_)

    print('beam_search using time(ms):', timer.elapsed_ms())
Пример #24
0
def predict(predictor, input_text, text):
  """Fetch and print score, exact score and exact prob for one text pair.

  Both texts are converted to ids with ``_text2ids`` and fed as a batch of
  one through the last entries of the 'lfeed' and 'rfeed' graph collections.
  The outputs 'score', 'exact_score' and 'exact_prob' are fetched in three
  separately timed calls. Nothing is returned.
  """
  input_word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
  word_ids = _text2ids(text, INPUT_TEXT_MAX_WORDS)

  def _feed():
    # The collections are looked up again for every inference call.
    return {
        tf.get_collection('lfeed')[-1]: [input_word_ids],
        tf.get_collection('rfeed')[-1]: [word_ids],
    }

  score_timer = gezi.Timer()
  score = predictor.inference('score', feed_dict=_feed())
  print('score:', score)
  print('calc score time(ms):', score_timer.elapsed_ms())

  exact_timer = gezi.Timer()
  exact_score = predictor.inference('exact_score', feed_dict=_feed())
  print('exact_score:', exact_score)
  print('calc score time(ms):', exact_timer.elapsed_ms())

  prob_timer = gezi.Timer()
  exact_prob = predictor.inference('exact_prob', feed_dict=_feed())
  print('exact_prob:', exact_prob)
  print('calc score time(ms):', prob_timer.elapsed_ms())
Пример #25
0
def get_image_names_and_features():
    """Return (image_names, image_features), loading them on the first call.

    When both FLAGS.image_name_bin and FLAGS.image_feature_bin are set, the
    arrays are read with np.load. Otherwise FLAGS.image_feature_file is parsed
    as tab-separated text: column 0 is the image name and the next
    IMAGE_FEATURE_LEN columns are floats. Results are kept in module globals.
    """
    global image_names, image_features
    if image_names is None:
        timer = gezi.Timer('get_image_names_and_features')
        if FLAGS.image_name_bin and FLAGS.image_feature_bin:
            names = np.load(FLAGS.image_name_bin)
            features = np.load(FLAGS.image_feature_bin)
        else:
            # Context manager so the feature file is closed after reading.
            with open(FLAGS.image_feature_file) as feature_file:
                rows = [line.split('\t') for line in feature_file]
            names = np.array([row[0] for row in rows])
            features = np.array(
                [[float(x) for x in row[1:1 + IMAGE_FEATURE_LEN]]
                 for row in rows])
        # Publish both globals together, only after everything has loaded.
        # A failure above then cannot leave image_names set while
        # image_features is still missing.
        image_names, image_features = names, features
        timer.print()
    return image_names, image_features
def predict(predictor, input_text):
  """Run the step-by-step beam search for `input_text` and print each beam.

  The text is converted to word ids, the initial beam-search tensors are
  fetched once from `predictor`, and `melt.seq2seq.beam_search` then drives
  decoding by calling back into `predictor` for every step.

  Args:
    predictor: object exposing `inference(fetches, feed_dict=...)`.
    input_text: raw text handed to `_text2ids`.
  """
  word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
  print('word_ids', word_ids, 'len:', len(word_ids))
  print(text2ids.ids2text(word_ids))

  timer = gezi.Timer()

  initial_fetches = [
      'beam_search_beam_size',
      'beam_search_initial_state',
      'beam_search_initial_ids',
      'beam_search_initial_logprobs',
      'beam_search_initial_alignments',
  ]
  init_states = predictor.inference(
      initial_fetches,
      feed_dict={tf.get_collection('input_text_feed')[0]: [word_ids]})

  def step_func(input_feed, state_feed):
    """Advance the beam search one step from the given ids and state."""
    step_fetches = [
        'beam_search_state',
        'beam_search_ids',
        'beam_search_logprobs',
        # 'beam_search_alignments' must be used here (not 'attention_alignments').
        'beam_search_alignments',
    ]
    # TODO: attention still needs the input_text feed on every step (see
    # rnn_decoder.py beam_search_step). Without attention the input_text
    # feed would not be required.
    step_feed = {
        tf.get_collection('input_text_feed')[0]: [word_ids],
        tf.get_collection('beam_search_input_feed')[0]: input_feed,
        tf.get_collection('beam_search_state_feed')[0]: state_feed,
    }
    return predictor.inference(step_fetches, feed_dict=step_feed)

  # Fall back to TEXT_MAX_WORDS when the flag is unset / falsy.
  max_words = FLAGS.decode_max_words or TEXT_MAX_WORDS
  beams = melt.seq2seq.beam_search(
      init_states,
      step_func,
      end_id=text2ids.end_id(),
      max_words=max_words,
      length_normalization_factor=0.)

  for rank, beam in enumerate(beams):
    decoded = text2ids.ids2text(beam.words)
    print(rank, beam.words, decoded, math.exp(beam.logprob), beam.logprob, beam.score, beam.logprobs)
    print(beam.alignments_list)

  print('beam search using time(ms):', timer.elapsed_ms())
Пример #27
0
    def gen_input(self, train_only=False):
        """Build the train (and optionally validation) inputs.

        Args:
            train_only: when true, the validation inputs are not built and
                their entries in the result stay None.

        Returns:
            Dict keyed by the ``self.input_*_name`` attributes; entries that
            were not generated are None.
        """
        timer = gezi.Timer('gen input')

        assert not FLAGS.feed_dict or not FLAGS.dynamic_batch_length, (
            'if use feed dict then must use fixed batch length, or use buket mode(@TODO)')

        # Every known input slot starts out as None.
        input_results = dict.fromkeys([
            self.input_train_name,
            self.input_train_neg_name,
            self.input_valid_name,
            self.fixed_input_valid_name,
            self.input_valid_neg_name,
        ])

        assert FLAGS.shuffle_then_decode, "since use sparse data for text, must shuffle then decode"

        use_neg = FLAGS.num_negs > 0
        inputs, decode_fn, decode_neg_fn = input.get_decodes(use_neg=use_neg)
        has_neg = decode_neg_fn is not None

        train_input, trainset = self.gen_train_input(inputs, decode_fn)
        input_results[self.input_train_name] = train_input

        if has_neg:
            input_results[self.input_train_neg_name] = \
                self.gen_train_neg_input(inputs, decode_neg_fn, trainset)

        if not train_only:
            # Validation inputs are only built when a validation source is set.
            self.train_with_validation = bool(FLAGS.valid_input)
            print('train_with_validation:', self.train_with_validation)
            if self.train_with_validation:
                valid_input, fixed_valid_input, eval_batch_size = \
                    self.gen_valid_input(inputs, decode_fn)
                input_results[self.input_valid_name] = valid_input
                input_results[self.fixed_input_valid_name] = fixed_valid_input

                if has_neg:
                    input_results[self.input_valid_neg_name] = \
                        self.gen_valid_neg_input(
                            inputs, decode_neg_fn, trainset, eval_batch_size)

        print_input_results(input_results)

        timer.print()

        return input_results
Пример #28
0
def doc(text):
    """Parse `text` with the lazily loaded spaCy pipeline and return the result.

    The pipeline is loaded on first use and cached in the module-level global
    ``full_nlp``.

    Args:
        text: text to parse. Byte strings are decoded as UTF-8 first; text
            that is already unicode is passed through unchanged.

    Returns:
        Whatever ``full_nlp(text)`` returns.
    """
    import spacy
    global full_nlp
    if full_nlp is None:
        # TODO FIXME (kept from the original): the model path is hard-coded;
        # `spacy.load("en")` was left commented out as the alternative.
        timer = gezi.Timer('load spacy model')
        full_nlp = spacy.load(
            '/usr/local/lib/python3.5/dist-packages/spacy/data/en_core_web_md-2.0.0/'
        )
        timer.print_elapsed()

    # Decode only real byte strings. Calling .decode() on a Python 2 unicode
    # object triggers an implicit ASCII encode and fails on non-ASCII text.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return full_nlp(text)
Пример #29
0
def evaluate_score():
    """Run `predicts` over the first FLAGS.num_examples examples in batches.

    Prints the width of `evaluator.all_distinct_texts`, processes the
    examples in [start, end) slices of at most FLAGS.batch_size, then prints
    the rank metrics and the total elapsed time.
    """
    text_max_words = evaluator.all_distinct_texts.shape[1]
    print('text_max_words:', text_max_words)
    predictor = melt.Predictor(FLAGS.model_dir)
    timer = gezi.Timer()

    batch_start = 0
    while batch_start < FLAGS.num_examples:
        # Clamp the final batch to the total number of examples.
        batch_end = min(batch_start + FLAGS.batch_size, FLAGS.num_examples)
        print('predicts start:', batch_start, 'end:', batch_end, file=sys.stderr)
        predicts(predictor, batch_start, batch_end)
        batch_start = batch_end

    metrics = rank_metrics.get_metrics()
    metric_names = rank_metrics.get_names()
    melt.print_results(metrics, metric_names)
    print('predict using time:', timer.elapsed())
Пример #30
0
def predicts(predictor, input_texts):
  """Beam-search decode a batch of texts and print one result per input.

  For every input, the first beam candidate whose leading id is not the
  vocabulary's unknown id is printed (words joined without separators);
  inputs with no such candidate print nothing.

  Args:
    predictor: object exposing `inference(fetches, feed_dict=...)`.
    input_texts: iterable of raw texts handed to `_text2ids`.
  """
  word_ids_list = []
  for input_text in input_texts:
    word_ids_list.append(_text2ids(input_text, INPUT_TEXT_MAX_WORDS))

  timer = gezi.Timer()
  feed = {'seq2seq/model_init_1/input_text:0': word_ids_list}
  texts_list, scores_list = predictor.inference(
      ['beam_text', 'beam_text_score'], feed_dict=feed)

  for candidates, candidate_scores in zip(texts_list, scores_list):
    # Scores are only used to bound the iteration, exactly like the pairing
    # of texts and scores does.
    for candidate, _ in zip(candidates, candidate_scores):
      if candidate[0] != text2ids.vocab.unk_id():
        words = text2ids.ids2words(candidate, print_end=False)
        print(''.join(words))
        break

  print('beam_search using time(ms):', timer.elapsed_ms())