示例#1
0
def _decode(example, parse, dynamic_batch_length):
    """Parse one serialized example into
    (image_name, text, text_str, input_text, input_text_str).

    'rtext' is the decode target and 'ltext' the encoder input; both are
    densified and, unless dynamic_batch_length is set, padded to their
    static maximum lengths (a maxlen of 0 keeps the dynamic length).
    """
    feature_spec = {
        'ltext_str': tf.FixedLenFeature([], tf.string),
        'ltext': tf.VarLenFeature(tf.int64),
        'rtext_str': tf.FixedLenFeature([], tf.string),
        'rtext': tf.VarLenFeature(tf.int64),
    }
    features = parse(example, features=feature_spec)

    # Target side: pad statically unless batching dynamically.
    text = melt.sparse_tensor_to_dense(
        features['rtext'], 0 if dynamic_batch_length else TEXT_MAX_WORDS)

    # Encoder side: dynamic rnn encoding just packs zeros at the end, so it is
    # numerically fine either way, but attention over a long static batch
    # length is slow (e.g. ~1.5 batch/s static vs ~3.55 dynamic).
    # TODO make attention masked
    input_text = melt.sparse_tensor_to_dense(
        features['ltext'], 0 if dynamic_batch_length else INPUT_TEXT_MAX_WORDS)

    text_str = features['rtext_str']
    input_text_str = features['ltext_str']

    # This record type has no image_name feature; fall back to the raw
    # target string so the return shape matches the image-caption pipeline.
    try:
        image_name = features['image_name']
    except Exception:
        image_name = text_str

    return image_name, text, text_str, input_text, input_text_str
示例#2
0
文件: input.py 项目: 52nlp/hasky
def _decode(example, parse, dynamic_batch_length):
    """Parse one serialized example into
    (image_name, text, text_str, input_text, input_text_str).
    """
    features = parse(
        example,
        features={
            'text_str': tf.FixedLenFeature([], tf.string),
            'text': tf.VarLenFeature(tf.int64),
            'input_text_str': tf.FixedLenFeature([], tf.string),
            'input_text': tf.VarLenFeature(tf.int64),
        })

    # Decode target: a maxlen of 0 keeps the dynamic per-batch length.
    target_maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
    text = melt.sparse_tensor_to_dense(features['text'], target_maxlen)

    # Encoder input is always padded to the static maximum; dynamic rnn
    # encoding just packs zeros at the end so this only costs speed.
    input_text = melt.sparse_tensor_to_dense(features['input_text'],
                                             INPUT_TEXT_MAX_WORDS)

    text_str = features['text_str']
    input_text_str = features['input_text_str']

    # This record type carries no image_name feature; fall back to the raw
    # target string so the return shape matches the image-caption pipeline.
    try:
        image_name = features['image_name']
    except Exception:
        image_name = text_str

    return image_name, text, text_str, input_text, input_text_str
示例#3
0
    def parser(self, example):
        """Parses a single tf.Example into a feature dict and label tensor."""
        feature_spec = {
            'id': tf.FixedLenFeature([], tf.string),
            'content_str': tf.FixedLenFeature([], tf.string),
            'content': tf.VarLenFeature(tf.int64),
            'char': tf.VarLenFeature(tf.int64),
            'label': tf.FixedLenFeature([], tf.int64),
            'source': tf.FixedLenFeature([], tf.string),
        }
        features = tf.parse_single_example(example, features=feature_spec)

        # Densify the variable-length word and char id sequences in place.
        for key in ('content', 'char'):
            features[key] = melt.sparse_tensor_to_dense(features[key])

        # x is the full feature dict, y the scalar label.
        return features, features['label']
示例#4
0
def _decode(example, parse):
    """Parse one serialized example into (ltext_str, ltext, rtext, rtext_str).

    The return order mirrors the image-caption pipeline's
    (image_name, image_feature, text, text_str) tuple.
    """
    features = parse(
        example,
        features={
            'ltext_str': tf.FixedLenFeature([], tf.string),
            'ltext': tf.VarLenFeature(tf.int64),
            'rtext_str': tf.FixedLenFeature([], tf.string),
            'rtext': tf.VarLenFeature(tf.int64),
        })

    # NOTE: decomposable_nli uses masked softmax, so when
    # FLAGS.dynamic_batch_length is on, dynamic length (maxlen 0) must be
    # applied to BOTH sides.
    pad_len = 0 if FLAGS.dynamic_batch_length else TEXT_MAX_WORDS
    ltext = melt.sparse_tensor_to_dense(features['ltext'], pad_len)
    rtext = melt.sparse_tensor_to_dense(features['rtext'], pad_len)

    return features['ltext_str'], ltext, rtext, features['rtext_str']
示例#5
0
def _decode(example, parse, dynamic_batch_length):
    """Parse one serialized example into
    (image_name, text, text_str, input_text, input_text_str).

    The encoder input is taken from the 'real_title' field.
    """
    features = parse(
        example,
        features={
            'image_name': tf.FixedLenFeature([], tf.string),
            'url': tf.FixedLenFeature([], tf.string),
            'text_str': tf.FixedLenFeature([], tf.string),
            'ct0_str': tf.FixedLenFeature([], tf.string),
            'title_str': tf.FixedLenFeature([], tf.string),
            'real_title_str': tf.FixedLenFeature([], tf.string),
            'text': tf.VarLenFeature(tf.int64),
            'ct0': tf.VarLenFeature(tf.int64),
            'title': tf.VarLenFeature(tf.int64),
            'real_title': tf.VarLenFeature(tf.int64),
        })

    input_type = 'real_title'
    image_name = features['image_name']

    # Decode target: a maxlen of 0 keeps the dynamic per-batch length.
    text = melt.sparse_tensor_to_dense(
        features['text'], 0 if dynamic_batch_length else TEXT_MAX_WORDS)

    # Encoder input is always padded to the static maximum; dynamic rnn
    # encoding just packs zeros at the end so correctness is unaffected.
    input_text = melt.sparse_tensor_to_dense(features[input_type],
                                             INPUT_TEXT_MAX_WORDS)

    text_str = features['text_str']
    input_text_str = features['{}_str'.format(input_type)]

    return image_name, text, text_str, input_text, input_text_str
示例#6
0
def _decode(example, parse):
    """Parse one serialized example into
    (image_name, image_feature, text, text_str, input_text, input_text_str).

    Decode-side feature names come from FLAGS.decode_name /
    FLAGS.decode_str_name.  The image arrives either as a pre-calculated
    float vector (FLAGS.pre_calc_image_feature) or as raw encoded bytes.
    A maxlen of 0 keeps the dynamic per-batch length.
    """
    features_dict = {
        'image_name': tf.FixedLenFeature([], tf.string),
        FLAGS.decode_name: tf.VarLenFeature(tf.int64),
        FLAGS.decode_str_name: tf.FixedLenFeature([], tf.string),
        'input_text': tf.VarLenFeature(tf.int64),
        'input_text_str': tf.FixedLenFeature([], tf.string),
    }

    if FLAGS.pre_calc_image_feature:
        features_dict[FLAGS.image_feature_name] = tf.FixedLenFeature(
            [IMAGE_FEATURE_LEN], tf.float32)
    else:
        features_dict['image_data'] = tf.FixedLenFeature([], dtype=tf.string)

    features = parse(example, features=features_dict)

    image_name = features['image_name']
    if FLAGS.pre_calc_image_feature:
        image_feature = features[FLAGS.image_feature_name]
    else:
        image_feature = features['image_data']

    text_str = features[FLAGS.decode_str_name]

    text = features[FLAGS.decode_name]
    maxlen = 0 if FLAGS.dynamic_batch_length else TEXT_MAX_WORDS
    text = melt.sparse_tensor_to_dense(text, maxlen)

    input_text_str = features['input_text_str']
    input_text = features['input_text']
    input_maxlen = 0 if FLAGS.dynamic_batch_length else INPUT_TEXT_MAX_WORDS
    # BUGFIX: previously padded with `maxlen` (the decoder-side TEXT_MAX_WORDS),
    # leaving `input_maxlen` computed above unused; use the encoder-side limit.
    input_text = melt.sparse_tensor_to_dense(input_text, input_maxlen)

    return image_name, image_feature, text, text_str, input_text, input_text_str
示例#7
0
    def parser(self, example):
        """Parses a single tf.Example into a feature dict and the answer label."""
        features = tf.parse_single_example(
            example,
            features={
                'id': tf.FixedLenFeature([], tf.string),
                'url': tf.FixedLenFeature([], tf.string),
                'answer': tf.FixedLenFeature([], tf.int64),
                'answer_str': tf.FixedLenFeature([], tf.string),
                'query': tf.VarLenFeature(tf.int64),
                'query_str': tf.FixedLenFeature([], tf.string),
                'passage': tf.VarLenFeature(tf.int64),
                'passage_str': tf.FixedLenFeature([], tf.string),
                'candidate_neg': tf.VarLenFeature(tf.int64),
                'candidate_pos': tf.VarLenFeature(tf.int64),
                'alternatives': tf.FixedLenFeature([], tf.string),
                'candidates': tf.FixedLenFeature([], tf.string),
                'type': tf.FixedLenFeature([], tf.int64),
            })

        # Densify all variable-length id sequences in place.
        for key in ('query', 'passage', 'candidate_neg', 'candidate_pos'):
            features[key] = melt.sparse_tensor_to_dense(features[key])

        query = features['query']
        passage = features['passage']

        # Joint sequences for models reading passage+query as one input;
        # the [1:] slice drops the second segment's leading mark token.
        features['content'] = tf.concat([passage, query[1:]], 0)
        features['rcontent'] = tf.concat([query, passage[1:]], 0)

        # x is the full feature dict, y the scalar answer label.
        return features, features['answer']
示例#8
0
def _decode(example, parse, dynamic_batch_length):
    """Parse one serialized example into
    (image_name, text, text_str, input_text, input_text_str).

    The encoder input is taken from the 'real_title' field.  Both sequences
    are densified (maxlen 0 keeps the dynamic per-batch length) and then
    padded with decoder / encoder start-end marks via melt.pad.
    """
    features = parse(example,
                     features={
                         'image_name': tf.FixedLenFeature([], tf.string),
                         'url': tf.FixedLenFeature([], tf.string),
                         'text_str': tf.FixedLenFeature([], tf.string),
                         'ct0_str': tf.FixedLenFeature([], tf.string),
                         'title_str': tf.FixedLenFeature([], tf.string),
                         'real_title_str': tf.FixedLenFeature([], tf.string),
                         'text': tf.VarLenFeature(tf.int64),
                         'ct0': tf.VarLenFeature(tf.int64),
                         'title': tf.VarLenFeature(tf.int64),
                         'real_title': tf.VarLenFeature(tf.int64),
                     })

    image_name = features['image_name']
    text = features['text']
    input_type = 'real_title'
    input_text = features[input_type]

    maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
    text = melt.sparse_tensor_to_dense(text, maxlen)

    # Attach decoder start/end marks to the target sequence.
    text, _ = melt.pad(text,
                       start_id=get_decoder_start_id(),
                       end_id=get_decoder_end_id())

    input_maxlen = 0 if dynamic_batch_length else INPUT_TEXT_MAX_WORDS
    # BUGFIX: previously padded with `maxlen` (the decoder-side TEXT_MAX_WORDS),
    # leaving `input_maxlen` computed above unused; use the encoder-side limit.
    input_text = melt.sparse_tensor_to_dense(input_text, input_maxlen)

    # Encoder marks are optional, controlled by the encode_*_mark flags.
    input_text, _ = melt.pad(
        input_text,
        start_id=(vocabulary.vocab.start_id()
                  if FLAGS.encode_start_mark else None),
        end_id=(encoder_end_id if FLAGS.encode_end_mark else None))

    text_str = features['text_str']
    input_text_str = features['{}_str'.format(input_type)]

    return image_name, text, text_str, input_text, input_text_str
示例#9
0
def _decode_neg(example, parse, dynamic_batch_length):
  """Parse one negative-sample example into (text, text_str)."""
  features = parse(
      example,
      features={
          'text': tf.VarLenFeature(tf.int64),
          'text_str': tf.FixedLenFeature([], tf.string),
      })

  # A maxlen of 0 keeps the dynamic per-batch length; otherwise pad to the
  # static maximum.
  pad_len = 0 if dynamic_batch_length else TEXT_MAX_WORDS
  dense_text = melt.sparse_tensor_to_dense(features['text'], pad_len)

  return dense_text, features['text_str']
示例#10
0
def _decode_neg(example, parse):
    """Parse one negative-sample example into (text, text_str).

    Feature keys come from FLAGS.decode_name / FLAGS.decode_str_name.
    """
    features = parse(
        example,
        features={
            FLAGS.decode_name: tf.VarLenFeature(tf.int64),
            FLAGS.decode_str_name: tf.FixedLenFeature([], tf.string),
        })

    # A maxlen of 0 keeps the dynamic per-batch length; otherwise pad to the
    # static maximum.
    pad_len = 0 if FLAGS.dynamic_batch_length else TEXT_MAX_WORDS
    dense_text = melt.sparse_tensor_to_dense(features[FLAGS.decode_name],
                                             pad_len)

    return dense_text, features[FLAGS.decode_str_name]
示例#11
0
  def parser(self, example):
    """Parses a single tf.Example; returns (features, content).

    The target y is the content id sequence itself.
    """
    features = tf.parse_single_example(
        example,
        features={
            'content_str': tf.FixedLenFeature([], tf.string),
            'content': tf.VarLenFeature(tf.int64),
            'char': tf.VarLenFeature(tf.int64),
            'source': tf.FixedLenFeature([], tf.string),
        })

    # Densify the variable-length word and char id sequences in place.
    # NOTE: min-count filtering does not work inside dataset parsing, so it is
    # applied in a later step (and must be mirrored for pytorch).
    for key in ('content', 'char'):
      features[key] = melt.sparse_tensor_to_dense(features[key])

    # These records carry no id; supply a dummy so downstream code can rely
    # on the key being present.
    features['id'] = tf.constant(0, dtype=tf.int64)

    return features, features['content']
示例#12
0
def _decode(example, parse, dynamic_batch_length):
  """Parse one serialized example into (image_name, image_feature, text, text_str)."""
  features = parse(
      example,
      features={
          'image_name': tf.FixedLenFeature([], tf.string),
          'image_feature': tf.FixedLenFeature([IMAGE_FEATURE_LEN], tf.float32),
          'text': tf.VarLenFeature(tf.int64),
          'text_str': tf.FixedLenFeature([], tf.string),
      })

  # A maxlen of 0 keeps the dynamic per-batch length; otherwise pad to the
  # static maximum.
  pad_len = 0 if dynamic_batch_length else TEXT_MAX_WORDS
  dense_text = melt.sparse_tensor_to_dense(features['text'], pad_len)

  return (features['image_name'], features['image_feature'], dense_text,
          features['text_str'])
示例#13
0
  def parser(self, example):
    """Parses a single tf.Example into (features dict, classes tensor).

    Densifies all variable-length sequences (word ids, char ids, ngrams,
    simple chars, token info, pos/tag/ner), writes them back into the
    feature dict, then optionally truncates the working copies to the
    configured comment limits.  Start/end-mark insertion is currently
    disabled (the single-element list + concat pattern below is a no-op
    placeholder for it).  Returns the feature dict as x and the multi-label
    'classes' vector as y.
    """
    comment_key = 'comment'

    features_dict = {
      'id':  tf.FixedLenFeature([], tf.string),
      'comment_str':  tf.FixedLenFeature([], tf.string),
      'comment_tokens_str':  tf.FixedLenFeature([], tf.string),
      comment_key: tf.VarLenFeature(tf.int64),
      'comment_chars':  tf.VarLenFeature(tf.int64),
      'comment_ngrams': tf.VarLenFeature(tf.int64),
      'simple_chars':  tf.VarLenFeature(tf.int64),
      'tokens_info': tf.VarLenFeature(tf.float32),
      'pos': tf.VarLenFeature(tf.int64),
      'tag': tf.VarLenFeature(tf.int64),
      'ner': tf.VarLenFeature(tf.int64),
      'classes': tf.FixedLenFeature([NUM_CLASSES], tf.float32),
      #'weight': tf.FixedLenFeature([1], tf.float32),  # weight support disabled
      }

    features = tf.parse_single_example(example, features=features_dict)

    #---- densify each variable-length sequence and write it back in place
    comment = features[comment_key]
    comment = melt.sparse_tensor_to_dense(comment)
    features[comment_key] = comment

    comment_chars = features['comment_chars']
    comment_chars = melt.sparse_tensor_to_dense(comment_chars)
    features['comment_chars'] = comment_chars

    tokens_info = features['tokens_info']
    tokens_info = melt.sparse_tensor_to_dense(tokens_info)
    features['tokens_info'] = tokens_info

    classes = features['classes']

    #----------- simple chars (per whole comment), 'what a pity' -> 'w|h|a|t| |a| |p|i|t|y'
    # TODO simple char can change to use ngram model seq or sum ngram
    simple_chars = features['simple_chars']
    simple_chars = melt.sparse_tensor_to_dense(simple_chars)
    features['simple_chars'] = simple_chars

    pos = features['pos']
    pos = melt.sparse_tensor_to_dense(pos)
    # BUGFIX: was `features['pos'] = pos()` — calling a Tensor raises
    # TypeError at graph-construction time.
    features['pos'] = pos

    tag = features['tag']
    tag = melt.sparse_tensor_to_dense(tag)
    features['tag'] = tag

    ner = features['ner']
    ner = melt.sparse_tensor_to_dense(ner)
    features['ner'] = ner

    comment_ngrams = features['comment_ngrams']
    comment_ngrams = melt.sparse_tensor_to_dense(comment_ngrams)
    features['comment_ngrams'] = comment_ngrams

    # Kept although only the disabled start/end-mark code referenced it:
    # constructing the vocabulary also validates that the char vocab file exists.
    char_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))

    #--- limit lengths here; tensors are 1-d (parse_single_example), padding
    #--- happens later in dataset.padded_batch.  Better done when generating
    #--- the tfrecords; this is a safety net.
    # BUGFIX: was `self.subset is 'train'` — identity comparison against a
    # string literal is implementation-dependent; use equality.
    limit = FLAGS.comment_limit if self.subset == 'train' else FLAGS.test_comment_limit
    if limit:
      comment = comment[:limit]
      comment_chars = comment_chars[:limit * FLAGS.char_limit]
      # NOTE(review): the truncated tokens_info is never written back into
      # `features` below — confirm whether that is intended.
      tokens_info = tokens_info[:limit * len(attribute_names)]
      if FLAGS.use_pos:
        pos = pos[:limit]
        tag = tag[:limit]
        ner = ner[:limit]

    if FLAGS.use_pos:
      pos_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'pos_vocab.txt'))
      tag_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'tag_vocab.txt'))
      ner_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ner_vocab.txt'))
      def append_start_end_mark(tag, start, end):
        # Start/end-mark insertion is disabled: with a single-element list the
        # concat never runs and the input is returned unchanged.
        tag_list = [tag]
        if len(tag_list) > 1:
          tag = tf.concat(tag_list, 0)
        return tag

      # NOTE(review): the marked pos/tag/ner are not written back into
      # `features` — confirm whether that is intended.
      pos = append_start_end_mark(pos, pos_vocab.start_id(), pos_vocab.end_id())
      tag = append_start_end_mark(tag, tag_vocab.start_id(), tag_vocab.end_id())
      ner = append_start_end_mark(ner, ner_vocab.start_id(), ner_vocab.end_id())

    #----------- comment start/end marks (currently disabled)
    comment_list = [comment]
    if len(comment_list) > 1:
      comment = tf.concat(comment_list, 0)

    #----------- tokens info start/end marks (currently disabled)
    if FLAGS.use_token_info:
      tokens_info_list = [tokens_info]
      if len(tokens_info_list) > 1:
        tokens_info = tf.concat(tokens_info_list, 0)

    #----------- comment chars start/end marks (currently disabled)
    if FLAGS.use_char:
      comment_chars_list = [comment_chars]
      if len(comment_chars_list) > 1:
        comment_chars = tf.concat(comment_chars_list, 0)

    #----------- comment ngrams start/end marks (currently disabled)
    if FLAGS.use_ngrams:
      # Loading the ngram vocab also validates that the file exists.
      ngram_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ngram_vocab.txt'))
      comment_ngrams_list = [comment_ngrams]
      if len(comment_ngrams_list) > 1:
        comment_ngrams = tf.concat(comment_ngrams_list, 0)

    #----------- simple chars start/end marks (currently disabled)
    simple_chars_list = [simple_chars]
    if len(simple_chars_list) > 1:
      simple_chars = tf.concat(simple_chars_list, 0)

    # Write the possibly-truncated sequences back.
    features[comment_key] = comment
    features['comment_chars'] = comment_chars
    features['simple_chars'] = simple_chars
    features['comment_ngrams'] = comment_ngrams

    x = features
    y = classes
    return x, y
示例#14
0
文件: util.py 项目: meng-jia/wenzheng
 def sparse2dense_(features, key):
   """Replace features[key] with its dense tensor form, in place."""
   features[key] = melt.sparse_tensor_to_dense(features[key])
示例#15
0
  def parser(self, example):
    """Parses a single tf.Example into a feature dict and the answer label.

    Densifies every variable-length id sequence (word / char / pos variants
    for query, passage and the candidate fields) in place, then builds the
    'content' / 'rcontent' passage+query concatenations (the [1:] slice
    drops the second segment's leading mark token).
    """
    # NOTE: 'query_char' and 'query_pos' were previously listed twice in this
    # dict with identical specs; the redundant duplicates have been removed.
    features_dict = {
      'id':  tf.FixedLenFeature([], tf.string),
      'url':  tf.FixedLenFeature([], tf.string),
      'answer': tf.FixedLenFeature([], tf.int64),
      'answer_str':  tf.FixedLenFeature([], tf.string),
      'query': tf.VarLenFeature(tf.int64),
      'query_char': tf.VarLenFeature(tf.int64),
      'query_pos': tf.VarLenFeature(tf.int64),
      'query_str':  tf.FixedLenFeature([], tf.string),
      'passage': tf.VarLenFeature(tf.int64),
      'passage_char': tf.VarLenFeature(tf.int64),
      'passage_pos': tf.VarLenFeature(tf.int64),
      'passage_str':  tf.FixedLenFeature([], tf.string),
      'candidate_neg':  tf.VarLenFeature(tf.int64),
      'candidate_neg_char': tf.VarLenFeature(tf.int64),
      'candidate_neg_pos': tf.VarLenFeature(tf.int64),
      'candidate_pos':  tf.VarLenFeature(tf.int64),
      'candidate_pos_char': tf.VarLenFeature(tf.int64),
      'candidate_pos_pos': tf.VarLenFeature(tf.int64),
      'candidate_na': tf.VarLenFeature(tf.int64),
      'candidate_na_char': tf.VarLenFeature(tf.int64),
      'candidate_na_pos': tf.VarLenFeature(tf.int64),
      'alternatives':  tf.FixedLenFeature([], tf.string),
      'candidates':  tf.FixedLenFeature([], tf.string),
      'type':  tf.FixedLenFeature([], tf.int64),
      }

    features = tf.parse_single_example(example, features=features_dict)

    def s2d(name):
      # Densify one variable-length feature in place.
      features[name] = melt.sparse_tensor_to_dense(features[name])

    for name in ['query', 'passage',
                 'candidate_neg', 'candidate_pos', 'candidate_na',
                 'query_char', 'query_pos',
                 'passage_char', 'passage_pos',
                 'candidate_neg_char', 'candidate_neg_pos',
                 'candidate_pos_char', 'candidate_pos_pos',
                 'candidate_na_char', 'candidate_na_pos']:
      s2d(name)

    passage = features['passage']
    query = features['query']

    # Joint sequences for models reading passage+query as a single input.
    features['content'] = tf.concat([passage, query[1:]], 0)
    features['rcontent'] = tf.concat([query, passage[1:]], 0)

    # x is the full feature dict, y the scalar answer label.
    return features, features['answer']
示例#16
0
    def parse(self, example):
        """Parses a single tf.Example into (features dict, label tensor).

        Densifies the variable-length sequences (content, char, pos, ner,
        wlen), optionally truncates content to FLAGS.content_limit, and
        derives the training target: hard labels are shifted by +2 (and
        optionally binarized against FLAGS.binary_class_index); soft labels
        are returned unchanged.
        """
        features_dict = {
            'id': tf.FixedLenFeature([], tf.string),
            'content_str': tf.FixedLenFeature([], tf.string),
            'content': tf.VarLenFeature(tf.int64),
            'char': tf.VarLenFeature(tf.int64),
            'pos': tf.VarLenFeature(tf.int64),
            'ner': tf.VarLenFeature(tf.int64),
            'wlen': tf.VarLenFeature(tf.int64),
            #'label': tf.FixedLenFeature([NUM_ATTRIBUTES], tf.int64),
            'source': tf.FixedLenFeature([], tf.string),
        }

        # Soft labels are a flat per-attribute class distribution (floats);
        # hard labels are one int per attribute.
        if FLAGS.use_soft_label:
            features_dict['label'] = tf.FixedLenFeature(
                [NUM_ATTRIBUTES * NUM_CLASSES], tf.float32)
        else:
            features_dict['label'] = tf.FixedLenFeature([NUM_ATTRIBUTES],
                                                        tf.int64)

        #if FLAGS.use_char:
        #features_dict['chars'] = tf.VarLenFeature(tf.int64)

        features = tf.parse_single_example(example, features=features_dict)

        content = features['content']
        content = melt.sparse_tensor_to_dense(content)
        # Actually not use below, for bert now use nbert tfrecords which is [first_n and last_m] so do not need content_limt 512 here
        if FLAGS.content_limit:
            # TODO now only condider bert.. whey content[0] or content[:0] content[-1] not work ? FIXME..
            # NOTE(review): 101/102 look like BERT's [CLS]/[SEP] token ids —
            # confirm against the Transformer vocabulary.
            start_id = vocabulary.start_id(
            ) if not FLAGS.model == 'Transformer' else 101
            end_id = vocabulary.end_id(
            ) if not FLAGS.model == 'Transformer' else 102
            # TODO now has problem ... one additional end or start...
            if not FLAGS.cut_front:
                # Keep the front of the sequence, re-attach the end mark.
                content = tf.concat([
                    content[:FLAGS.content_limit - 1],
                    tf.constant([end_id], dtype=tf.int64)
                ], 0)
            else:
                # Keep the tail of the sequence, re-attach the start mark.
                content = tf.concat([
                    tf.constant([start_id], dtype=tf.int64),
                    content[-FLAGS.content_limit + 1:]
                ], 0)
        # if FLAGS.add_start_end:
        #   content = tf.concat([tf.constant([vocabulary.start_id()], dtype=tf.int64), content, tf.constant([vocabulary.end_id()], dtype=tf.int64)], 0)
        # NOTICE! not work in dataset... so put to later step like in call but should do the same thing again for pytorch..
        ## TODO can use below to do unk aug so not to have different code for tf and pytorch later
        # if FLAGS.vocab_min_count:
        # #   content = melt.greater_then_set(content, FLAGS.vocab_min_count, UNK_ID)

        features['content'] = content
        label = features['label']

        # Densify the remaining variable-length features in place.
        #if FLAGS.use_char:
        chars = features['char']
        chars = melt.sparse_tensor_to_dense(chars)
        # if FLAGS.char_min_count:
        #   chars = melt.greater_then_set(chars, FLAGS.char_min_count, UNK_ID)
        features['char'] = chars

        pos = features['pos']
        pos = melt.sparse_tensor_to_dense(pos)
        # if FLAGS.tag_min_count:
        #   pos = melt.greater_then_set(pos, FLAGS.tag_min_count, UNK_ID)
        features['pos'] = pos

        ner = features['ner']
        ner = melt.sparse_tensor_to_dense(ner)
        # if FLAGS.tag_min_count:
        #   ner = melt.greater_then_set(ner, FLAGS.tag_min_count, UNK_ID)
        features['ner'] = ner

        wlen = features['wlen']
        wlen = melt.sparse_tensor_to_dense(wlen)
        features['wlen'] = wlen

        x = features
        if not FLAGS.use_soft_label:
            # NOTE(review): the +2 shift suggests stored hard labels start at
            # -2 — confirm against the tfrecord generation code.
            y = label + 2
            if FLAGS.binary_class_index is not None:
                # One-vs-rest target: 1 where y equals the chosen class.
                y = tf.to_int64(tf.equal(y, FLAGS.binary_class_index))
        else:
            y = label

        return x, y
示例#17
0
 def s2d(name):
   """Replace features[name] with its dense tensor form, in place.

   `features` is taken from the enclosing scope.
   """
   features[name] = melt.sparse_tensor_to_dense(features[name])