def _decode(example, parse, dynamic_batch_length):
  features = parse(example,
                   features={
                       'ltext_str': tf.FixedLenFeature([], tf.string),
                       'ltext': tf.VarLenFeature(tf.int64),
                       'rtext_str': tf.FixedLenFeature([], tf.string),
                       'rtext': tf.VarLenFeature(tf.int64),
                   })

  text = features['rtext']
  input_text = features['ltext']

  maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
  text = melt.sparse_tensor_to_dense(text, maxlen)
  #for attention to be numerically stable, and since encoding does not affect speed, dynamic rnn encode just packs zeros at the end
  #but encoding attention with a long batch length will affect speed.. e.g. with length 100 about 1.5 batch/s while dynamic will be 3.55
  #TODO make attention masked
  input_maxlen = 0 if dynamic_batch_length else INPUT_TEXT_MAX_WORDS
  #input_maxlen = INPUT_TEXT_MAX_WORDS
  input_text = melt.sparse_tensor_to_dense(input_text, input_maxlen)

  text_str = features['rtext_str']
  input_text_str = features['ltext_str']

  # 'image_name' is not among the parsed features here, so fall back to text_str
  try:
    image_name = features['image_name']
  except Exception:
    image_name = text_str

  return image_name, text, text_str, input_text, input_text_str
def _decode(example, parse, dynamic_batch_length):
  features = parse(example,
                   features={
                       'text_str': tf.FixedLenFeature([], tf.string),
                       'text': tf.VarLenFeature(tf.int64),
                       'input_text_str': tf.FixedLenFeature([], tf.string),
                       'input_text': tf.VarLenFeature(tf.int64),
                   })

  text = features['text']
  input_text = features['input_text']

  maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
  text = melt.sparse_tensor_to_dense(text, maxlen)
  #for attention to be numerically stable, and since encoding does not affect speed, dynamic rnn encode just packs zeros at the end
  #input_maxlen = 0 if dynamic_batch_length else INPUT_TEXT_MAX_WORDS
  input_maxlen = INPUT_TEXT_MAX_WORDS
  input_text = melt.sparse_tensor_to_dense(input_text, input_maxlen)

  text_str = features['text_str']
  input_text_str = features['input_text_str']

  # 'image_name' is not among the parsed features here, so fall back to text_str
  try:
    image_name = features['image_name']
  except Exception:
    image_name = text_str

  return image_name, text, text_str, input_text, input_text_str
def parser(self, example): """Parses a single tf.Example into image and label tensors.""" features_dict = { 'id': tf.FixedLenFeature([], tf.string), 'content_str': tf.FixedLenFeature([], tf.string), 'content': tf.VarLenFeature(tf.int64), 'char': tf.VarLenFeature(tf.int64), 'label': tf.FixedLenFeature([], tf.int64), 'source': tf.FixedLenFeature([], tf.string), } features = tf.parse_single_example(example, features=features_dict) content = features['content'] content = melt.sparse_tensor_to_dense(content) features['content'] = content label = features['label'] #if FLAGS.use_char: chars = features['char'] chars = melt.sparse_tensor_to_dense(chars) # if FLAGS.char_min_count: # chars = melt.greater_then_set(chars, FLAGS.char_min_count, UNK_ID) features['char'] = chars x = features y = label return x, y
def _decode(example, parse):
  features = parse(example,
                   features={
                       'ltext_str': tf.FixedLenFeature([], tf.string),
                       'ltext': tf.VarLenFeature(tf.int64),
                       'rtext_str': tf.FixedLenFeature([], tf.string),
                       'rtext': tf.VarLenFeature(tf.int64),
                   })

  ltext = features['ltext']
  rtext = features['rtext']

  #for attention to be numerically stable, and since encoding does not affect speed, dynamic rnn encode just packs zeros at the end
  #input_maxlen = 0 if dynamic_batch_length else INPUT_TEXT_MAX_WORDS
  #lmaxlen = TEXT_MAX_WORDS
  #TODO... check the effect.. decomposable_nli uses masked softmax, so it must use dynamic batch length for all!
  lmaxlen = 0 if FLAGS.dynamic_batch_length else TEXT_MAX_WORDS
  ltext = melt.sparse_tensor_to_dense(ltext, lmaxlen)
  rmaxlen = 0 if FLAGS.dynamic_batch_length else TEXT_MAX_WORDS
  rtext = melt.sparse_tensor_to_dense(rtext, rmaxlen)

  ltext_str = features['ltext_str']
  rtext_str = features['rtext_str']

  #--HACK TODO just to keep the sequence the same as image_caption: image_name, image_feature, text, text_str
  return ltext_str, ltext, rtext, rtext_str
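
# The TODOs above ('make attention masked', the decomposable_nli note) point
# at the same fix: mask padded positions before the softmax instead of relying
# on dynamic batch lengths. A minimal sketch of that standard technique; names
# and shapes are illustrative, this is not the codebase's implementation.
def masked_softmax(scores, lengths):
  """Softmax over the last axis that ignores padded positions.

  scores:  [batch, time] attention logits
  lengths: [batch] true sequence lengths
  """
  mask = tf.sequence_mask(lengths, maxlen=tf.shape(scores)[1], dtype=tf.float32)
  # Push padded logits to a large negative value so they get ~0 probability.
  scores = scores + (1.0 - mask) * -1e9
  return tf.nn.softmax(scores)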
def _decode(example, parse, dynamic_batch_length):
  features = parse(example,
                   features={
                       'image_name': tf.FixedLenFeature([], tf.string),
                       'url': tf.FixedLenFeature([], tf.string),
                       'text_str': tf.FixedLenFeature([], tf.string),
                       'ct0_str': tf.FixedLenFeature([], tf.string),
                       'title_str': tf.FixedLenFeature([], tf.string),
                       'real_title_str': tf.FixedLenFeature([], tf.string),
                       'text': tf.VarLenFeature(tf.int64),
                       'ct0': tf.VarLenFeature(tf.int64),
                       'title': tf.VarLenFeature(tf.int64),
                       'real_title': tf.VarLenFeature(tf.int64),
                   })

  image_name = features['image_name']
  text = features['text']

  input_type = 'real_title'
  input_text = features[input_type]

  maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
  text = melt.sparse_tensor_to_dense(text, maxlen)
  #for attention to be numerically stable, and since encoding does not affect speed, dynamic rnn encode just packs zeros at the end
  #input_maxlen = 0 if dynamic_batch_length else INPUT_TEXT_MAX_WORDS
  input_maxlen = INPUT_TEXT_MAX_WORDS
  input_text = melt.sparse_tensor_to_dense(input_text, input_maxlen)

  text_str = features['text_str']
  input_text_str = features['{}_str'.format(input_type)]

  return image_name, text, text_str, input_text, input_text_str
def _decode(example, parse):
  features_dict = {
      'image_name': tf.FixedLenFeature([], tf.string),
      FLAGS.decode_name: tf.VarLenFeature(tf.int64),
      FLAGS.decode_str_name: tf.FixedLenFeature([], tf.string),
      'input_text': tf.VarLenFeature(tf.int64),
      'input_text_str': tf.FixedLenFeature([], tf.string),
  }

  if FLAGS.pre_calc_image_feature:
    features_dict[FLAGS.image_feature_name] = tf.FixedLenFeature([IMAGE_FEATURE_LEN], tf.float32)
  else:
    features_dict['image_data'] = tf.FixedLenFeature([], dtype=tf.string)

  features = parse(example, features=features_dict)

  image_name = features['image_name']
  if FLAGS.pre_calc_image_feature:
    image_feature = features[FLAGS.image_feature_name]
  else:
    image_feature = features['image_data']

  text_str = features[FLAGS.decode_str_name]
  text = features[FLAGS.decode_name]
  maxlen = 0 if FLAGS.dynamic_batch_length else TEXT_MAX_WORDS
  text = melt.sparse_tensor_to_dense(text, maxlen)

  input_text_str = features['input_text_str']
  input_text = features['input_text']
  input_maxlen = 0 if FLAGS.dynamic_batch_length else INPUT_TEXT_MAX_WORDS
  # use the input-side max length for input_text
  input_text = melt.sparse_tensor_to_dense(input_text, input_maxlen)

  return image_name, image_feature, text, text_str, input_text, input_text_str
def parser(self, example):
  features_dict = {
      'id': tf.FixedLenFeature([], tf.string),
      'url': tf.FixedLenFeature([], tf.string),
      'answer': tf.FixedLenFeature([], tf.int64),
      'answer_str': tf.FixedLenFeature([], tf.string),
      'query': tf.VarLenFeature(tf.int64),
      'query_str': tf.FixedLenFeature([], tf.string),
      'passage': tf.VarLenFeature(tf.int64),
      'passage_str': tf.FixedLenFeature([], tf.string),
      'candidate_neg': tf.VarLenFeature(tf.int64),
      'candidate_pos': tf.VarLenFeature(tf.int64),
      'alternatives': tf.FixedLenFeature([], tf.string),
      'candidates': tf.FixedLenFeature([], tf.string),
      'type': tf.FixedLenFeature([], tf.int64),
  }

  features = tf.parse_single_example(example, features=features_dict)

  query = features['query']
  passage = features['passage']
  candidate_neg = features['candidate_neg']
  candidate_pos = features['candidate_pos']

  query = melt.sparse_tensor_to_dense(query)
  passage = melt.sparse_tensor_to_dense(passage)
  candidate_neg = melt.sparse_tensor_to_dense(candidate_neg)
  candidate_pos = melt.sparse_tensor_to_dense(candidate_pos)

  features['query'] = query
  features['passage'] = passage
  # join the two segments; [1:] presumably drops the second segment's leading
  # start mark so it is not duplicated
  features['content'] = tf.concat([passage, query[1:]], 0)
  features['rcontent'] = tf.concat([query, passage[1:]], 0)
  features['candidate_neg'] = candidate_neg
  features['candidate_pos'] = candidate_pos

  answer = features['answer']

  x = features
  y = answer

  return x, y
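
# To make the content/rcontent construction concrete: assuming both sequences
# already carry start/end marks, the [1:] slice drops the second segment's
# leading start mark so the joined sequence keeps a single start. A toy
# illustration with hypothetical ids (1 = <S>, 2 = </S>):
#
#   passage = tf.constant([1, 7, 8, 2], dtype=tf.int64)   # <S> w w </S>
#   query   = tf.constant([1, 9, 2], dtype=tf.int64)      # <S> w </S>
#   content  = tf.concat([passage, query[1:]], 0)   # -> [1, 7, 8, 2, 9, 2]
#   rcontent = tf.concat([query, passage[1:]], 0)   # -> [1, 9, 2, 7, 8, 2]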
def _decode(example, parse, dynamic_batch_length):
  features = parse(example,
                   features={
                       'image_name': tf.FixedLenFeature([], tf.string),
                       'url': tf.FixedLenFeature([], tf.string),
                       'text_str': tf.FixedLenFeature([], tf.string),
                       'ct0_str': tf.FixedLenFeature([], tf.string),
                       'title_str': tf.FixedLenFeature([], tf.string),
                       'real_title_str': tf.FixedLenFeature([], tf.string),
                       'text': tf.VarLenFeature(tf.int64),
                       'ct0': tf.VarLenFeature(tf.int64),
                       'title': tf.VarLenFeature(tf.int64),
                       'real_title': tf.VarLenFeature(tf.int64),
                   })

  image_name = features['image_name']
  text = features['text']

  input_type = 'real_title'
  input_text = features[input_type]

  maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
  text = melt.sparse_tensor_to_dense(text, maxlen)
  text, _ = melt.pad(text,
                     start_id=get_decoder_start_id(),
                     end_id=get_decoder_end_id())

  input_maxlen = 0 if dynamic_batch_length else INPUT_TEXT_MAX_WORDS
  # use the input-side max length for input_text
  input_text = melt.sparse_tensor_to_dense(input_text, input_maxlen)
  input_text, _ = melt.pad(input_text,
                           start_id=(vocabulary.vocab.start_id() if FLAGS.encode_start_mark else None),
                           end_id=(encoder_end_id if FLAGS.encode_end_mark else None))

  text_str = features['text_str']
  input_text_str = features['{}_str'.format(input_type)]

  return image_name, text, text_str, input_text, input_text_str
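
# melt.pad above returns a (sequence, length) pair after optionally attaching
# start/end marks. Its real implementation lives in melt; the sketch below is
# only an assumption about its single-example behavior, for reference.
def _pad_with_marks_sketch(text, start_id=None, end_id=None):
  """Assumed behavior of melt.pad for a 1-d int64 sequence."""
  parts = [text]
  if start_id is not None:
    parts.insert(0, tf.constant([start_id], dtype=tf.int64))
  if end_id is not None:
    parts.append(tf.constant([end_id], dtype=tf.int64))
  if len(parts) > 1:
    text = tf.concat(parts, 0)
  return text, tf.size(text)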
def _decode_neg(example, parse, dynamic_batch_length):
  features = parse(example,
                   features={
                       'text': tf.VarLenFeature(tf.int64),
                       'text_str': tf.FixedLenFeature([], tf.string),
                   })

  text = features['text']
  maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
  text = melt.sparse_tensor_to_dense(text, maxlen)

  text_str = features['text_str']

  return text, text_str
def _decode_neg(example, parse):
  features = parse(example,
                   features={
                       FLAGS.decode_name: tf.VarLenFeature(tf.int64),
                       FLAGS.decode_str_name: tf.FixedLenFeature([], tf.string),
                   })

  text = features[FLAGS.decode_name]
  maxlen = 0 if FLAGS.dynamic_batch_length else TEXT_MAX_WORDS
  text = melt.sparse_tensor_to_dense(text, maxlen)

  text_str = features[FLAGS.decode_str_name]

  return text, text_str
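
# The decoders above all lean on melt.sparse_tensor_to_dense(sp, maxlen),
# where maxlen == 0 appears to mean 'keep the dynamic length'. A rough sketch
# of the assumed single-example semantics (the real helper lives in melt and
# may differ in details):
def _sparse_to_dense_sketch(sp, maxlen=0):
  """Densify a VarLenFeature; if maxlen > 0, truncate/right-pad to fixed width."""
  dense = tf.sparse_tensor_to_dense(sp)  # zero-filled dense tensor
  if maxlen:
    dense = dense[:maxlen]
    dense = tf.pad(dense, [[0, maxlen - tf.shape(dense)[0]]])
  return dense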
def parser(self, example): """Parses a single tf.Example into image and label tensors.""" features_dict = { 'content_str': tf.FixedLenFeature([], tf.string), 'content': tf.VarLenFeature(tf.int64), 'char': tf.VarLenFeature(tf.int64), 'source': tf.FixedLenFeature([], tf.string), } #if FLAGS.use_char: #features_dict['chars'] = tf.VarLenFeature(tf.int64) features = tf.parse_single_example(example, features=features_dict) content = features['content'] content = melt.sparse_tensor_to_dense(content) # if FLAGS.add_start_end: # content = tf.concat([tf.constant([vocabulary.start_id()], dtype=tf.int64), content, tf.constant([vocabulary.end_id()], dtype=tf.int64)], 0) # NOTICE! not work in dataset... so put to later step like in call but should do the same thing again for pytorch.. # if FLAGS.vocab_min_count: # content = melt.greater_then_set(content, FLAGS.vocab_min_count, UNK_ID) features['content'] = content #if FLAGS.use_char: chars = features['char'] chars = melt.sparse_tensor_to_dense(chars) # if FLAGS.char_min_count: # chars = melt.greater_then_set(chars, FLAGS.char_min_count, UNK_ID) features['char'] = chars features['id'] = tf.constant(0, dtype=tf.int64) x = features y = x['content'] return x, y
def _decode(example, parse, dynamic_batch_length):
  features = parse(example,
                   features={
                       'image_name': tf.FixedLenFeature([], tf.string),
                       'image_feature': tf.FixedLenFeature([IMAGE_FEATURE_LEN], tf.float32),
                       'text': tf.VarLenFeature(tf.int64),
                       'text_str': tf.FixedLenFeature([], tf.string),
                   })

  image_name = features['image_name']
  image_feature = features['image_feature']

  text = features['text']
  maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
  text = melt.sparse_tensor_to_dense(text, maxlen)

  text_str = features['text_str']

  return image_name, image_feature, text, text_str
def parser(self, example):
  comment_key = 'comment'

  features_dict = {
      'id': tf.FixedLenFeature([], tf.string),
      'comment_str': tf.FixedLenFeature([], tf.string),
      'comment_tokens_str': tf.FixedLenFeature([], tf.string),
      comment_key: tf.VarLenFeature(tf.int64),
      'comment_chars': tf.VarLenFeature(tf.int64),
      'comment_ngrams': tf.VarLenFeature(tf.int64),
      #'comment_fngrams': tf.VarLenFeature(tf.int64),
      'simple_chars': tf.VarLenFeature(tf.int64),
      #'simple_ngrams': tf.VarLenFeature(tf.int64),
      'tokens_info': tf.VarLenFeature(tf.float32),
      #'comment_info': tf.VarLenFeature(tf.float32),
      #'comment_info': tf.FixedLenFeature([NUM_COMMENT_FEATURES], tf.float32),
      'pos': tf.VarLenFeature(tf.int64),
      'tag': tf.VarLenFeature(tf.int64),
      'ner': tf.VarLenFeature(tf.int64),
      'classes': tf.FixedLenFeature([NUM_CLASSES], tf.float32),
      #'weight': tf.FixedLenFeature([1], tf.float32),
  }

  # # support weight from v17, but notice tokens changed from v16
  # if not ('TOXIC_VERSION' in os.environ and int(os.environ['TOXIC_VERSION']) <= 16):
  #   features_dict['weight'] = tf.FixedLenFeature([1], tf.float32)

  # if FLAGS.use_word:
  #   features_dict['comment'] = tf.VarLenFeature(tf.int64)
  # if FLAGS.use_char:
  #   features_dict['comment_chars'] = tf.VarLenFeature(tf.int64)
  # if FLAGS.use_simple_char:
  #   features_dict['simple_chars'] = tf.VarLenFeature(tf.int64)
  # if FLAGS.use_token_info:
  #   features_dict['tokens_info'] = tf.VarLenFeature(tf.float32)
  # if FLAGS.use_pos:
  #   features_dict['pos'] = tf.VarLenFeature(tf.int64)
  # if FLAGS.use_tag:
  #   features_dict['tag'] = tf.VarLenFeature(tf.int64)
  # if FLAGS.use_ner:
  #   features_dict['ner'] = tf.VarLenFeature(tf.int64)

  features = tf.parse_single_example(example, features=features_dict)

  id = features['id']

  comment = None
  comment_chars = None
  simple_chars = None
  tokens_info = None
  pos = None
  tag = None
  ner = None

  # 'weight' is commented out of the parsed features above, so fall back to 1
  try:
    weight = features['weight'][0]
  except Exception:
    weight = tf.constant([1.])

  #----var len features
  #if FLAGS.use_word:
  comment = features[comment_key]
  comment = melt.sparse_tensor_to_dense(comment)
  features[comment_key] = comment

  #if FLAGS.use_char:
  comment_chars = features['comment_chars']
  comment_chars = melt.sparse_tensor_to_dense(comment_chars)
  features['comment_chars'] = comment_chars

  #if FLAGS.use_token_info:
  tokens_info = features['tokens_info']
  tokens_info = melt.sparse_tensor_to_dense(tokens_info)
  features['tokens_info'] = tokens_info

  #comment_info = features['comment_info']
  #comment_info = melt.sparse_tensor_to_dense(comment_info)

  classes = features['classes']
  comment_str = features['comment_str']
  comment_tokens_str = features['comment_tokens_str']

  #----------- simple chars (per whole comment), 'what a pity' -> 'w|h|a|t| |a| |p|i|t|y'
  # TODO simple char can change to use ngram model seq or sum ngram
  #if FLAGS.use_simple_char:
  simple_chars = features['simple_chars']
  simple_chars = melt.sparse_tensor_to_dense(simple_chars)
  features['simple_chars'] = simple_chars

  #simple_ngrams = features['simple_ngrams']
  #simple_ngrams = melt.sparse_tensor_to_dense(simple_ngrams)

  #if FLAGS.use_pos:
  pos = features['pos']
  pos = melt.sparse_tensor_to_dense(pos)
  features['pos'] = pos

  tag = features['tag']
  tag = melt.sparse_tensor_to_dense(tag)
  features['tag'] = tag

  ner = features['ner']
  ner = melt.sparse_tensor_to_dense(ner)
  features['ner'] = ner

  comment_ngrams = features['comment_ngrams']
  comment_ngrams = melt.sparse_tensor_to_dense(comment_ngrams)
  features['comment_ngrams'] = comment_ngrams

  # comment_fngrams = features['comment_fngrams']
  # comment_fngrams = melt.sparse_tensor_to_dense(comment_fngrams)
  char_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))

  #--- will this be slow? then slice after padding?
  #--- notice shapes here are 1-d (shape (,)), since this is parse_single_example; batching happens later in dataset.padded_batch
  #--- not used much actually; just limit the max length when building tfrecords (for toxic we can not limit)
  #--- then when training use a bucket method like buckets=[400] and it will be fine
  #--- limiting length might better be done when generating the tfrecords
  limit = FLAGS.comment_limit if self.subset == 'train' else FLAGS.test_comment_limit
  if limit:
    comment = comment[:limit]
    comment_chars = comment_chars[:limit * FLAGS.char_limit]
    tokens_info = tokens_info[:limit * len(attribute_names)]
    if FLAGS.use_pos:
      pos = pos[:limit]
      tag = tag[:limit]
      ner = ner[:limit]

  if FLAGS.use_pos:
    pos_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'pos_vocab.txt'))
    tag_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'tag_vocab.txt'))
    ner_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ner_vocab.txt'))

    def append_start_end_mark(tag, start, end):
      tag_list = [tag]
      # if FLAGS.encode_start_mark:
      #   tag_list.insert(0, tf.constant([start], dtype=tf.int64))
      # if FLAGS.encode_end_mark:
      #   tag_list.append(tf.constant([end], dtype=tf.int64))
      if len(tag_list) > 1:
        tag = tf.concat(tag_list, 0)
      return tag

    pos = append_start_end_mark(pos, pos_vocab.start_id(), pos_vocab.end_id())
    tag = append_start_end_mark(tag, tag_vocab.start_id(), tag_vocab.end_id())
    ner = append_start_end_mark(ner, ner_vocab.start_id(), ner_vocab.end_id())

  #-----------comment: deal with start/end mark
  comment_list = [comment]
  # if FLAGS.encode_start_mark:
  #   logging.info('add encode start mark')
  #   comment_list.insert(0, tf.constant([vocabulary.start_id()], dtype=tf.int64))
  # if FLAGS.encode_end_mark:
  #   logging.info('add encode end mark')
  #   comment_list.append(tf.constant([vocabulary.end_id()], dtype=tf.int64))
  if len(comment_list) > 1:
    comment = tf.concat(comment_list, 0)

  char_comment_limit = FLAGS.comment_limit if FLAGS.save_char else 1

  #----------deal with tokens info
  # TODO tokens embedding ? maybe
  if FLAGS.use_token_info:
    tokens_info_list = [tokens_info]
    # if FLAGS.encode_start_mark:
    #   tokens_info_list.insert(0, tf.constant(attribute_default_values, dtype=tf.float32))
    # if FLAGS.encode_end_mark:
    #   tokens_info_list.append(tf.constant(attribute_default_values, dtype=tf.float32))
    if len(tokens_info_list) > 1:
      tokens_info = tf.concat(tokens_info_list, 0)

  #---------comment chars
  if FLAGS.use_char:
    comment_chars_list = [comment_chars]
    # if FLAGS.encode_start_mark:
    #   #comment_chars_list.insert(0, tf.ones([FLAGS.char_limit], dtype=tf.int64))
    #   # TODO below: indices[15794,0] = 593 is not in [0, 593), because merge_char_emb saved no start and end mark
    #   # Will change to use below next time merge-char-emb adds start and end marks
    #   comment_chars_list.insert(0, tf.scatter_nd(tf.constant([[0]]), tf.constant([char_vocab.start_id()], dtype=tf.int64), tf.constant([FLAGS.char_limit])))
    # if FLAGS.encode_end_mark:
    #   #comment_chars_list.append(tf.ones([FLAGS.char_limit], dtype=tf.int64))
    #   comment_chars_list.append(tf.scatter_nd(tf.constant([[0]]), tf.constant([char_vocab.end_id()], dtype=tf.int64), tf.constant([FLAGS.char_limit])))
    if len(comment_chars_list) > 1:
      comment_chars = tf.concat(comment_chars_list, 0)

  #---------comment ngrams
  if FLAGS.use_ngrams:
    ngram_vocab = gezi.Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ngram_vocab.txt'))
    comment_ngrams_list = [comment_ngrams]
    # if FLAGS.encode_start_mark:
    #   comment_ngrams_list.insert(0, tf.scatter_nd(tf.constant([[0]]), tf.constant([ngram_vocab.start_id()], dtype=tf.int64), tf.constant([FLAGS.char_limit])))
    # if FLAGS.encode_end_mark:
    #   comment_ngrams_list.append(tf.scatter_nd(tf.constant([[0]]), tf.constant([ngram_vocab.end_id()], dtype=tf.int64), tf.constant([FLAGS.char_limit])))
    if len(comment_ngrams_list) > 1:
      comment_ngrams = tf.concat(comment_ngrams_list, 0)

  simple_chars_list = [simple_chars]
  # if FLAGS.encode_start_mark:
  #   simple_chars_list.insert(0, tf.constant([char_vocab.start_id()], dtype=tf.int64))
  # if FLAGS.encode_end_mark:
  #   simple_chars_list.append(tf.constant([char_vocab.end_id()], dtype=tf.int64))
  if len(simple_chars_list) > 1:
    simple_chars = tf.concat(simple_chars_list, 0)

  features[comment_key] = comment
  features['comment_chars'] = comment_chars
  features['simple_chars'] = simple_chars
  features['comment_ngrams'] = comment_ngrams

  x = features
  y = classes

  return x, y
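
# The commented-out start/end-mark code above builds its mark row with
# tf.scatter_nd so the row matches the [char_limit]-wide layout of one token's
# char ids and can be concatenated in front of (or after) comment_chars.
# A standalone illustration; the values are hypothetical:
#
#   char_limit = 16  # hypothetical FLAGS.char_limit
#   start_id = 2     # hypothetical char_vocab.start_id()
#   # -> a [char_limit] int64 row: start_id in slot 0, zero padding elsewhere
#   row = tf.scatter_nd(tf.constant([[0]]),
#                       tf.constant([start_id], dtype=tf.int64),
#                       tf.constant([char_limit]))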
def sparse2dense_(features, key):
  val = features[key]
  val = melt.sparse_tensor_to_dense(val)
  features[key] = val
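
# Typical use: densify several VarLen features in place. The keys below are
# hypothetical, for illustration only.
#
#   for key in ('content', 'char', 'pos'):
#     sparse2dense_(features, key)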
def parser(self, example):
  features_dict = {
      'id': tf.FixedLenFeature([], tf.string),
      'url': tf.FixedLenFeature([], tf.string),
      'answer': tf.FixedLenFeature([], tf.int64),
      'answer_str': tf.FixedLenFeature([], tf.string),
      'query': tf.VarLenFeature(tf.int64),
      'query_char': tf.VarLenFeature(tf.int64),
      'query_pos': tf.VarLenFeature(tf.int64),
      'query_str': tf.FixedLenFeature([], tf.string),
      'passage': tf.VarLenFeature(tf.int64),
      'passage_char': tf.VarLenFeature(tf.int64),
      'passage_pos': tf.VarLenFeature(tf.int64),
      'passage_str': tf.FixedLenFeature([], tf.string),
      'candidate_neg': tf.VarLenFeature(tf.int64),
      'candidate_neg_char': tf.VarLenFeature(tf.int64),
      'candidate_neg_pos': tf.VarLenFeature(tf.int64),
      'candidate_pos': tf.VarLenFeature(tf.int64),
      'candidate_pos_char': tf.VarLenFeature(tf.int64),
      'candidate_pos_pos': tf.VarLenFeature(tf.int64),
      'candidate_na': tf.VarLenFeature(tf.int64),
      'candidate_na_char': tf.VarLenFeature(tf.int64),
      'candidate_na_pos': tf.VarLenFeature(tf.int64),
      'alternatives': tf.FixedLenFeature([], tf.string),
      'candidates': tf.FixedLenFeature([], tf.string),
      'type': tf.FixedLenFeature([], tf.int64),
  }

  features = tf.parse_single_example(example, features=features_dict)

  query = features['query']
  passage = features['passage']
  candidate_neg = features['candidate_neg']
  candidate_pos = features['candidate_pos']
  candidate_na = features['candidate_na']

  query = melt.sparse_tensor_to_dense(query)
  passage = melt.sparse_tensor_to_dense(passage)
  candidate_neg = melt.sparse_tensor_to_dense(candidate_neg)
  candidate_pos = melt.sparse_tensor_to_dense(candidate_pos)
  candidate_na = melt.sparse_tensor_to_dense(candidate_na)

  def s2d(name):
    x = features[name]
    x = melt.sparse_tensor_to_dense(x)
    features[name] = x

  l = ['query_char', 'query_pos',
       'passage_char', 'passage_pos',
       'candidate_neg_char', 'candidate_neg_pos',
       'candidate_pos_char', 'candidate_pos_pos',
       'candidate_na_char', 'candidate_na_pos']
  for name in l:
    s2d(name)

  # def add_start_end(text):
  #   return tf.concat([tf.constant([vocabulary.start_id()], dtype=tf.int64), text, tf.constant([vocabulary.end_id()], dtype=tf.int64)], 0)

  # if FLAGS.add_start_end:
  #   query = add_start_end(query)
  features['query'] = query
  # if FLAGS.add_start_end:
  #   passage = add_start_end(passage)
  features['passage'] = passage

  # if not FLAGS.add_start_end:
  #   features['content'] = tf.concat([passage, tf.constant([vocabulary.end_id()], dtype=tf.int64), query], 0)
  #   features['rcontent'] = tf.concat([query, tf.constant([vocabulary.end_id()], dtype=tf.int64), passage], 0)
  # else:
  features['content'] = tf.concat([passage, query[1:]], 0)
  features['rcontent'] = tf.concat([query, passage[1:]], 0)

  # if FLAGS.add_start_end:
  #   candidate_neg = add_start_end(candidate_neg)
  features['candidate_neg'] = candidate_neg
  # if FLAGS.add_start_end:
  #   candidate_pos = add_start_end(candidate_pos)
  features['candidate_pos'] = candidate_pos
  features['candidate_na'] = candidate_na

  answer = features['answer']

  x = features
  y = answer

  return x, y
def parse(self, example): """Parses a single tf.Example into image and label tensors.""" features_dict = { 'id': tf.FixedLenFeature([], tf.string), 'content_str': tf.FixedLenFeature([], tf.string), 'content': tf.VarLenFeature(tf.int64), 'char': tf.VarLenFeature(tf.int64), 'pos': tf.VarLenFeature(tf.int64), 'ner': tf.VarLenFeature(tf.int64), 'wlen': tf.VarLenFeature(tf.int64), #'label': tf.FixedLenFeature([NUM_ATTRIBUTES], tf.int64), 'source': tf.FixedLenFeature([], tf.string), } if FLAGS.use_soft_label: features_dict['label'] = tf.FixedLenFeature( [NUM_ATTRIBUTES * NUM_CLASSES], tf.float32) else: features_dict['label'] = tf.FixedLenFeature([NUM_ATTRIBUTES], tf.int64) #if FLAGS.use_char: #features_dict['chars'] = tf.VarLenFeature(tf.int64) features = tf.parse_single_example(example, features=features_dict) content = features['content'] content = melt.sparse_tensor_to_dense(content) # Actually not use below, for bert now use nbert tfrecords which is [first_n and last_m] so do not need content_limt 512 here if FLAGS.content_limit: # TODO now only condider bert.. whey content[0] or content[:0] content[-1] not work ? FIXME.. start_id = vocabulary.start_id( ) if not FLAGS.model == 'Transformer' else 101 end_id = vocabulary.end_id( ) if not FLAGS.model == 'Transformer' else 102 # TODO now has problem ... one additional end or start... if not FLAGS.cut_front: content = tf.concat([ content[:FLAGS.content_limit - 1], tf.constant([end_id], dtype=tf.int64) ], 0) else: content = tf.concat([ tf.constant([start_id], dtype=tf.int64), content[-FLAGS.content_limit + 1:] ], 0) # if FLAGS.add_start_end: # content = tf.concat([tf.constant([vocabulary.start_id()], dtype=tf.int64), content, tf.constant([vocabulary.end_id()], dtype=tf.int64)], 0) # NOTICE! not work in dataset... so put to later step like in call but should do the same thing again for pytorch.. ## TODO can use below to do unk aug so not to have different code for tf and pytorch later # if FLAGS.vocab_min_count: # # content = melt.greater_then_set(content, FLAGS.vocab_min_count, UNK_ID) features['content'] = content label = features['label'] #if FLAGS.use_char: chars = features['char'] chars = melt.sparse_tensor_to_dense(chars) # if FLAGS.char_min_count: # chars = melt.greater_then_set(chars, FLAGS.char_min_count, UNK_ID) features['char'] = chars pos = features['pos'] pos = melt.sparse_tensor_to_dense(pos) # if FLAGS.tag_min_count: # pos = melt.greater_then_set(pos, FLAGS.tag_min_count, UNK_ID) features['pos'] = pos ner = features['ner'] ner = melt.sparse_tensor_to_dense(ner) # if FLAGS.tag_min_count: # ner = melt.greater_then_set(ner, FLAGS.tag_min_count, UNK_ID) features['ner'] = ner wlen = features['wlen'] wlen = melt.sparse_tensor_to_dense(wlen) features['wlen'] = wlen x = features if not FLAGS.use_soft_label: y = label + 2 if FLAGS.binary_class_index is not None: y = tf.to_int64(tf.equal(y, FLAGS.binary_class_index)) else: y = label return x, y
def s2d(name):
  # densify a VarLenFeature in place; closes over `features` from the enclosing scope
  x = features[name]
  x = melt.sparse_tensor_to_dense(x)
  features[name] = x