def deal_file(file, thread_index):
  # NOTE: the original signature also took a `writer` argument but shadowed it
  # immediately; each thread creates its own writer, so the parameter is dropped.
  writer = melt.tfrecords.Writer(
      '{}/{}_{}'.format(FLAGS.output_directory, FLAGS.name, thread_index))
  num = 0
  for line in open(file):
    l = line.rstrip().split('\t')
    img = l[0]
    img_feature = [float(x) for x in l[1:1001]]
    text = l[-1].split('\x01')[0]
    words = Segmentor.Segment(text)
    word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word)]
    if len(word_ids) == 0:
      continue
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
    example = tf.train.Example(features=tf.train.Features(feature={
        'image_name': melt.bytes_feature(img),
        'image_feature': melt.float_feature(img_feature),
        'text': melt.int_feature(word_ids),
        'text_str': melt.bytes_feature(text),
        }))
    writer.write(example)
    num += 1
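# All of the converters in this file lean on small melt.* wrappers around
# tf.train.Feature. A minimal sketch of what they are assumed to look like --
# the real melt implementations may differ (both melt.int_feature and
# melt.int64_feature are used below and presumably share this shape). Each
# wrapper accepts either a scalar or a list:
import tensorflow as tf

def _bytes_feature_sketch(value):
  if not isinstance(value, (list, tuple)):
    value = [value]
  value = [v.encode('utf-8') if isinstance(v, str) else v for v in value]
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def _int64_feature_sketch(value):
  if not isinstance(value, (list, tuple)):
    value = [value]
  return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _float_feature_sketch(value):
  if not isinstance(value, (list, tuple)):
    value = [value]
  return tf.train.Feature(float_list=tf.train.FloatList(value=value))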
def main(argv):
  writer = tf.python_io.TFRecordWriter(argv[2])
  num = 0
  for line in open(argv[1]):
    if line[0] == '#':
      continue
    if num % 10000 == 0:
      print('%d lines done' % num)
    l = line.rstrip().split()
    label_index = 0
    if l[0][0] == '_':
      label_index = 1
      id = int(l[0][1:])
    else:
      id = num
    label = int(l[label_index])
    start = label_index + 1
    feature = [float(x) for x in l[start:]]
    example = tf.train.Example(features=tf.train.Features(feature={
        'id': melt.int_feature(id),
        'label': melt.int_feature(label),
        'feature': melt.float_feature(feature),
        }))
    writer.write(example.SerializeToString())
    num += 1
    if FLAGS.num_examples and num == FLAGS.num_examples:
      break
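# Hedged sketch (not from the original code): how the records written by
# main() above could be parsed back with the TF 1.x API. The feature vector
# length varies per dataset, so it is read as a VarLenFeature; use
# tf.FixedLenFeature([DIM], tf.float32) instead when the dimension is known.
def parse_dense_example(serialized):
  parsed = tf.parse_single_example(
      serialized,
      features={
          'id': tf.FixedLenFeature([], tf.int64),
          'label': tf.FixedLenFeature([], tf.int64),
          'feature': tf.VarLenFeature(tf.float32),
      })
  feature = tf.sparse_tensor_to_dense(parsed['feature'])
  return parsed['id'], parsed['label'], feature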
def deal_file(file, writer):
  num = 0
  for line in open(file):
    if num % 1000 == 0:
      print('num:', num)
    l = line.rstrip().split('\t')
    img = l[0]
    img_feature = [float(x) for x in l[1:1001]]
    text = l[-1].split('\x01')[0]
    words = Segmentor.Segment(text)
    word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word)]
    if len(word_ids) == 0:
      num += 1
      continue
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
    texts.append(word_ids)
    text_strs.append(text)
    example = tf.train.Example(features=tf.train.Features(feature={
        'image_name': melt.bytes_feature(img),
        'image_feature': melt.float_feature(img_feature),
        'text': melt.int_feature(word_ids),
        'text_str': melt.bytes_feature(text),
        }))
    #writer.write(example.SerializeToString())
    writer.write(example)  # melt Writer serializes internally
    num += 1
def build_features(infile):
  global counter
  ofile = get_out_file(infile)
  print('----------writing to', ofile)
  with melt.tfrecords.Writer(ofile) as writer:
    for line in tqdm(open(infile)):
      fields = line.rstrip().split('\t')
      if len(fields) > 4:
        label = int(fields[0])
        id = '{}\t{}'.format(fields[2], fields[3])
        feat_id, feat_field, feat_value = dataset.get_feat_set(fields)
        assert len(feat_id) == len(feat_value)
        assert len(feat_id) == len(feat_field)
        feature = {
            'label': melt.int64_feature(label),
            'id': melt.bytes_feature(id),
            'index': melt.int64_feature(feat_id),
            'field': melt.int64_feature(feat_field),
            'value': melt.float_feature(feat_value)
        }
        record = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(record)
        with counter.get_lock():
          counter.value += 1
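# The shared counters used throughout these converters (counter /
# record_counter, max_num_words, sum_words, image_counter) are assumed to be
# multiprocessing.Value objects, so worker processes can update them under a
# lock. A minimal setup sketch; `input_files` and the Pool wiring are
# illustrative, not from the original:
from multiprocessing import Pool, Value

counter = Value('i', 0)
max_num_words = Value('i', 0)
sum_words = Value('i', 0)

#pool = Pool(FLAGS.threads)
#pool.map(build_features, input_files)
#pool.close()
#pool.join()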
def deal_file(file, thread_index):
  global counter, max_num_words, sum_words
  out_file = '{}/{}_{}'.format(
      FLAGS.output_directory, FLAGS.name,
      thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
          FLAGS.output_directory, FLAGS.name)
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip().split('\t')
      img = l[0]
      img_end = IMAGE_FEATURE_LEN + 1
      img_feature = [float(x) for x in l[1:img_end]]
      texts = [x.split('\x01')[0] for x in l[img_end:]]
      for text in texts:
        #@TODO text -> ids should move out so online code can share it for
        #evaluation or for the feed dict
        words = segmentor.Segment(text, FLAGS.seg_method)
        word_ids = [
            vocabulary.id(word) for word in words
            if vocabulary.has(word) or ENCODE_UNK
        ]
        word_ids_length = len(word_ids)
        if len(word_ids) == 0:
          continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
        if FLAGS.np_save:
          gtexts[thread_index].append(word_ids)
          gtext_strs[thread_index].append(text)
        assert img and img_feature and word_ids and text, line
        assert len(img_feature) == IMAGE_FEATURE_LEN
        #add pos info? weight info? or @TODO add click num info
        example = tf.train.Example(features=tf.train.Features(feature={
            'image_name': melt.bytes_feature(img),
            'image_feature': melt.float_feature(img_feature),
            'text': melt.int_feature(word_ids),
            'text_str': melt.bytes_feature(text),
            }))
        writer.write(example)
        with counter.get_lock():
          counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length
      num += 1
  texts_dict[thread_index] = gtexts[thread_index]
  text_strs_dict[thread_index] = gtext_strs[thread_index]
def _parse_line(line, writer, thread_index=0):
  global counter, max_num_words, sum_words
  l = line.rstrip().split('\t')
  image_name = l[0]
  image_feature = [float(x) for x in l[1:]]
  if image_name not in text_map:
    print('image %s ignore' % image_name)
    return
  for text, ori_text in text_map[image_name]:
    word_ids = [
        vocabulary.id(word) for word in text.split(WORDS_SEP)
        if vocabulary.has(word)
    ]
    if not word_ids:
      continue
    word_ids_length = len(word_ids)
    word_ids = word_ids[:TEXT_MAX_WORDS]
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)
    if FLAGS.np_save:
      gtexts[thread_index].append(word_ids)
      gtext_strs[thread_index].append(ori_text)
    example = tf.train.Example(features=tf.train.Features(feature={
        'image_name': melt.bytes_feature(image_name),
        'image_feature': melt.float_feature(image_feature),
        'text': melt.int_feature(word_ids),
        'text_str': melt.bytes_feature(ori_text),
        }))
    #NOTICE not tested here for num_threads > 1
    if FLAGS.num_records:
      if image_name not in images:
        images[image_name] = 1
        print(image_name, len(images))
        writer.write(example.SerializeToString())
        if len(images) == FLAGS.num_records:
          print('Done')
          exit(1)
    else:
      writer.write(example.SerializeToString())
    with counter.get_lock():
      counter.value += 1
    if word_ids_length > max_num_words.value:
      with max_num_words.get_lock():
        max_num_words.value = word_ids_length
    with sum_words.get_lock():
      sum_words.value += word_ids_length
def main(argv):
  writer = tf.python_io.TFRecordWriter(argv[2])
  num = 0
  for line in open(argv[1]):
    if line[0] == '#':
      continue
    if num % 10000 == 0:
      print('%d lines done' % num)
    l = line.rstrip().split()
    label_index = 0
    if l[0][0] == '_':
      label_index = 1
      id = int(l[0][1:])
    else:
      id = num
    label = int(l[label_index])
    start = label_index + 1
    feature = [float(x) for x in l[start:]]
    if FLAGS.fake_var_len:
      if id % 2 == 0:
        feature = feature[:10]
      if id % 3 == 0:
        feature = feature[:20]
    example = tf.train.SequenceExample(
        context=melt.features({
            'id': melt.int_feature(id),
            'label': melt.int_feature(label)
        }),
        feature_lists=melt.feature_lists({
            #see sequence_test.py: wrap each single value as a Feature and
            #stack all of them -- can this deal with var len sequences?
            'feature': melt.feature_list(
                [melt.float_feature(item) for item in feature])
            #'feature': melt.feature_list(melt.float_feature(feature))
        }))
    writer.write(example.SerializeToString())
    num += 1
    if FLAGS.num_examples and num == FLAGS.num_examples:
      break
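# Hedged sketch (not in the original): parsing the variable-length
# SequenceExample written above with the TF 1.x API, which answers the
# question in the comment -- var-len sequences parse fine this way. Each time
# step holds a single float, so per-step shape [] yields a [sequence_length]
# tensor.
def parse_sequence_example(serialized):
  context, sequence = tf.parse_single_sequence_example(
      serialized,
      context_features={
          'id': tf.FixedLenFeature([], tf.int64),
          'label': tf.FixedLenFeature([], tf.int64),
      },
      sequence_features={
          'feature': tf.FixedLenSequenceFeature([], tf.float32),
      })
  return context['id'], context['label'], sequence['feature']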
def deal_file(file, thread_index):
  out_file = '{}/{}_{}'.format(
      FLAGS.output_directory, FLAGS.name,
      thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
          FLAGS.output_directory, FLAGS.name)
  # use the writer as a context manager so the file is flushed and closed
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip().split('\t')
      img = l[0]
      img_end = IMAGE_FEATURE_LEN + 1
      img_feature = [float(x) for x in l[1:img_end]]
      texts = [x.split('\x01')[0] for x in l[img_end:]]
      for text in texts:
        words = Segmentor.Segment(text)
        word_ids = [
            vocabulary.id(word) for word in words if vocabulary.has(word)
        ]
        if len(word_ids) == 0:
          continue
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
        gtexts[thread_index].append(word_ids)
        gtext_strs[thread_index].append(text)
        #add pos info? weight info? or @TODO add click num info
        example = tf.train.Example(features=tf.train.Features(feature={
            'image_name': melt.bytes_feature(img),
            'image_feature': melt.float_feature(img_feature),
            'text': melt.int_feature(word_ids),
            'text_str': melt.bytes_feature(text),
            }))
        writer.write(example)
      num += 1
  texts_dict[thread_index] = gtexts[thread_index]
  text_strs_dict[thread_index] = gtext_strs[thread_index]
# Fragment: body of the per-text loop of another converter; `num`, `words`,
# `img`, `img_feature`, `text`, `writer` and `count` come from the enclosing
# scope.
word_ids_length = len(word_ids)
if num % 1000 == 0:
  #print(libgezi.gbk2utf8('\t'.join(words)), file=sys.stderr)
  print('\t'.join(words), file=sys.stderr)
  print(word_ids, file=sys.stderr)
if len(word_ids) == 0:
  continue
word_ids = word_ids[:TEXT_MAX_WORDS]
if FLAGS.pad:
  word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
if writer is not None:
  example = tf.train.Example(features=tf.train.Features(feature={
      'image_name': melt.bytes_feature(img),
      'image_feature': melt.float_feature(img_feature),
      'text': melt.int_feature(word_ids),
      'text_str': melt.bytes_feature(text),
      }))
  writer.write(example)
else:
  count += 1

# After the loop: report the record count.
if FLAGS.mode != 1:
  if writer is not None:
    count = writer.count
  print('count\t%d' % count, file=sys.stderr)
  #--------for calc total count
  print('count\t%d' % count)
#do not forget to close the writer! NOTICE
def deal_file(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip('\n').split('\t')
      img = l[0]
      texts = l[FLAGS.text_index].split('\x01')
      image_feature = [
          float(x)
          for x in l[FLAGS.image_feature_index].strip().split('\x01')
      ]
      #assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (img, len(image_feature))
      if len(image_feature) != IMAGE_FEATURE_LEN:
        print('bad line:', line)
        continue
      input_texts = l[FLAGS.input_text_index].split('\x01')
      for input_text in input_texts:
        input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
        input_word_ids = text2ids.words2ids(
            input_words,
            feed_single=FLAGS.feed_single,
            allow_all_zero=True,
            pad=False)
        if len(input_word_ids) == 0:
          continue
        input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
        if FLAGS.pad:
          input_word_ids = gezi.pad(input_word_ids)
        is_top_text = True
        for text in texts:
          if text.strip() == '':
            continue
          words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
          word_ids = text2ids.words2ids(
              words,
              feed_single=FLAGS.feed_single,
              allow_all_zero=True,
              pad=False)
          word_ids_length = len(word_ids)
          if num % 1000 == 0:
            print(img, text, word_ids, text2ids.ids2text(word_ids),
                  len(image_feature), file=sys.stderr)
          if word_ids_length == 0:
            continue
          #"luanma" means garbled/mojibake text
          if is_luanma(words, word_ids):
            print('luanma', img, text, word_ids, text2ids.ids2text(word_ids),
                  len(image_feature), file=sys.stderr)
            continue
          word_ids = word_ids[:TEXT_MAX_WORDS]
          if FLAGS.pad:
            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
          if not FLAGS.write_sequence_example:
            example = tf.train.Example(features=tf.train.Features(feature={
                'image_name': melt.bytes_feature(img),
                'image_feature': melt.float_feature(image_feature),
                'input_text_str': melt.bytes_feature(input_text),
                'input_text': melt.int64_feature(input_word_ids),
                'text_str': melt.bytes_feature(text),
                'text': melt.int64_feature(word_ids),
                }))
          else:
            example = tf.train.SequenceExample(
                context=melt.features({
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(image_feature),
                    'input_text_str': melt.bytes_feature(input_text),
                    'text_str': melt.bytes_feature(text),
                }),
                feature_lists=melt.feature_lists({
                    'input_text': melt.int64_feature_list(input_word_ids),
                    'text': melt.int64_feature_list(word_ids)
                }))
          writer.write(example)

          #global counter, max_num_words, sum_words
          with record_counter.get_lock():
            record_counter.value += 1
          if word_ids_length > max_num_words.value:
            with max_num_words.get_lock():
              max_num_words.value = word_ids_length
          with sum_words.get_lock():
            sum_words.value += word_ids_length

          if FLAGS.np_save:
            assert FLAGS.threads == 1
            gtexts.append(word_ids)
            gtext_strs.append(text)
            if img not in image_labels:
              image_labels[img] = set()
            image_labels[img].add(text)

          if is_top_text:
            is_top_text = False
            with image_counter.get_lock():
              image_counter.value += 1
            if FLAGS.np_save:
              if img not in image_labels:
                image_labels[img] = set()
              image_names.append(img)
              image_features.append(image_feature)
            if FLAGS.num_max_records > 0:
              #for a fixed valid set only take one click per image
              break
      num += 1
      if num == FLAGS.num_max_records:
        break
def deal_file(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('file:', file, 'out_file:', out_file, file=sys.stderr)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num, file=sys.stderr)
      l = line.rstrip('\n').split('\t')
      img = l[0]
      texts = l[FLAGS.text_index].split('\x01')
      image_feature = [
          float(x)
          for x in l[FLAGS.image_feature_index].strip().split('\x01')
      ]
      #image_feature = [float(x) for x in l[FLAGS.image_feature_index].strip().split(' ')]
      #image_feature = [0.] * IMAGE_FEATURE_LEN
      assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (
          img, len(image_feature))
      is_top_text = True
      for text in texts:
        text = normalize.norm(text)
        if text.strip() == '':
          print('empty line', line, file=sys.stderr)
          continue
        word_ids = _text2ids(text, TEXT_MAX_WORDS)
        word_ids_length = len(word_ids)
        if num % 10000 == 0:
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                len(image_feature), file=sys.stderr)
        if len(word_ids) == 0:
          print('empty word_ids!', file=sys.stderr)
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                len(image_feature), file=sys.stderr)
          continue
        #if is_luanma(words, word_ids):
        #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
        #  continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(feature={
              'image_name': melt.bytes_feature(img),
              'image_feature': melt.float_feature(image_feature),
              'text_str': melt.bytes_feature(text),
              'text': melt.int64_feature(word_ids),
              }))
        else:
          example = tf.train.SequenceExample(
              context=melt.features({
                  'image_name': melt.bytes_feature(img),
                  'image_feature': melt.float_feature(image_feature),
                  'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
                  {'text': melt.int64_feature_list(word_ids)}))
        writer.write(example)

        #global counter, max_num_words, sum_words
        with record_counter.get_lock():
          record_counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          assert FLAGS.threads == 1
          gtexts.append(word_ids)
          gtext_strs.append(text)
          #Deprecated: image_labels no longer used
          if img not in image_labels:
            image_labels[img] = set()
          image_labels[img].add(text)

        if is_top_text:
          is_top_text = False
          with image_counter.get_lock():
            image_counter.value += 1
          if FLAGS.np_save:
            if img not in image_labels:
              image_labels[img] = set()
            image_names.append(img)
            if FLAGS.small_feature:
              image_features.append(image_feature)
            else:
              #actually save the pic path instead of the image feature
              image_features.append(
                  os.path.join(FLAGS.big_feature_image_dir,
                               img.replace('/', '_')))
          if FLAGS.num_max_records > 0:
            #for a fixed valid set only take one click per image
            break
      num += 1
      if num == FLAGS.num_max_records:
        break
def build_features(index):
  global counter
  mode = 'train' if 'train' in FLAGS.input else 'test'
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file

  total = len(examples)
  start, end = gezi.get_fold(total, FLAGS.num_records, index)
  ids = examples['id'].values[start:end]
  comments = examples['comment_text'].values[start:end]
  try:
    labels = examples[CLASSES].values[start:end]
  except Exception:
    labels = [[0.] * len(CLASSES)] * len(ids)

  with melt.tfrecords.Writer(out_file) as writer:
    for id, comment, label in tqdm(zip(ids, comments, labels)):
      comment_str = comment
      # TODO use info
      doc = tokenizer.tokenize(comment)
      comment_tokens, tokens_info = doc.tokens, doc.attributes
      for i in range(len(tokens_info)):
        tokens_info[i] = list(map(float, tokens_info[i]))

      if FLAGS.comment_limit:
        comment_tokens = comment_tokens[:FLAGS.comment_limit]
        tokens_info = tokens_info[:FLAGS.comment_limit]

      tokens_info = np.array(tokens_info)
      tokens_info = tokens_info.reshape(-1)
      tokens_info = list(tokens_info)
      assert len(tokens_info) == len(comment_tokens) * len(attribute_names)

      comment_ids = [get_id(token, vocab) for token in comment_tokens]
      comment_tokens_str = '|'.join([vocab.key(id) for id in comment_ids])
      label = list(map(float, label))

      comment_chars = [list(token) for token in comment_tokens]
      char_ids = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      for i, token in enumerate(comment_chars):
        for j, ch in enumerate(token):
          if j == FLAGS.char_limit:
            break
          char_ids[i, j] = get_char_id(ch, char_vocab)
      char_ids = list(char_ids.reshape(-1))

      simple_char_ids = []
      for ch in list(comment):
        id_ = get_char_id(ch, char_vocab)
        #if id_ == char_vocab.unk_id():
        #  continue
        simple_char_ids.append(id_)
        if len(simple_char_ids) == FLAGS.simple_char_limit:
          break
      simple_chars_str = ''.join([char_vocab.key(id) for id in simple_char_ids])

      record = tf.train.Example(features=tf.train.Features(feature={
          "comment": melt.int64_feature(comment_ids),
          "tokens_info": melt.float_feature(tokens_info),
          "comment_chars": melt.int64_feature(char_ids),
          "simple_chars": melt.int64_feature(simple_char_ids),
          "simple_chars_str": melt.bytes_feature(simple_chars_str),
          "classes": melt.float_feature(label),
          "id": melt.bytes_feature(id),
          "comment_str": melt.bytes_feature(comment_str),
          "comment_tokens_str": melt.bytes_feature(comment_tokens_str)
          }))
      writer.write(record)
      with counter.get_lock():
        counter.value += 1
    # the context manager closes the writer; no explicit close() needed
    print("Build {} instances of features in total".format(writer.size()))
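# `get_id` / `get_char_id` are not defined in this file; judging from how the
# vocab objects are used elsewhere here (vocabulary.has / .id / .unk_id), they
# are assumed to be thin lookups with an UNK fallback, roughly:
def get_id(key, vocab):
  return vocab.id(key) if vocab.has(key) else vocab.unk_id()

def get_char_id(ch, char_vocab):
  return char_vocab.id(ch) if char_vocab.has(ch) else char_vocab.unk_id()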
def _parse_line(line, writer, thread_index=0):
  global counter, max_num_words, sum_words
  l = line.rstrip().split('\t')
  image_name = l[0]
  image_feature = [float(x) for x in l[1:]]
  if image_name not in text_map:
    print('image ', image_name, 'ignore ', 'name_len ', len(image_name),
          len(image_name.strip()))
    return
  image_path = FLAGS.image_dir + '/' + image_name
  if FLAGS.write_raw_image_bytes:
    # read as binary; mode "r" would corrupt image bytes in python 3
    with tf.gfile.FastGFile(image_path, "rb") as f:
      encoded_image = f.read()
  else:
    encoded_image = ''
  #---------below will hang if multi process
  #try:
  #  decoder.decode_jpeg(encoded_image)
  #except (tf.errors.InvalidArgumentError, AssertionError):
  #  print("Skipping file with invalid JPEG data: %s" % image_path)
  #  return
  for text, ori_text in text_map[image_name]:
    word_ids = [
        vocabulary.id(word) for word in text.split(WORDS_SEP)
        if vocabulary.has(word) or ENCODE_UNK
    ]
    if not word_ids:
      continue
    word_ids_length = len(word_ids)
    word_ids = word_ids[:TEXT_MAX_WORDS]
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)
    if not FLAGS.write_sequence_example:
      example = tf.train.Example(features=tf.train.Features(feature={
          'image_name': melt.bytes_feature(image_name),
          'image_data': melt.bytes_feature(encoded_image),
          'image_feature': melt.float_feature(image_feature),
          'text': melt.int64_feature(word_ids),
          'text_str': melt.bytes_feature(ori_text),
          }))
    else:
      example = tf.train.SequenceExample(
          context=melt.features({
              'image_name': melt.bytes_feature(image_name),
              'image_data': melt.bytes_feature(encoded_image),
              'image_feature': melt.float_feature(image_feature),
              'text_str': melt.bytes_feature(ori_text),
          }),
          feature_lists=melt.feature_lists(
              {'text': melt.int64_feature_list(word_ids)}))
    if FLAGS.np_save:
      gtexts[thread_index].append(word_ids)
      gtext_strs[thread_index].append(ori_text)
    #NOTICE not tested here for num_threads > 1
    if FLAGS.num_records:
      if image_name not in images:
        images[image_name] = 1
        print(image_name, len(images))
        writer.write(example.SerializeToString())
        if len(images) == FLAGS.num_records:
          print('Done')
          exit(1)
    else:
      writer.write(example.SerializeToString())
    with counter.get_lock():
      counter.value += 1
    if word_ids_length > max_num_words.value:
      with max_num_words.get_lock():
        max_num_words.value = word_ids_length
    with sum_words.get_lock():
      sum_words.value += word_ids_length
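# The commented-out TF-based JPEG check above is noted to hang under
# multiprocessing. A hedged in-process alternative using PIL (an assumption,
# PIL is not used in the original code): verify the bytes before writing.
import io
from PIL import Image

def is_valid_jpeg(encoded_image):
  try:
    # verify() checks integrity without fully decoding the image
    Image.open(io.BytesIO(encoded_image)).verify()
    return True
  except Exception:
    return False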
def deal_file(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip('\n').split('\t')
      cs = l[0]
      simid = l[3]
      objurl = l[1]
      fromurl = l[2]
      keyword = l[4].split('\x01')[0]
      extended_keyword = l[5].split('\x01')[0]

      img = objurl
      #img = cs

      idl4w_end = IDL4W_FEATURE_LEN + 6
      idl4w_feature = [float(x) for x in l[6:idl4w_end]]

      titles = l[idl4w_end + 1]
      descs = l[idl4w_end + 2]

      inception_feature = [float(x) for x in l[idl4w_end + 3:]]
      assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
          len(inception_feature), cs)

      click_query = l[idl4w_end]
      show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
          click_query, extended_keyword, keyword, titles, descs)
      if click_query == 'noclickquery':
        click_query = ''
        #TODO now only consider click_query
        continue
      else:
        click_queries = click_query.split('$*$')
        is_top_text = True
        for click_query in click_queries:
          if click_query.strip() == '':
            continue
          text_str = '{} {}'.format(click_query, show_str)
          text = click_query
          words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
          word_ids = text2ids.words2ids(
              words,
              feed_single=FLAGS.feed_single,
              allow_all_zero=True,
              pad=False)
          word_ids_length = len(word_ids)
          if num % 1000 == 0:
            print(cs, simid, text, word_ids, text2ids.ids2text(word_ids),
                  len(idl4w_feature), len(inception_feature), file=sys.stderr)
          if len(word_ids) == 0:
            continue
          if is_bad(words, word_ids):
            #print('luan_ma', cs, simid, text, word_ids, text2ids.ids2text(word_ids), len(idl4w_feature), len(inception_feature), file=sys.stderr)
            continue
          word_ids = word_ids[:TEXT_MAX_WORDS]
          if FLAGS.pad:
            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
          if not FLAGS.write_sequence_example:
            example = tf.train.Example(features=tf.train.Features(feature={
                'image_name': melt.bytes_feature(img),
                'idl4w_feature': melt.float_feature(idl4w_feature),
                'inception_feature': melt.float_feature(inception_feature),
                'text_str': melt.bytes_feature(text_str),
                'text': melt.int64_feature(word_ids),
                }))
          else:
            example = tf.train.SequenceExample(
                context=melt.features({
                    'image_name': melt.bytes_feature(img),
                    'idl4w_feature': melt.float_feature(idl4w_feature),
                    'inception_feature': melt.float_feature(inception_feature),
                    'text_str': melt.bytes_feature(text_str),
                }),
                feature_lists=melt.feature_lists(
                    {'text': melt.int64_feature_list(word_ids)}))
          writer.write(example)

          #global counter, max_num_words, sum_words
          with record_counter.get_lock():
            record_counter.value += 1
          if word_ids_length > max_num_words.value:
            with max_num_words.get_lock():
              max_num_words.value = word_ids_length
          with sum_words.get_lock():
            sum_words.value += word_ids_length

          if FLAGS.np_save:
            assert FLAGS.threads == 1
            texts.append(word_ids)
            text_strs.append(text)
            if img not in image_labels:
              image_labels[img] = set()
            image_labels[img].add(text)

          if is_top_text:
            is_top_text = False
            with image_counter.get_lock():
              image_counter.value += 1
            if FLAGS.np_save:
              if img not in image_labels:
                image_labels[img] = set()
              image_names.append(img)
              #image_features.append(image_feature)
              idl4w_features.append(idl4w_feature)
              inception_features.append(inception_feature)
            if FLAGS.num_max_records > 0:
              #for a fixed valid set only take one click per image
              break
      num += 1
      if num == FLAGS.num_max_records:
        break
def build_features(index):
  global counter
  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file

  total = len(examples)
  if not FLAGS.has_dup:
    start, end = gezi.get_fold(total, FLAGS.num_records, index)
  else:
    start, end = get_fold(examples['id'].values, index)
  ids = examples['id'].values[start:end]
  ids = list(map(str, ids))
  comments = examples['comment_text'].values[start:end]
  tokens_list = examples['tokens'].values[start:end]
  tokens_infos = examples['attributes'].values[start:end]  # TODO change to poses
  poses = examples['poses'].values[start:end]
  tags = examples['tags'].values[start:end]
  ners = examples['ners'].values[start:end]
  ori_tokens_list = examples['ori_tokens'].values[start:end]
  try:
    labels = examples[CLASSES].values[start:end]
  except Exception:
    labels = [[0.] * len(CLASSES)] * len(ids)

  with melt.tfrecords.Writer(out_file) as writer:
    for id, comment, label, comment_tokens, ori_tokens, tokens_info, pos, tag, ner in tqdm(
        zip(ids, comments, labels, tokens_list, ori_tokens_list, tokens_infos,
            poses, tags, ners)):
      if not isinstance(comment, str):
        comment = 'ok'
      comment_str = comment

      comment_tokens = comment_tokens.split(' ')
      tokens_info = tokens_info.split(' ')
      pos = pos.split(' ')
      tag = tag.split(' ')
      ner = ner.split(' ')
      ori_tokens = ori_tokens.split(' ')

      if FLAGS.comment_limit:
        comment_tokens = comment_tokens[:FLAGS.comment_limit]
        ori_tokens = ori_tokens[:FLAGS.comment_limit]
        tokens_info = tokens_info[:len(attribute_names) * FLAGS.comment_limit]

      pos_ids = [get_char_id(x, pos_vocab) for x in pos]
      tag_ids = [get_char_id(x, tag_vocab) for x in tag]
      ner_ids = [get_char_id(x, ner_vocab) for x in ner]

      # NOTICE comment_ids use vocab (built from all train + test words, so no unk)
      if not FLAGS.lower:
        comment_ids = [get_id(token, vocab) for token in comment_tokens]
        #comment_ids_withunk = [get_id(token, unk_vocab) for token in comment_tokens]
      else:
        comment_ids = [get_id(token.lower(), vocab) for token in comment_tokens]
        #comment_ids_withunk = [get_id(token.lower(), unk_vocab) for token in comment_tokens]

      comment_tokens_str = '|'.join([vocab.key(id) for id in comment_ids])
      label = list(map(float, label))
      tokens_info = list(map(float, tokens_info))
      assert len(tokens_info) == len(attribute_names) * len(comment_ids), \
          '%d %f' % (len(comment_ids), len(tokens_info) / len(attribute_names))

      #comment_chars = [list(token) for token in comment_tokens]
      ## CHANGE: use ori tokens so 'fu**ck' keeps the '**'; but when NiggerMan
      ## is split to 'Nigger Man', both positions encode the chars of 'NiggerMan'
      chars_list = [list(token) for token in ori_tokens]
      char_ids = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      assert len(comment_ids) == len(chars_list), '{} {} {} {} {}'.format(
          len(comment_ids), len(chars_list), comment, comment_tokens, ori_tokens)
      for i, chars in enumerate(chars_list):
        for j, ch in enumerate(chars):
          if j == FLAGS.char_limit:
            break
          char_ids[i, j] = get_char_id(ch, char_vocab)
      char_ids = list(char_ids.reshape(-1))

      # --------------simple char
      simple_char_ids = []
      for ch in list(comment):
        id_ = get_char_id(ch, char_vocab)
        #if id_ == char_vocab.unk_id():
        #  continue
        simple_char_ids.append(id_)
        if len(simple_char_ids) == FLAGS.simple_char_limit:
          break
      simple_chars_str = ''.join([char_vocab.key(id) for id in simple_char_ids])

      # --------------simple ngram
      #simple_ngrams = gezi.get_ngrams(comment)
      #simple_ngrams = simple_ngrams[:FLAGS.simple_char_limit * 5]
      #simple_ngram_ids = [get_ngram_id(ngram, ngram_vocab) for ngram in simple_ngrams]

      # --------------ngram
      ngram_ids_list = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      if not FLAGS.ftngram:
        #ngrams_list = [gezi.get_ngrams(token) for token in ori_tokens]
        if not FLAGS.ngram_lower:
          ngrams_list = [
              gezi.get_ngrams(token, FLAGS.ngram_min, FLAGS.ngram_max)
              for token in comment_tokens
          ]
        else:
          ngrams_list = [
              gezi.get_ngrams(token.lower(), FLAGS.ngram_min, FLAGS.ngram_max)
              for token in comment_tokens
          ]
        for i, ngrams in enumerate(ngrams_list):
          for j, ngram in enumerate(ngrams):
            if j == FLAGS.char_limit:
              break
            ngram_ids_list[i, j] = get_ngram_id(ngram, ngram_vocab)
      else:
        #for i, (token, ori_token) in enumerate(zip(comment_tokens, ori_tokens)):
        for i, (token, ori_token) in enumerate(zip(comment_tokens, comment_tokens)):
          ngram_ids = gezi.fasttext_ids(ori_token, vocab, FLAGS.ngram_buckets,
                                        FLAGS.ngram_min, FLAGS.ngram_max)
          if len(ngram_ids) >= FLAGS.char_limit:
            ngram_ids = gezi.fasttext_ids(token, vocab, FLAGS.ngram_buckets,
                                          FLAGS.ngram_min, FLAGS.ngram_max)
          ngram_ids = ngram_ids[:FLAGS.char_limit]
          for j, ngram_id in enumerate(ngram_ids):
            ngram_ids_list[i, j] = ngram_id
      ngram_ids = list(ngram_ids_list.reshape(-1))

      # ---------------fngrams (full ngrams)
      #fngrams_list = [gezi.get_ngrams_hash(token, FLAGS.ngram_buckets, 3, 6, reserve=3) for token in ori_tokens]
      #fngram_ids = np.zeros([len(comment_ids), FLAGS.ngram_limit], dtype=np.int32)
      #for i, fngrams in enumerate(fngrams_list):
      #  for j, fngram in enumerate(fngrams):
      #    if j == FLAGS.ngram_limit:
      #      break
      #    fngram_ids[i, j] = fngram
      #fngram_ids = list(fngram_ids.reshape(-1))

      # global info per comment: 7 features
      comment_info = []
      comment_info.append(len(ori_tokens))
      comment_info.append(len(comment_tokens))
      #comment_len = sum(len(x) for x in ori_tokens)
      comment_len = len(comment_str)
      comment_info.append(comment_len)
      comment_info.append(comment_len / (len(ori_tokens) + 1))
      num_unks = len([x for x in comment_ids if x == vocab.unk_id()])
      comment_info.append(num_unks)
      comment_info.append(num_unks / len(comment_tokens))
      comment_info.append(enprob_dict[id])

      record = tf.train.Example(features=tf.train.Features(feature={
          "comment": melt.int64_feature(comment_ids),
          #"comment_withunk": melt.int64_feature(comment_ids_withunk),
          "tokens_info": melt.float_feature(tokens_info),
          "comment_info": melt.float_feature(comment_info),
          "pos": melt.int64_feature(pos_ids),
          "tag": melt.int64_feature(tag_ids),
          "ner": melt.int64_feature(ner_ids),
          "comment_chars": melt.int64_feature(char_ids),
          "comment_ngrams": melt.int64_feature(ngram_ids),
          "simple_chars": melt.int64_feature(simple_char_ids),
          #"simple_ngrams": melt.int64_feature(simple_ngram_ids),
          #"comment_fngrams": melt.int64_feature(fngram_ids),
          #"simple_chars_str": melt.bytes_feature(simple_chars_str),
          "classes": melt.float_feature(label),
          "id": melt.bytes_feature(id),
          "weight": melt.float_feature([FLAGS.weight]),
          "comment_str": melt.bytes_feature(comment_str),
          "comment_tokens_str": melt.bytes_feature(comment_tokens_str)
          }))
      writer.write(record)
      with counter.get_lock():
        counter.value += 1
    # the context manager closes the writer; no explicit close() needed
    print("Build {} instances of features in total".format(writer.size()))
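# Hedged sketch of gezi.get_fold as used above: split `total` items into
# FLAGS.num_records contiguous folds and return the [start, end) bounds of
# fold `index`. The real gezi helper may balance remainders differently.
def get_fold_sketch(total, num_folds, index):
  fold_size = (total + num_folds - 1) // num_folds  # ceil(total / num_folds)
  start = fold_size * index
  end = min(start + fold_size, total)
  return start, end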