def main(argv):
  writer = tf.python_io.TFRecordWriter(argv[2])
  num = 0
  for line in open(argv[1]):
    if line[0] == '#':
      continue
    if num % 10000 == 0:
      print('%d lines done' % num)
    l = line.rstrip().split()
    label_index = 0
    if l[0][0] == '_':
      label_index = 1
      id = int(l[0][1:])
    else:
      id = num
    label = int(l[label_index])
    start = label_index + 1
    feature = [float(x) for x in l[start:]]
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'id': melt.int_feature(id),
                'label': melt.int_feature(label),
                'feature': melt.float_feature(feature),
            }))
    writer.write(example.SerializeToString())
    num += 1
    if FLAGS.num_examples and num == FLAGS.num_examples:
      break
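# Reader-side sketch (assumed, not part of the original code): parsing back the
# fixed-width records written above with the plain TF1 API. FEATURE_LEN is a
# hypothetical constant standing in for the known width of the input lines.
import tensorflow as tf

def parse_example(serialized):
  features = tf.parse_single_example(
      serialized,
      features={
          'id': tf.FixedLenFeature([], tf.int64),
          'label': tf.FixedLenFeature([], tf.int64),
          # a fixed-length parse works because every record stores the same number of floats
          'feature': tf.FixedLenFeature([FEATURE_LEN], tf.float32),
      })
  return features['id'], features['label'], features['feature']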
def main(argv):
  writer = tf.python_io.TFRecordWriter(argv[2])
  num = 0
  for line in open(argv[1]):
    if line[0] == '#':
      continue
    if num % 10000 == 0:
      print('%d lines done' % num)
    l = line.rstrip().split()
    label_index = 0
    if l[0][0] == '_':
      label_index = 1
      id = int(l[0][1:])
    else:
      id = num
    label = int(l[label_index])
    start = label_index + 1
    # NOTICE: this will be float64, not float32
    feature = np.array([float(x) for x in l[start:]])
    if num == 0:
      print('len feature', len(feature))
    example = tf.train.Example(features=tf.train.Features(
        feature={
            'id': melt.int_feature(id),
            'label': melt.int_feature(label),
            'feature': melt.bytes_feature(feature.tostring()),
            'length': melt.int_feature(len(feature)),
        }))
    writer.write(example.SerializeToString())
    num += 1
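# Reader-side sketch (assumed) for the raw-bytes variant above: the array was stored
# with tostring(), so it must be decoded with tf.decode_raw using the matching dtype.
import tensorflow as tf

def parse_raw_example(serialized):
  features = tf.parse_single_example(
      serialized,
      features={
          'id': tf.FixedLenFeature([], tf.int64),
          'label': tf.FixedLenFeature([], tf.int64),
          'feature': tf.FixedLenFeature([], tf.string),
          'length': tf.FixedLenFeature([], tf.int64),
      })
  # decode as tf.float64 to match numpy's default dtype noted in the writer's comment
  feature = tf.decode_raw(features['feature'], tf.float64)
  return features['id'], features['label'], feature, features['length']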
def deal_file(file, writer, thread_index):
  # NOTE: the passed-in writer is ignored; a per-thread writer is created instead
  writer = melt.tfrecords.Writer(
      '{}/{}_{}'.format(FLAGS.output_directory, FLAGS.name, thread_index))
  num = 0
  for line in open(file):
    l = line.rstrip().split('\t')
    img = l[0]
    img_feature = [float(x) for x in l[1:1001]]
    text = l[-1].split('\x01')[0]
    words = Segmentor.Segment(text)
    word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word)]
    if len(word_ids) == 0:
      continue
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
    example = tf.train.Example(features=tf.train.Features(feature={
        'image_name': melt.bytes_feature(img),
        'image_feature': melt.float_feature(img_feature),
        'text': melt.int_feature(word_ids),
        'text_str': melt.bytes_feature(text),
    }))
    writer.write(example)
    num += 1
def deal_file(file, writer):
  num = 0
  for line in open(file):
    if num % 1000 == 0:
      print('num:', num)
    l = line.rstrip().split('\t')
    img = l[0]
    img_feature = [float(x) for x in l[1:1001]]
    text = l[-1].split('\x01')[0]
    words = Segmentor.Segment(text)
    word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word)]
    if len(word_ids) == 0:
      num += 1
      continue
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
    texts.append(word_ids)
    text_strs.append(text)
    example = tf.train.Example(features=tf.train.Features(feature={
        'image_name': melt.bytes_feature(img),
        'image_feature': melt.float_feature(img_feature),
        'text': melt.int_feature(word_ids),
        'text_str': melt.bytes_feature(text),
    }))
    #writer.write(example.SerializeToString())
    writer.write(example)
    num += 1
def main(argv):
  writer = tf.python_io.TFRecordWriter(argv[2])
  num = 0
  for line in open(argv[1]):
    if line[0] == '#':
      continue
    if num % 10000 == 0:
      print('%d lines done' % num)
    l = line.rstrip().split()
    label_index = 0
    if l[0][0] == '_':
      label_index = 1
      id = int(l[0][1:])
    else:
      id = num
    label = int(l[label_index])
    start = label_index + 1
    feature = [float(x) for x in l[start:]]
    if FLAGS.fake_var_len:
      if id % 2 == 0:
        feature = feature[:10]
      if id % 3 == 0:
        feature = feature[:20]
    example = tf.train.SequenceExample(
        context=melt.features({
            'id': melt.int_feature(id),
            'label': melt.int_feature(label)
        }),
        feature_lists=melt.feature_lists({
            # see sequence_test.py: treat each single value as a one-element list
            # and stack all the lists -- can this handle var-len sequences?
            'feature': melt.feature_list(
                [melt.float_feature(item) for item in feature])
            #'feature': melt.feature_list(melt.float_feature(feature))
        }))
    writer.write(example.SerializeToString())
    num += 1
    if FLAGS.num_examples and num == FLAGS.num_examples:
      break
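# Sketch (assumed) of reading the SequenceExample back: the context holds the scalar
# id/label, and the feature list parses as a variable-length sequence, one float per step.
import tensorflow as tf

def parse_sequence_example(serialized):
  context, sequence = tf.parse_single_sequence_example(
      serialized,
      context_features={
          'id': tf.FixedLenFeature([], tf.int64),
          'label': tf.FixedLenFeature([], tf.int64),
      },
      sequence_features={
          # each step of the feature list holds exactly one float, hence shape [] per step
          'feature': tf.FixedLenSequenceFeature([], tf.float32),
      })
  return context['id'], context['label'], sequence['feature']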
def deal_file(file, thread_index):
  global counter, max_num_words, sum_words
  out_file = '{}/{}_{}'.format(
      FLAGS.output_directory, FLAGS.name,
      thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
          FLAGS.output_directory, FLAGS.name)
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip().split('\t')
      img = l[0]
      img_end = IMAGE_FEATURE_LEN + 1
      img_feature = [float(x) for x in l[1:img_end]]
      texts = [x.split('\x01')[0] for x in l[img_end:]]
      for text in texts:
        #@TODO the text -> ids conversion should move out so online code can share it
        #for evaluation or for feed dicts
        words = segmentor.Segment(text, FLAGS.seg_method)
        word_ids = [
            vocabulary.id(word) for word in words
            if vocabulary.has(word) or ENCODE_UNK
        ]
        word_ids_length = len(word_ids)
        if len(word_ids) == 0:
          continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
        if FLAGS.np_save:
          gtexts[thread_index].append(word_ids)
          gtext_strs[thread_index].append(text)
        assert img and img_feature and word_ids and text, line
        assert len(img_feature) == IMAGE_FEATURE_LEN
        #add pos info? weight info? or @TODO add click num info
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'image_name': melt.bytes_feature(img),
                'image_feature': melt.float_feature(img_feature),
                'text': melt.int_feature(word_ids),
                'text_str': melt.bytes_feature(text),
            }))
        writer.write(example)
        with counter.get_lock():
          counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length
      num += 1
  texts_dict[thread_index] = gtexts[thread_index]
  text_strs_dict[thread_index] = gtext_strs[thread_index]
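# The shared counter/max_num_words/sum_words globals used above are not defined in this
# snippet; a plausible setup (assumed, not from the original code) is multiprocessing
# shared values created once before the workers start:
import multiprocessing

counter = multiprocessing.Value('i', 0)        # total examples written
max_num_words = multiprocessing.Value('i', 0)  # longest text seen, in word ids
sum_words = multiprocessing.Value('i', 0)      # total word ids, for computing the mean length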
def _parse_line(line, writer, thread_index=0):
  global counter, max_num_words, sum_words
  l = line.rstrip().split('\t')
  image_name = l[0]
  image_feature = [float(x) for x in l[1:]]
  if image_name not in text_map:
    print('image %s ignore' % image_name)
    return
  for text, ori_text in text_map[image_name]:
    word_ids = [
        vocabulary.id(word) for word in text.split(WORDS_SEP)
        if vocabulary.has(word)
    ]
    if not word_ids:
      continue
    word_ids_length = len(word_ids)
    word_ids = word_ids[:TEXT_MAX_WORDS]
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)
    if FLAGS.np_save:
      gtexts[thread_index].append(word_ids)
      gtext_strs[thread_index].append(ori_text)
    example = tf.train.Example(features=tf.train.Features(
        feature={
            'image_name': melt.bytes_feature(image_name),
            'image_feature': melt.float_feature(image_feature),
            'text': melt.int_feature(word_ids),
            'text_str': melt.bytes_feature(ori_text),
        }))
    #NOTICE: not tested here for num_threads > 1
    if FLAGS.num_records:
      if image_name not in images:
        images[image_name] = 1
        print(image_name, len(images))
      writer.write(example.SerializeToString())
      if len(images) == FLAGS.num_records:
        print('Done')
        exit(1)
    else:
      writer.write(example.SerializeToString())
    with counter.get_lock():
      counter.value += 1
    if word_ids_length > max_num_words.value:
      with max_num_words.get_lock():
        max_num_words.value = word_ids_length
    with sum_words.get_lock():
      sum_words.value += word_ids_length
def deal_file(file, thread_index):
  out_file = '{}/{}_{}'.format(
      FLAGS.output_directory, FLAGS.name,
      thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
          FLAGS.output_directory, FLAGS.name)
  # use the writer as a context manager so the output file is properly closed
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip().split('\t')
      img = l[0]
      img_end = IMAGE_FEATURE_LEN + 1
      img_feature = [float(x) for x in l[1:img_end]]
      texts = [x.split('\x01')[0] for x in l[img_end:]]
      for text in texts:
        words = Segmentor.Segment(text)
        word_ids = [
            vocabulary.id(word) for word in words if vocabulary.has(word)
        ]
        if len(word_ids) == 0:
          continue
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
        gtexts[thread_index].append(word_ids)
        gtext_strs[thread_index].append(text)
        #add pos info? weight info? or @TODO add click num info
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'image_name': melt.bytes_feature(img),
                'image_feature': melt.float_feature(img_feature),
                'text': melt.int_feature(word_ids),
                'text_str': melt.bytes_feature(text),
            }))
        writer.write(example)
      num += 1
  texts_dict[thread_index] = gtexts[thread_index]
  text_strs_dict[thread_index] = gtext_strs[thread_index]
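# Reader-side sketch (assumed) for the image-text records above: image_feature has a
# fixed width, while 'text' is variable-length unless FLAGS.pad was used, so VarLenFeature
# is the safe choice. IMAGE_FEATURE_LEN matches the constant used by the writers.
import tensorflow as tf

def parse_image_text_example(serialized):
  features = tf.parse_single_example(
      serialized,
      features={
          'image_name': tf.FixedLenFeature([], tf.string),
          'image_feature': tf.FixedLenFeature([IMAGE_FEATURE_LEN], tf.float32),
          'text': tf.VarLenFeature(tf.int64),  # sparse; densify with tf.sparse_tensor_to_dense
          'text_str': tf.FixedLenFeature([], tf.string),
      })
  return features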
def deal_file(file):
  global counter, max_num_words, sum_words
  out_file = '{}/{}'.format(
      FLAGS.output_dir,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      #line = line.lower()
      if num % 1000 == 0:
        print(num)
      if FLAGS.max_lines and num >= FLAGS.max_lines:
        break
      l = line.rstrip().split('\t')
      if len(l) != 2:
        continue
      ltext, rtext_list = l
      for rtext in rtext_list.split('\x01'):
        lword_ids = _text2ids(ltext, TEXT_MAX_WORDS)
        rword_ids = _text2ids(rtext, TEXT_MAX_WORDS)
        if not lword_ids or not rword_ids:
          continue
        if num % 1000 == 0:
          print(ltext, lword_ids, text2ids.ids2text(lword_ids), file=sys.stderr)
          print(rtext, rword_ids, text2ids.ids2text(rword_ids), file=sys.stderr)
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'ltext_str': melt.bytes_feature(ltext),
                'ltext': melt.int_feature(lword_ids),
                'rtext_str': melt.bytes_feature(rtext),
                'rtext': melt.int_feature(rword_ids),
            }))
        writer.write(example)
        if FLAGS.np_save:
          assert FLAGS.threads == 1
          ltexts.append(lword_ids)
          ltext_strs.append(ltext)
          rtexts.append(rword_ids)
          rtext_strs.append(rtext)
        with counter.get_lock():
          counter.value += 1
        word_ids = lword_ids
        word_ids_length = len(word_ids)
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length
      num += 1
    if num % 1000 == 0:
      #print(libgezi.gbk2utf8('\t'.join(words)), file=sys.stderr)
      print('\t'.join(words), file=sys.stderr)
      print(word_ids, file=sys.stderr)
    if len(word_ids) == 0:
      continue
    word_ids = word_ids[:TEXT_MAX_WORDS]
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
    if writer is not None:
      example = tf.train.Example(features=tf.train.Features(
          feature={
              'image_name': melt.bytes_feature(img),
              'image_feature': melt.float_feature(img_feature),
              'text': melt.int_feature(word_ids),
              'text_str': melt.bytes_feature(text),
          }))
      writer.write(example)
    else:
      count += 1
  if FLAGS.mode != 1:
    if writer is not None:
      count = writer.count
    print('count\t%d' % count, file=sys.stderr)
    #-------- for calculating the total count
    print('count\t%d' % count)
  # do not forget to close! NOTICE
  if writer is not None:
    writer.close()
def deal_file(file, thread_index):
  global counter, max_num_words, sum_words
  out_file = '{}/{}_{}'.format(
      FLAGS.output_directory, FLAGS.name,
      thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
          FLAGS.output_directory, FLAGS.name)
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      line = line.lower()
      if num % 1000 == 0:
        print(num)
      if FLAGS.max_lines and num >= FLAGS.max_lines:
        break
      l = line.strip().split('\t')
      # NOTE: the unpacking of clickquery, ct0, title and real_title from l is
      # missing from this snippet; only image (l[1]) and url (l[2]) are visible below.
      #@TODO the text -> ids conversion should move out so online code can share it
      #for evaluation or for feed dicts
      #words = segmentor.Segment(text, FLAGS.seg_method)
      #word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word) or ENCODE_UNK]
      # text is what we predict (decoder target), which is clickquery right now;
      # the input text we predict from (encoder) may be ct0, title, or real_title
      if title.strip() == '':
        title = real_title
      if clickquery.startswith('http://'):
        clickquery = l[3]
      text = clickquery
      word_ids = _text2ids(text, TEXT_MAX_WORDS)
      if not word_ids:
        continue
      if FLAGS.np_save:
        gtexts[thread_index].append(word_ids)
        gtext_strs[thread_index].append(text)
      ct0_ids = _text2ids(ct0, INPUT_TEXT_MAX_WORDS)
      title_ids = _text2ids(title, INPUT_TEXT_MAX_WORDS)
      real_title_ids = _text2ids(real_title, INPUT_TEXT_MAX_WORDS)
      if len(ct0_ids) == 0:
        ct0_ids = real_title_ids
        ct0 = real_title
      if num % 1000 == 0:
        print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        print(ct0, ct0_ids, text2ids.ids2text(ct0_ids), file=sys.stderr)
      image = l[1]
      url = l[2]
      example = tf.train.Example(features=tf.train.Features(
          feature={
              'image_name': melt.bytes_feature(image),
              'url': melt.bytes_feature(url),
              'text_str': melt.bytes_feature(text),
              'ct0_str': melt.bytes_feature(ct0),
              'title_str': melt.bytes_feature(title),
              'real_title_str': melt.bytes_feature(real_title),
              'text': melt.int_feature(word_ids),
              'ct0': melt.int_feature(ct0_ids),
              'title': melt.int_feature(title_ids),
              'real_title': melt.int_feature(real_title_ids),
          }))
      writer.write(example)
      with counter.get_lock():
        counter.value += 1
      word_ids_length = len(word_ids)
      if word_ids_length > max_num_words.value:
        with max_num_words.get_lock():
          max_num_words.value = word_ids_length
      with sum_words.get_lock():
        sum_words.value += word_ids_length
      num += 1
  texts_dict[thread_index] = gtexts[thread_index]
  text_strs_dict[thread_index] = gtext_strs[thread_index]