def deal_file(file):
    """Convert one tab-separated text file into a TFRecord file.

    Each input line is split on tabs; the target text is read from column
    ``FLAGS.text_index`` and the input text from column
    ``FLAGS.input_text_index``.  Both are segmented and mapped to word ids,
    truncated (and padded when ``FLAGS.pad`` is set), and written either as a
    ``tf.train.Example`` or, when ``FLAGS.write_sequence_example`` is set, as
    a ``tf.train.SequenceExample``.

    Lines whose input or target text yields no word ids, or whose target
    text is rejected by ``is_luanma``, are skipped and do not count towards
    ``FLAGS.num_max_records``.

    Side effects: updates the shared ``record_counter``, ``max_num_words``
    and ``sum_words`` values, and appends to ``gtexts`` / ``gtext_strs`` when
    ``FLAGS.np_save`` is set.

    Args:
        file: path of the input file.  The output name is built from
            ``FLAGS.name`` and the part of the input base name after its
            last ``-``.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)

    # Open the input through a context manager so the handle is closed even
    # when the loop exits early or raises.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
        num = 0  # number of records written so far from this file
        for line in reader:
            if num % 1000 == 0:
                print(num)

            fields = line.rstrip('\n').split('\t')
            text = fields[FLAGS.text_index]
            input_text = fields[FLAGS.input_text_index]

            input_words = text2ids.Segmentor.Segment(input_text,
                                                     FLAGS.seg_method)
            input_word_ids = text2ids.words2ids(
                input_words,
                feed_single=FLAGS.feed_single,
                allow_all_zero=True,
                pad=False)
            if len(input_word_ids) == 0:
                continue
            input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
            if FLAGS.pad:
                input_word_ids = gezi.pad(input_word_ids)

            words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
            word_ids = text2ids.words2ids(
                words,
                feed_single=FLAGS.feed_single,
                allow_all_zero=True,
                pad=False)
            # Length before truncation/padding; this is what the statistics
            # below are based on.
            word_ids_length = len(word_ids)
            if num % 1000 == 0:
                print(text, word_ids, text2ids.ids2text(word_ids),
                      file=sys.stderr)
            if word_ids_length == 0:
                continue
            if is_luanma(words, word_ids):
                print('luanma', text, word_ids, text2ids.ids2text(word_ids),
                      file=sys.stderr)
                continue

            word_ids = word_ids[:TEXT_MAX_WORDS]
            if FLAGS.pad:
                word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

            if not FLAGS.write_sequence_example:
                example = tf.train.Example(features=tf.train.Features(feature={
                    'input_text_str': melt.bytes_feature(input_text),
                    'input_text': melt.int64_feature(input_word_ids),
                    'text_str': melt.bytes_feature(text),
                    'text': melt.int64_feature(word_ids),
                }))
            else:
                example = tf.train.SequenceExample(
                    context=melt.features({
                        'input_text_str': melt.bytes_feature(input_text),
                        'text_str': melt.bytes_feature(text),
                    }),
                    feature_lists=melt.feature_lists({
                        'input_text': melt.int64_feature_list(input_word_ids),
                        'text': melt.int64_feature_list(word_ids),
                    }))
            writer.write(example)

            with record_counter.get_lock():
                record_counter.value += 1
            if word_ids_length > max_num_words.value:
                with max_num_words.get_lock():
                    # Re-check under the lock: the value may have been raised
                    # by someone else between the unlocked read and acquiring
                    # the lock, and a smaller length must not overwrite it.
                    if word_ids_length > max_num_words.value:
                        max_num_words.value = word_ids_length
            with sum_words.get_lock():
                sum_words.value += word_ids_length

            if FLAGS.np_save:
                # The in-memory lists are plain module-level containers, so
                # this mode is only allowed with a single worker.
                assert FLAGS.threads == 1
                gtexts.append(word_ids)
                gtext_strs.append(text)

            num += 1
            if num == FLAGS.num_max_records:
                break
def deal_file(file):
    """Convert one tab-separated image/text file into a TFRecord file.

    Each input line is split on tabs.  Column 0 is the image name; columns
    ``FLAGS.text_index``, ``FLAGS.image_feature_index`` and
    ``FLAGS.input_text_index`` hold ``\\x01``-separated target texts, image
    feature floats and input texts respectively.  One record is written per
    (input text, target text) pair, either as a ``tf.train.Example`` or, when
    ``FLAGS.write_sequence_example`` is set, as a ``tf.train.SequenceExample``.

    Lines whose image feature does not have ``IMAGE_FEATURE_LEN`` values are
    reported and skipped.  Input texts with no word ids, blank target texts,
    target texts with no word ids and target texts rejected by ``is_luanma``
    are skipped as well.

    Side effects: updates the shared ``record_counter``, ``max_num_words``,
    ``sum_words`` and ``image_counter`` values, and when ``FLAGS.np_save`` is
    set appends to ``gtexts``, ``gtext_strs``, ``image_names`` and
    ``image_features`` and fills ``image_labels``.

    Args:
        file: path of the input file.  The output name is built from
            ``FLAGS.name`` and the part of the input base name after its
            last ``-``.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)

    # Open the input through a context manager so the handle is closed even
    # when the loop exits early or raises.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
        num = 0  # number of accepted input lines processed so far
        for line in reader:
            if num % 1000 == 0:
                print(num)

            fields = line.rstrip('\n').split('\t')
            img = fields[0]
            texts = fields[FLAGS.text_index].split('\x01')
            image_feature = [
                float(x)
                for x in fields[FLAGS.image_feature_index].strip().split('\x01')
            ]
            # A wrong-sized feature vector is reported and skipped rather
            # than aborting the whole conversion.
            if len(image_feature) != IMAGE_FEATURE_LEN:
                print('bad line:', line)
                continue

            input_texts = fields[FLAGS.input_text_index].split('\x01')
            for input_text in input_texts:
                input_words = text2ids.Segmentor.Segment(input_text,
                                                         FLAGS.seg_method)
                input_word_ids = text2ids.words2ids(
                    input_words,
                    feed_single=FLAGS.feed_single,
                    allow_all_zero=True,
                    pad=False)
                if len(input_word_ids) == 0:
                    continue
                input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
                if FLAGS.pad:
                    input_word_ids = gezi.pad(input_word_ids)

                # True until the first target text has been written for this
                # input text; drives the per-image bookkeeping below.
                is_top_text = True
                for text in texts:
                    if text.strip() == '':
                        continue

                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    # Length before truncation/padding; this is what the
                    # statistics below are based on.
                    word_ids_length = len(word_ids)
                    if num % 1000 == 0:
                        print(img, text, word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                    if word_ids_length == 0:
                        continue
                    if is_luanma(words, word_ids):
                        print('luanma', img, text, word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(
                            features=tf.train.Features(feature={
                                'image_name': melt.bytes_feature(img),
                                'image_feature':
                                    melt.float_feature(image_feature),
                                'input_text_str':
                                    melt.bytes_feature(input_text),
                                'input_text':
                                    melt.int64_feature(input_word_ids),
                                'text_str': melt.bytes_feature(text),
                                'text': melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name': melt.bytes_feature(img),
                                'image_feature':
                                    melt.float_feature(image_feature),
                                'input_text_str':
                                    melt.bytes_feature(input_text),
                                'text_str': melt.bytes_feature(text),
                            }),
                            feature_lists=melt.feature_lists({
                                'input_text':
                                    melt.int64_feature_list(input_word_ids),
                                'text': melt.int64_feature_list(word_ids),
                            }))
                    writer.write(example)

                    with record_counter.get_lock():
                        record_counter.value += 1
                    if word_ids_length > max_num_words.value:
                        with max_num_words.get_lock():
                            # Re-check under the lock: the value may have
                            # been raised by someone else between the
                            # unlocked read and acquiring the lock, and a
                            # smaller length must not overwrite it.
                            if word_ids_length > max_num_words.value:
                                max_num_words.value = word_ids_length
                    with sum_words.get_lock():
                        sum_words.value += word_ids_length

                    if FLAGS.np_save:
                        # The in-memory containers are plain module-level
                        # objects, so this mode is only allowed with a
                        # single worker.
                        assert FLAGS.threads == 1
                        gtexts.append(word_ids)
                        gtext_strs.append(text)
                        if img not in image_labels:
                            image_labels[img] = set()
                        image_labels[img].add(text)

                    if is_top_text:
                        is_top_text = False
                        with image_counter.get_lock():
                            image_counter.value += 1
                        if FLAGS.np_save:
                            if img not in image_labels:
                                image_labels[img] = set()
                            image_names.append(img)
                            image_features.append(image_feature)
                        if FLAGS.num_max_records > 0:
                            # For a fixed-size validation set only one
                            # target text is kept per image.
                            break

            num += 1
            if num == FLAGS.num_max_records:
                break
def deal_file(file):
    """Convert one tab-separated click-log file into a TFRecord file.

    Column layout of each input line, as read by this function:
    0 = cs, 1 = object url (used as the image name), 3 = simid,
    4 = keyword, 5 = extended keyword (only the first ``\\x01``-separated
    item of columns 4 and 5 is used), then ``IDL4W_FEATURE_LEN`` idl4w
    feature floats starting at column 6, followed by the click query,
    titles, descs, and finally the inception feature floats up to the end of
    the line.

    Click queries are separated by ``$*$``; one record is written per
    non-blank click query that yields word ids and is not rejected by
    ``is_bad``.  Lines whose click query is ``noclickquery`` are skipped.
    Records are written as ``tf.train.Example`` or, when
    ``FLAGS.write_sequence_example`` is set, as ``tf.train.SequenceExample``.

    Side effects: updates the shared ``record_counter``, ``max_num_words``,
    ``sum_words`` and ``image_counter`` values, and when ``FLAGS.np_save`` is
    set appends to ``texts``, ``text_strs``, ``image_names``,
    ``idl4w_features`` and ``inception_features`` and fills ``image_labels``.

    Args:
        file: path of the input file.  The output name is built from
            ``FLAGS.name`` and the part of the input base name after its
            last ``-``.

    Raises:
        AssertionError: if a line does not carry exactly
            ``INCEPTION_FEATURE_LEN`` inception feature values.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)

    # Open the input through a context manager so the handle is closed even
    # when the loop exits early or raises.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
        num = 0  # number of accepted input lines processed so far
        for line in reader:
            if num % 1000 == 0:
                print(num)

            fields = line.rstrip('\n').split('\t')
            cs = fields[0]
            simid = fields[3]
            objurl = fields[1]
            keyword = fields[4].split('\x01')[0]
            extended_keyword = fields[5].split('\x01')[0]

            # The object url identifies the image in the written records.
            img = objurl

            idl4w_end = IDL4W_FEATURE_LEN + 6
            idl4w_feature = [float(x) for x in fields[6:idl4w_end]]

            titles = fields[idl4w_end + 1]
            descs = fields[idl4w_end + 2]

            inception_feature = [float(x) for x in fields[idl4w_end + 3:]]
            assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
                len(inception_feature), cs)

            click_query = fields[idl4w_end]
            show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
                click_query, extended_keyword, keyword, titles, descs)

            # TODO: only click queries are used as text for now, so lines
            # without one are skipped.
            if click_query == 'noclickquery':
                continue

            # True until the first click query has been written for this
            # line; drives the per-image bookkeeping below.
            is_top_text = True
            for query in click_query.split('$*$'):
                if query.strip() == '':
                    continue

                text_str = '{} {}'.format(query, show_str)
                text = query

                words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                word_ids = text2ids.words2ids(
                    words,
                    feed_single=FLAGS.feed_single,
                    allow_all_zero=True,
                    pad=False)
                # Length before truncation/padding; this is what the
                # statistics below are based on.
                word_ids_length = len(word_ids)
                if num % 1000 == 0:
                    print(cs, simid, text, word_ids,
                          text2ids.ids2text(word_ids),
                          len(idl4w_feature),
                          len(inception_feature),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    continue
                if is_bad(words, word_ids):
                    continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(
                        features=tf.train.Features(feature={
                            'image_name': melt.bytes_feature(img),
                            'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                            'inception_feature':
                                melt.float_feature(inception_feature),
                            'text_str': melt.bytes_feature(text_str),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name': melt.bytes_feature(img),
                            'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                            'inception_feature':
                                melt.float_feature(inception_feature),
                            'text_str': melt.bytes_feature(text_str),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        # Re-check under the lock: the value may have been
                        # raised by someone else between the unlocked read
                        # and acquiring the lock, and a smaller length must
                        # not overwrite it.
                        if word_ids_length > max_num_words.value:
                            max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    # The in-memory containers are plain module-level
                    # objects, so this mode is only allowed with a single
                    # worker.
                    assert FLAGS.threads == 1
                    texts.append(word_ids)
                    text_strs.append(text)
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1
                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()
                        image_names.append(img)
                        idl4w_features.append(idl4w_feature)
                        inception_features.append(inception_feature)
                    if FLAGS.num_max_records > 0:
                        # For a fixed-size validation set only one click
                        # query is kept per image.
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break