def build_features(infile):
  ofile = get_out_file(infile)
  print('----------writing to', ofile)
  with melt.tfrecords.Writer(ofile) as writer:
    for line in tqdm(open(infile)):
      fields = line.rstrip().split('\t')
      if len(fields) > 4:
        label = int(fields[0])
        id = '{}\t{}'.format(fields[2], fields[3])
        feat_id, feat_field, feat_value = dataset.get_feat_set(fields)
        assert len(feat_id) == len(feat_value), 'len(feat_id) must equal len(feat_value)'
        assert len(feat_id) == len(feat_field)
        feature = {
            'label': melt.int64_feature(label),
            'id': melt.bytes_feature(id),
            'index': melt.int64_feature(feat_id),
            'field': melt.int64_feature(feat_field),
            'value': melt.float_feature(feat_value)
        }
        record = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(record)
        global counter
        with counter.get_lock():
          counter.value += 1
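# The melt.int64_feature / melt.float_feature / melt.bytes_feature helpers used by
# all of the writers in this file are not shown here. The sketch below is only an
# assumption about what they presumably do: thin wrappers over tf.train.Feature
# that accept a scalar or a list and encode str values as UTF-8 bytes. The real
# melt implementations may differ.
import tensorflow as tf

def _as_list(value):
  # Normalize a scalar to a single-element list.
  return value if isinstance(value, (list, tuple)) else [value]

def int64_feature(value):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=_as_list(value)))

def float_feature(value):
  return tf.train.Feature(float_list=tf.train.FloatList(value=_as_list(value)))

def bytes_feature(value):
  value = [v.encode('utf-8') if isinstance(v, str) else v for v in _as_list(value)]
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))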
def convert_to_tfrecord(input_files, output_file):
  """Converts a list of image files to a single TFRecord file."""
  print('Generating %s' % output_file)
  with tf.python_io.TFRecordWriter(output_file) as record_writer:
    for input_file in tqdm(input_files, ascii=True):
      id = os.path.basename(input_file)[:-4]
      #img = cv2.imread(input_file)
      img = melt.read_image(input_file)
      # turn to channel first
      #img = img.transpose(2, 0, 1)
      if 'test' not in output_file:
        label = m[id]
      else:
        label = -1
      example = tf.train.Example(features=tf.train.Features(
          feature={
              'id': melt.bytes_feature(id),
              #'image': melt.bytes_feature(img.tobytes()),
              'image': melt.bytes_feature(img),
              'label': melt.int64_feature(label)
          }))
      record_writer.write(example.SerializeToString())
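# A minimal read-back sketch for the records written by convert_to_tfrecord, using
# the tf.data API and the tf.io namespace (TF >= 1.14 / 2.x). The feature names and
# dtypes mirror the schema above; whether 'image' holds encoded bytes or a raw
# array depends on what melt.read_image returns, so any decode step is left out.
# The file name 'train.tfrecord' is only a placeholder.
import tensorflow as tf

def parse_example(serialized):
  features = {
      'id': tf.io.FixedLenFeature([], tf.string),
      'image': tf.io.FixedLenFeature([], tf.string),
      'label': tf.io.FixedLenFeature([], tf.int64),
  }
  parsed = tf.io.parse_single_example(serialized, features)
  return parsed['id'], parsed['image'], parsed['label']

dataset = tf.data.TFRecordDataset('train.tfrecord').map(parse_example)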
def deal_file(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip('\n').split('\t')
      img = l[0]
      texts = l[FLAGS.text_index].split('\x01')
      image_feature = [
          float(x) for x in l[FLAGS.image_feature_index].strip().split('\x01')
      ]
      #assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (img, len(image_feature))
      if len(image_feature) != IMAGE_FEATURE_LEN:
        print('bad line:', line)
        continue
      input_texts = l[FLAGS.input_text_index].split('\x01')
      for input_text in input_texts:
        input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
        input_word_ids = text2ids.words2ids(
            input_words,
            feed_single=FLAGS.feed_single,
            allow_all_zero=True,
            pad=False)
        if len(input_word_ids) == 0:
          continue
        input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
        if FLAGS.pad:
          input_word_ids = gezi.pad(input_word_ids)
        is_top_text = True
        for text in texts:
          if text.strip() == '':
            continue
          words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
          word_ids = text2ids.words2ids(
              words,
              feed_single=FLAGS.feed_single,
              allow_all_zero=True,
              pad=False)
          word_ids_length = len(word_ids)
          if num % 1000 == 0:
            print(img, text, word_ids, text2ids.ids2text(word_ids),
                  len(image_feature), file=sys.stderr)
          if word_ids_length == 0:
            continue
          if is_luanma(words, word_ids):
            # skip garbled (luanma) text
            print('luanma', img, text, word_ids, text2ids.ids2text(word_ids),
                  len(image_feature), file=sys.stderr)
            continue
          word_ids = word_ids[:TEXT_MAX_WORDS]
          if FLAGS.pad:
            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
          if not FLAGS.write_sequence_example:
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(image_feature),
                    'input_text_str': melt.bytes_feature(input_text),
                    'input_text': melt.int64_feature(input_word_ids),
                    'text_str': melt.bytes_feature(text),
                    'text': melt.int64_feature(word_ids),
                }))
          else:
            example = tf.train.SequenceExample(
                context=melt.features({
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(image_feature),
                    'input_text_str': melt.bytes_feature(input_text),
                    'text_str': melt.bytes_feature(text),
                }),
                feature_lists=melt.feature_lists({
                    'input_text': melt.int64_feature_list(input_word_ids),
                    'text': melt.int64_feature_list(word_ids)
                }))
          writer.write(example)

          #global counter, max_num_words, sum_words
          with record_counter.get_lock():
            record_counter.value += 1
          if word_ids_length > max_num_words.value:
            with max_num_words.get_lock():
              max_num_words.value = word_ids_length
          with sum_words.get_lock():
            sum_words.value += word_ids_length

          if FLAGS.np_save:
            assert FLAGS.threads == 1
            gtexts.append(word_ids)
            gtext_strs.append(text)
            if img not in image_labels:
              image_labels[img] = set()
            image_labels[img].add(text)

          if is_top_text:
            is_top_text = False
            with image_counter.get_lock():
              image_counter.value += 1
            if FLAGS.np_save:
              if img not in image_labels:
                image_labels[img] = set()
              image_names.append(img)
              image_features.append(image_feature)
            if FLAGS.num_max_records > 0:
              # when building a fixed valid set, only keep one click text per image
              break
      num += 1
      if num == FLAGS.num_max_records:
        break
def deal_file(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('file:', file, 'out_file:', out_file, file=sys.stderr)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num, file=sys.stderr)
      l = line.rstrip('\n').split('\t')
      img = l[0]
      texts = l[FLAGS.text_index].split('\x01')
      image_feature = [
          float(x) for x in l[FLAGS.image_feature_index].strip().split('\x01')
      ]
      #image_feature = [float(x) for x in l[FLAGS.image_feature_index].strip().split(' ')]
      #image_feature = [0.] * IMAGE_FEATURE_LEN
      assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (img, len(image_feature))
      is_top_text = True
      for text in texts:
        text = normalize.norm(text)
        if text.strip() == '':
          print('empty line', line, file=sys.stderr)
          continue
        word_ids = _text2ids(text, TEXT_MAX_WORDS)
        word_ids_length = len(word_ids)
        if num % 10000 == 0:
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                len(image_feature), file=sys.stderr)
        if len(word_ids) == 0:
          print('empty word ids!', file=sys.stderr)
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                len(image_feature), file=sys.stderr)
          continue
        #if is_luanma(words, word_ids):
        #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
        #  continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(
              feature={
                  'image_name': melt.bytes_feature(img),
                  'image_feature': melt.float_feature(image_feature),
                  'text_str': melt.bytes_feature(text),
                  'text': melt.int64_feature(word_ids),
              }))
        else:
          example = tf.train.SequenceExample(
              context=melt.features({
                  'image_name': melt.bytes_feature(img),
                  'image_feature': melt.float_feature(image_feature),
                  'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
                  {'text': melt.int64_feature_list(word_ids)}))
        writer.write(example)

        #global counter, max_num_words, sum_words
        with record_counter.get_lock():
          record_counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          assert FLAGS.threads == 1
          gtexts.append(word_ids)
          gtext_strs.append(text)
          # Deprecated: image_labels is no longer used
          if img not in image_labels:
            image_labels[img] = set()
          image_labels[img].add(text)

        if is_top_text:
          is_top_text = False
          with image_counter.get_lock():
            image_counter.value += 1
          if FLAGS.np_save:
            if img not in image_labels:
              image_labels[img] = set()
            image_names.append(img)
            if FLAGS.small_feature:
              image_features.append(image_feature)
            else:
              # actually save the pic path instead of the image feature
              image_features.append(
                  os.path.join(FLAGS.big_feature_image_dir, img.replace('/', '_')))
          if FLAGS.num_max_records > 0:
            # when building a fixed valid set, only keep one click text per image
            break
      num += 1
      if num == FLAGS.num_max_records:
        break
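# A minimal read-back sketch for the SequenceExample branch above, using the tf.io
# namespace (TF >= 1.14 / 2.x). The names mirror the schema written by deal_file;
# IMAGE_FEATURE_LEN is assumed to be 2048 here and must match whatever length the
# writer used.
import tensorflow as tf

IMAGE_FEATURE_LEN = 2048  # assumed; must match the writer

def parse_sequence_example(serialized):
  context_features = {
      'image_name': tf.io.FixedLenFeature([], tf.string),
      'image_feature': tf.io.FixedLenFeature([IMAGE_FEATURE_LEN], tf.float32),
      'text_str': tf.io.FixedLenFeature([], tf.string),
  }
  sequence_features = {
      'text': tf.io.FixedLenSequenceFeature([], tf.int64),
  }
  context, sequence = tf.io.parse_single_sequence_example(
      serialized,
      context_features=context_features,
      sequence_features=sequence_features)
  return context['image_name'], context['image_feature'], sequence['text']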
def deal_imgtextfile(file):
  """
  Since image text / encoded images are large (for 20k pics it is about 18G,
  while image features of shape (23820, 2048) take only about 373M), this is
  not used much: only when you do not want to run image retrieval metrics
  (recall@1, ...) and do not want to convert and store image binaries from
  the imgtext file during preprocessing.
  """
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('file:', file, 'out_file:', out_file, file=sys.stderr)
  assert len(pic_info_map) > 0
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num, file=sys.stderr)
      l = line.rstrip('\n').split('\t')
      img = l[0]
      if img not in pic_info_map:
        continue
      img_text = l[-1]
      encoded_image = urllib.unquote_plus(img_text)
      text_info = pic_info_map[img]
      texts = text_info.split('\x01')
      is_top_text = True
      for text in texts:
        text = normalize.norm(text)
        if text.strip() == '':
          print('empty line', line, file=sys.stderr)
          continue
        word_ids = _text2ids(text, TEXT_MAX_WORDS)
        word_ids_length = len(word_ids)
        if num % 10000 == 0:
          print(img, text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        if len(word_ids) == 0:
          print('empty word ids!', file=sys.stderr)
          print(img, text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
          continue
        #if is_luanma(words, word_ids):
        #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
        #  continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(
              feature={
                  'image_name': melt.bytes_feature(img),
                  'image_data': melt.bytes_feature(encoded_image),
                  'text_str': melt.bytes_feature(text),
                  'text': melt.int64_feature(word_ids),
              }))
        else:
          example = tf.train.SequenceExample(
              context=melt.features({
                  'image_name': melt.bytes_feature(img),
                  'image_data': melt.bytes_feature(encoded_image),
                  'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
                  {'text': melt.int64_feature_list(word_ids)}))
        writer.write(example)

        #global counter, max_num_words, sum_words
        with record_counter.get_lock():
          record_counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          assert FLAGS.threads == 1
          gtexts.append(word_ids)
          gtext_strs.append(text)
          # Deprecated: image_labels is no longer used
          if img not in image_labels:
            image_labels[img] = set()
          image_labels[img].add(text)

        if is_top_text:
          is_top_text = False
          with image_counter.get_lock():
            image_counter.value += 1
          if FLAGS.np_save:
            if img not in image_labels:
              image_labels[img] = set()
            image_names.append(img)
            ## the encoded image is too big, so skip it for evaluation? TODO
            #image_features.append(encoded_image)
            if FLAGS.image_dir:
              # actually save the pic path instead of the image feature
              image_features.append(
                  os.path.join(FLAGS.image_dir, img.replace('/', '_')))
          if FLAGS.num_max_records > 0:
            # when building a fixed valid set, only keep one click text per image
            break
      num += 1
      if num == FLAGS.num_max_records:
        break
def build_features(index):
  mode = 'train' if 'train' in FLAGS.input else 'test'
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file

  total = len(examples)
  start, end = gezi.get_fold(total, FLAGS.num_records, index)

  ids = examples['id'].values[start:end]
  comments = examples['comment_text'].values[start:end]

  try:
    labels = examples[CLASSES].values[start:end]
  except Exception:
    labels = [[0.] * len(CLASSES)] * len(ids)

  with melt.tfrecords.Writer(out_file) as writer:
    for id, comment, label in tqdm(zip(ids, comments, labels)):
      comment_str = comment
      # TODO use info
      doc = tokenizer.tokenize(comment)
      comment_tokens, tokens_info = doc.tokens, doc.attributes

      for i in range(len(tokens_info)):
        tokens_info[i] = list(map(float, tokens_info[i]))

      if FLAGS.comment_limit:
        comment_tokens = comment_tokens[:FLAGS.comment_limit]
        tokens_info = tokens_info[:FLAGS.comment_limit]

      tokens_info = np.array(tokens_info)
      tokens_info = tokens_info.reshape(-1)
      tokens_info = list(tokens_info)

      assert len(tokens_info) == len(comment_tokens) * len(attribute_names)

      comment_ids = [get_id(token, vocab) for token in comment_tokens]
      comment_tokens_str = '|'.join([vocab.key(id) for id in comment_ids])

      label = list(map(float, label))

      comment_chars = [list(token) for token in comment_tokens]
      char_ids = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      for i, token in enumerate(comment_chars):
        for j, ch in enumerate(token):
          if j == FLAGS.char_limit:
            break
          char_ids[i, j] = get_char_id(ch, char_vocab)
      char_ids = list(char_ids.reshape(-1))
      #print(char_ids)

      simple_char_ids = []
      num_chs = 0
      for ch in list(comment):
        id_ = get_char_id(ch, char_vocab)
        #if id_ == char_vocab.unk_id():
        #  continue
        simple_char_ids.append(id_)
        if len(simple_char_ids) == FLAGS.simple_char_limit:
          break
      simple_chars_str = ''.join([char_vocab.key(id) for id in simple_char_ids])
      #print(simple_char_ids, simple_chars_str)

      record = tf.train.Example(features=tf.train.Features(
          feature={
              "comment": melt.int64_feature(comment_ids),
              "tokens_info": melt.float_feature(tokens_info),
              "comment_chars": melt.int64_feature(char_ids),
              "simple_chars": melt.int64_feature(simple_char_ids),
              "simple_chars_str": melt.bytes_feature(simple_chars_str),
              "classes": melt.float_feature(label),
              "id": melt.bytes_feature(id),
              "comment_str": melt.bytes_feature(comment_str),
              "comment_tokens_str": melt.bytes_feature(comment_tokens_str)
          }))
      writer.write(record)
      global counter
      with counter.get_lock():
        counter.value += 1
    print("Build {} instances of features in total".format(writer.size()))
    writer.close()
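# gezi.get_fold is not shown in this file. Given how it is called above (total item
# count, number of shards, shard index), a plausible reading is an even split that
# returns the [start, end) range owned by this shard. This is only an assumption
# about the helper, not the actual gezi implementation.
def get_fold(total, num_folds, index):
  fold_size = -(-total // num_folds)  # ceiling division
  start = fold_size * index
  end = min(start + fold_size, total)
  return start, end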
def _parse_line(line, writer, thread_index=0):
  l = line.rstrip().split('\t')
  image_name = l[0]
  image_feature = [float(x) for x in l[1:]]
  if image_name not in text_map:
    print('image ', image_name, 'ignore ', 'name_len ', len(image_name),
          len(image_name.strip()))
    return
  else:
    image_path = FLAGS.image_dir + '/' + image_name
    #print(image_path)
    if FLAGS.write_raw_image_bytes:
      with tf.gfile.FastGFile(image_path, "r") as f:
        encoded_image = f.read()
    else:
      encoded_image = ''
    #---------below will hang if multi process
    #try:
    #  decoder.decode_jpeg(encoded_image)
    #except (tf.errors.InvalidArgumentError, AssertionError):
    #  print("Skipping file with invalid JPEG data: %s" % image_path)
    #  return
    for text, ori_text in text_map[image_name]:
      word_ids = [
          vocabulary.id(word) for word in text.split(WORDS_SEP)
          if vocabulary.has(word) or ENCODE_UNK
      ]
      if not word_ids:
        continue
      word_ids_length = len(word_ids)
      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)

      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'image_name': melt.bytes_feature(image_name),
                'image_data': melt.bytes_feature(encoded_image),
                'image_feature': melt.float_feature(image_feature),
                'text': melt.int64_feature(word_ids),
                'text_str': melt.bytes_feature(ori_text),
            }))
      else:
        example = tf.train.SequenceExample(
            context=melt.features({
                'image_name': melt.bytes_feature(image_name),
                'image_data': melt.bytes_feature(encoded_image),
                'image_feature': melt.float_feature(image_feature),
                'text_str': melt.bytes_feature(ori_text),
            }),
            feature_lists=melt.feature_lists({
                'text': melt.int64_feature_list(word_ids)
            }))

      if FLAGS.np_save:
        gtexts[thread_index].append(word_ids)
        gtext_strs[thread_index].append(ori_text)

      # NOTICE: not tested here for num_threads > 1
      if FLAGS.num_records:
        if image_name not in images:
          images[image_name] = 1
          print(image_name, len(images))
          writer.write(example.SerializeToString())
          if len(images) == FLAGS.num_records:
            print('Done')
            exit(1)
      else:
        writer.write(example.SerializeToString())
        global counter, max_num_words, sum_words
        with counter.get_lock():
          counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length
def deal_file(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip('\n').split('\t')
      text = l[FLAGS.text_index]
      input_text = l[FLAGS.input_text_index]

      input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
      input_word_ids = text2ids.words2ids(
          input_words,
          feed_single=FLAGS.feed_single,
          allow_all_zero=True,
          pad=False)
      if len(input_word_ids) == 0:
        continue
      input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
      if FLAGS.pad:
        input_word_ids = gezi.pad(input_word_ids)

      words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
      word_ids = text2ids.words2ids(
          words,
          feed_single=FLAGS.feed_single,
          allow_all_zero=True,
          pad=False)
      word_ids_length = len(word_ids)
      if num % 1000 == 0:
        print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
      if word_ids_length == 0:
        continue
      if is_luanma(words, word_ids):
        print('luanma', text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        continue
      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'input_text_str': melt.bytes_feature(input_text),
                'input_text': melt.int64_feature(input_word_ids),
                'text_str': melt.bytes_feature(text),
                'text': melt.int64_feature(word_ids),
            }))
      else:
        example = tf.train.SequenceExample(
            context=melt.features({
                'input_text_str': melt.bytes_feature(input_text),
                'text_str': melt.bytes_feature(text),
            }),
            feature_lists=melt.feature_lists({
                'input_text': melt.int64_feature_list(input_word_ids),
                'text': melt.int64_feature_list(word_ids)
            }))
      writer.write(example)

      #global counter, max_num_words, sum_words
      with record_counter.get_lock():
        record_counter.value += 1
      if word_ids_length > max_num_words.value:
        with max_num_words.get_lock():
          max_num_words.value = word_ids_length
      with sum_words.get_lock():
        sum_words.value += word_ids_length

      if FLAGS.np_save:
        assert FLAGS.threads == 1
        gtexts.append(word_ids)
        gtext_strs.append(text)

      num += 1
      if num == FLAGS.num_max_records:
        break
def deal_file(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip('\n').split('\t')

      cs = l[0]  # cs
      simid = l[3]
      objurl = l[1]
      fromurl = l[2]
      keyword = l[4].split('\x01')[0]
      extended_keyword = l[5].split('\x01')[0]

      img = objurl
      #img = cs

      idl4w_end = IDL4W_FEATURE_LEN + 6
      idl4w_feature = [float(x) for x in l[6:idl4w_end]]

      titles = l[idl4w_end + 1]
      descs = l[idl4w_end + 2]
      inception_feature = [float(x) for x in l[idl4w_end + 3:]]
      assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
          len(inception_feature), cs)

      click_query = l[idl4w_end]
      show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
          click_query, extended_keyword, keyword, titles, descs)

      if click_query == 'noclickquery':
        click_query = ''
        # TODO now only consider click_query
        continue
      else:
        click_queries = click_query.split('$*$')
        is_top_text = True
        for click_query in click_queries:
          if click_query.strip() == '':
            continue
          text_str = '{} {}'.format(click_query, show_str)
          text = click_query
          words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
          word_ids = text2ids.words2ids(
              words,
              feed_single=FLAGS.feed_single,
              allow_all_zero=True,
              pad=False)
          word_ids_length = len(word_ids)
          if num % 1000 == 0:
            print(cs, simid, text, word_ids, text2ids.ids2text(word_ids),
                  len(idl4w_feature), len(inception_feature), file=sys.stderr)
          if len(word_ids) == 0:
            continue
          if is_bad(words, word_ids):
            #print('luan_ma', cs, simid, text, word_ids, text2ids.ids2text(word_ids), len(idl4w_feature), len(inception_feature), file=sys.stderr)
            continue
          word_ids = word_ids[:TEXT_MAX_WORDS]
          if FLAGS.pad:
            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

          if not FLAGS.write_sequence_example:
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(img),
                    'idl4w_feature': melt.float_feature(idl4w_feature),
                    'inception_feature': melt.float_feature(inception_feature),
                    'text_str': melt.bytes_feature(text_str),
                    'text': melt.int64_feature(word_ids),
                }))
          else:
            example = tf.train.SequenceExample(
                context=melt.features({
                    'image_name': melt.bytes_feature(img),
                    'idl4w_feature': melt.float_feature(idl4w_feature),
                    'inception_feature': melt.float_feature(inception_feature),
                    'text_str': melt.bytes_feature(text_str),
                }),
                feature_lists=melt.feature_lists(
                    {'text': melt.int64_feature_list(word_ids)}))
          writer.write(example)

          #global counter, max_num_words, sum_words
          with record_counter.get_lock():
            record_counter.value += 1
          if word_ids_length > max_num_words.value:
            with max_num_words.get_lock():
              max_num_words.value = word_ids_length
          with sum_words.get_lock():
            sum_words.value += word_ids_length

          if FLAGS.np_save:
            assert FLAGS.threads == 1
            texts.append(word_ids)
            text_strs.append(text)
            if img not in image_labels:
              image_labels[img] = set()
            image_labels[img].add(text)

          if is_top_text:
            is_top_text = False
            with image_counter.get_lock():
              image_counter.value += 1
            if FLAGS.np_save:
              if img not in image_labels:
                image_labels[img] = set()
              image_names.append(img)
              #image_features.append(image_feature)
              idl4w_features.append(idl4w_feature)
              inception_features.append(inception_feature)
            if FLAGS.num_max_records > 0:
              # when building a fixed valid set, only keep one click text per image
              break

      num += 1
      if num == FLAGS.num_max_records:
        break
def build_features(index):
  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file

  total = len(examples)
  if not FLAGS.has_dup:
    start, end = gezi.get_fold(total, FLAGS.num_records, index)
  else:
    start, end = get_fold(examples['id'].values, index)

  ids = examples['id'].values[start:end]
  ids = list(map(str, ids))
  comments = examples['comment_text'].values[start:end]
  tokens_list = examples['tokens'].values[start:end]
  tokens_infos = examples['attributes'].values[start:end]
  # TODO change to poses
  poses = examples['poses'].values[start:end]
  tags = examples['tags'].values[start:end]
  ners = examples['ners'].values[start:end]
  ori_tokens_list = examples['ori_tokens'].values[start:end]

  try:
    labels = examples[CLASSES].values[start:end]
  except Exception:
    labels = [[0.] * len(CLASSES)] * len(ids)

  with melt.tfrecords.Writer(out_file) as writer:
    for id, comment, label, comment_tokens, ori_tokens, tokens_info, pos, tag, ner in tqdm(
        zip(ids, comments, labels, tokens_list, ori_tokens_list, tokens_infos,
            poses, tags, ners)):
      if not isinstance(comment, str):
        comment = 'ok'
      comment_str = comment

      comment_tokens = comment_tokens.split(' ')
      tokens_info = tokens_info.split(' ')
      pos = pos.split(' ')
      tag = tag.split(' ')
      ner = ner.split(' ')
      ori_tokens = ori_tokens.split(' ')

      if FLAGS.comment_limit:
        comment_tokens = comment_tokens[:FLAGS.comment_limit]
        ori_tokens = ori_tokens[:FLAGS.comment_limit]
        tokens_info = tokens_info[:len(attribute_names) * FLAGS.comment_limit]

      pos_ids = [get_char_id(x, pos_vocab) for x in pos]
      tag_ids = [get_char_id(x, tag_vocab) for x in tag]
      ner_ids = [get_char_id(x, ner_vocab) for x in ner]

      # NOTICE comment_ids use vocab (built from all train + test words, so no unk)
      if not FLAGS.lower:
        comment_ids = [get_id(token, vocab) for token in comment_tokens]
        #comment_ids_withunk = [get_id(token, unk_vocab) for token in comment_tokens]
      else:
        comment_ids = [get_id(token.lower(), vocab) for token in comment_tokens]
        #comment_ids_withunk = [get_id(token.lower(), unk_vocab) for token in comment_tokens]

      comment_tokens_str = '|'.join([vocab.key(id) for id in comment_ids])

      label = list(map(float, label))
      tokens_info = list(map(float, tokens_info))
      #print(len(comment_ids), len(tokens_info) / len(attribute_names), len(tokens_info) / len(comment_ids))
      assert len(tokens_info) == len(attribute_names) * len(comment_ids), '%d %f' % (
          len(comment_ids), len(tokens_info) / len(attribute_names))

      #comment_chars = [list(token) for token in comment_tokens]
      # CHANGED to use ori tokens, so fu**ck keeps the ** chars; but a token like
      # NiggerMan split into Nigger Man would have its chars encoded from NiggerMan twice
      chars_list = [list(token) for token in ori_tokens]
      char_ids = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      assert len(comment_ids) == len(chars_list), '{} {} {} {} {}'.format(
          len(comment_ids), len(chars_list), comment, comment_tokens, ori_tokens)
      for i, chars in enumerate(chars_list):
        for j, ch in enumerate(chars):
          if j == FLAGS.char_limit:
            break
          char_ids[i, j] = get_char_id(ch, char_vocab)
      char_ids = list(char_ids.reshape(-1))
      #print(char_ids)

      # -------------- simple char
      simple_char_ids = []
      for ch in list(comment):
        id_ = get_char_id(ch, char_vocab)
        #if id_ == char_vocab.unk_id():
        #  continue
        simple_char_ids.append(id_)
        if len(simple_char_ids) == FLAGS.simple_char_limit:
          break
      simple_chars_str = ''.join([char_vocab.key(id) for id in simple_char_ids])
      #print(simple_char_ids, simple_chars_str)

      # # -------------- simple ngram
      # simple_ngrams = gezi.get_ngrams(comment)
      # simple_ngrams = simple_ngrams[:FLAGS.simple_char_limit * 5]
      # simple_ngram_ids = [get_ngram_id(ngram, ngram_vocab) for ngram in simple_ngrams]

      # -------------- ngram
      ngram_ids_list = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      if not FLAGS.ftngram:
        #ngrams_list = [gezi.get_ngrams(token) for token in ori_tokens]
        if not FLAGS.ngram_lower:
          ngrams_list = [
              gezi.get_ngrams(token, FLAGS.ngram_min, FLAGS.ngram_max)
              for token in comment_tokens
          ]
        else:
          ngrams_list = [
              gezi.get_ngrams(token.lower(), FLAGS.ngram_min, FLAGS.ngram_max)
              for token in comment_tokens
          ]
        for i, ngrams in enumerate(ngrams_list):
          for j, ngram in enumerate(ngrams):
            if j == FLAGS.char_limit:
              break
            #assert get_ngram_id(ngram, ngram_vocab) < 20003
            ngram_ids_list[i, j] = get_ngram_id(ngram, ngram_vocab)
      else:
        #for i, (token, ori_token) in enumerate(zip(comment_tokens, ori_tokens)):
        for i, (token, ori_token) in enumerate(zip(comment_tokens, comment_tokens)):
          ngram_ids = gezi.fasttext_ids(ori_token, vocab, FLAGS.ngram_buckets,
                                        FLAGS.ngram_min, FLAGS.ngram_max)
          if len(ngram_ids) >= FLAGS.char_limit:
            ngram_ids = gezi.fasttext_ids(token, vocab, FLAGS.ngram_buckets,
                                          FLAGS.ngram_min, FLAGS.ngram_max)
          ngram_ids = ngram_ids[:FLAGS.char_limit]
          for j, ngram_id in enumerate(ngram_ids):
            ngram_ids_list[i, j] = ngram_id
      ngram_ids = list(ngram_ids_list.reshape(-1))

      # # --------------- fngrams (full ngrams)
      # fngrams_list = [gezi.get_ngrams_hash(token, FLAGS.ngram_buckets, 3, 6, reserve=3) for token in ori_tokens]
      # fngram_ids = np.zeros([len(comment_ids), FLAGS.ngram_limit], dtype=np.int32)
      # for i, fngrams in enumerate(fngrams_list):
      #   for j, fngram in enumerate(fngrams):
      #     if j == FLAGS.ngram_limit:
      #       break
      #     fngram_ids[i, j] = fngram
      # fngram_ids = list(fngram_ids.reshape(-1))

      # global info per comment: 7 features
      comment_info = []
      comment_info.append(len(ori_tokens))
      comment_info.append(len(comment_tokens))
      #comment_len = sum(len(x) for x in ori_tokens)
      comment_len = len(comment_str)
      comment_info.append(comment_len)
      comment_info.append(comment_len / (len(ori_tokens) + 1))
      num_unks = len([x for x in comment_ids if x == vocab.unk_id()])
      comment_info.append(num_unks)
      comment_info.append(num_unks / len(comment_tokens))
      comment_info.append(enprob_dict[id])

      record = tf.train.Example(features=tf.train.Features(
          feature={
              "comment": melt.int64_feature(comment_ids),
              #"comment_withunk": melt.int64_feature(comment_ids_withunk),
              "tokens_info": melt.float_feature(tokens_info),
              "comment_info": melt.float_feature(comment_info),
              "pos": melt.int64_feature(pos_ids),
              "tag": melt.int64_feature(tag_ids),
              "ner": melt.int64_feature(ner_ids),
              "comment_chars": melt.int64_feature(char_ids),
              "comment_ngrams": melt.int64_feature(ngram_ids),
              "simple_chars": melt.int64_feature(simple_char_ids),
              #"simple_ngrams": melt.int64_feature(simple_ngram_ids),
              #"comment_fngrams": melt.int64_feature(fngram_ids),
              #"simple_chars_str": melt.bytes_feature(simple_chars_str),
              "classes": melt.float_feature(label),
              "id": melt.bytes_feature(id),
              "weight": melt.float_feature([FLAGS.weight]),
              "comment_str": melt.bytes_feature(comment_str),
              "comment_tokens_str": melt.bytes_feature(comment_tokens_str)
          }))
      writer.write(record)
      global counter
      with counter.get_lock():
        counter.value += 1
    print("Build {} instances of features in total".format(writer.size()))
    writer.close()
def build_features(index):
  mode = get_mode(FLAGS.input)

  start_index = 0 if not FLAGS.use_fold else 1
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(
      mode, index + start_index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file

  total = len(df)
  num_records = FLAGS.num_records_
  if mode in ['valid', 'test', 'dev', 'pm']:
    num_records = 1
  start, end = gezi.get_fold(total, num_records, index)

  print('infile', FLAGS.input, 'out_file', out_file)

  max_len = 0
  max_num_ids = 0
  num = 0
  with melt.tfrecords.Writer(out_file) as writer:
    for i in range(start, end):
      try:
        row = df.iloc[i]
        id = row[0]
        content = row[1]
        #print(content, type(content))
        if len(content) > max_len:
          max_len = len(content)
          print('max_len', max_len)
        if len(content) > 3000:
          print(id, content)
          if mode not in ['test', 'valid']:
            continue
        label = list(row[2:])
        #label = [x + 2 for x in label]
        #num_labels = len(label)

        content_ids = text2ids_(content)
        if len(content_ids) < 5 and mode not in ['test', 'valid']:
          continue

        limit = FLAGS.limit
        if len(content_ids) > max_num_ids:
          max_num_ids = len(content_ids)
          print('max_num_ids', max_num_ids)
        content_ids = content_ids[:limit]

        feature = {
            'id': melt.bytes_feature(str(id)),
            'label': melt.int64_feature(label),
            'content': melt.int64_feature(content_ids),
            'content_str': melt.bytes_feature(content),
            'source': melt.bytes_feature(mode),
        }

        # TODO currently we do not get exact info on whether 1 image or 3 were shown ...
        record = tf.train.Example(features=tf.train.Features(feature=feature))

        if num % 1000 == 0:
          print(num)
        writer.write(record)
        num += 1
        global counter
        with counter.get_lock():
          counter.value += 1
        global total_words
        with total_words.get_lock():
          total_words.value += len(content_ids)
      except Exception:
        #print(traceback.format_exc(), file=sys.stderr)
        pass
def build_features(file_):
  mode = get_mode(FLAGS.input)
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}_{2}.tfrecord'.format(
      mode, os.path.basename(os.path.dirname(file_)), os.path.basename(file_))
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('infile', file_, 'out_file', out_file)

  num = 0
  num_whether = 0
  answer_len = 0
  with melt.tfrecords.Writer(out_file) as writer:
    for line in open(file_):
      try:
        m = json.loads(line.rstrip('\n'))
        url = m['url']
        alternatives = m['alternatives']
        query_id = int(m['query_id'])
        passage = m['passage']
        query = m['query']
        # if query_id != 254146:
        #   continue
        if not 'answer' in m:
          answer = 'unknown'
        else:
          answer = m['answer']

        # candidates is neg,pos,uncertain
        # type 0 means true or false, type 1 means whether
        candidates, type = sort_alternatives(alternatives, query)
        assert candidates is not None

        answer_id = 0
        for i, candidate in enumerate(candidates):
          if candidate == answer:
            answer_id = i
        assert candidates is not None
        candidates_str = '|'.join(candidates)

        query_ids = text2ids_(query)
        passage_ids = text2ids_(passage)
        candidate_neg_ids = text2ids_(candidates[0])
        candidate_pos_ids = text2ids_(candidates[1])
        candidate_na_ids = text2ids_('无法确定')  # '无法确定' means 'cannot be determined'

        if len(candidate_pos_ids) > answer_len:
          answer_len = len(candidate_pos_ids)
          print(answer_len)
        if len(candidate_neg_ids) > answer_len:
          answer_len = len(candidate_neg_ids)
          print(answer_len)

        assert len(query_ids), line
        assert len(passage_ids), line

        limit = FLAGS.limit
        if len(passage_ids) > limit:
          print('long line', len(passage_ids), query_id)
        query_ids = query_ids[:limit]
        passage_ids = passage_ids[:limit]

        feature = {
            'id': melt.bytes_feature(str(query_id)),
            'url': melt.bytes_feature(url),
            'alternatives': melt.bytes_feature(alternatives),
            'candidates': melt.bytes_feature(candidates_str),
            'passage': melt.int64_feature(passage_ids),
            'passage_str': melt.bytes_feature(passage),
            'query': melt.int64_feature(query_ids),
            'query_str': melt.bytes_feature(query),
            'candidate_neg': melt.int64_feature(candidate_neg_ids),
            'candidate_pos': melt.int64_feature(candidate_pos_ids),
            'candidate_na': melt.int64_feature(candidate_na_ids),
            'answer': melt.int64_feature(answer_id),
            'answer_str': melt.bytes_feature(answer),
            'type': melt.int64_feature(type)
        }

        record = tf.train.Example(features=tf.train.Features(feature=feature))

        #if not candidates:
        if num % 1000 == 0:
          print(num, query_id, query, type)
          print(alternatives, candidates)
          print(answer, answer_id)

        writer.write(record)
        num += 1
        if type:
          num_whether += 1
        global counter
        with counter.get_lock():
          counter.value += 1
        global total_words
        with total_words.get_lock():
          total_words.value += len(passage_ids)
        if FLAGS.max_examples and num >= FLAGS.max_examples:
          break
      except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print('-----------', query)
        print(alternatives)
        #break
  print('num_whether:', num_whether)
def build_features(file_):
  mode = get_mode(FLAGS.input)
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.tfrecord'.format(
      mode, os.path.basename(file_).split('_')[-1])
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('infile', file_, 'out_file', out_file)

  max_len = 0
  num = 0
  num_whether = 0
  answer_len = 0
  with melt.tfrecords.Writer(out_file) as writer:
    for line in open(file_):
      try:
        m = json.loads(line.rstrip('\n'))
        url = m['url']
        alternatives = m['alternatives']
        query_id = int(m['query_id'])
        passage = m['passage']
        query = m['query']
        # if query_id != 254146:
        #   continue
        if not 'answer' in m:
          answer = 'unknown'
        else:
          answer = m['answer']

        # candidates is neg,pos,uncertain
        # type 0 means true or false, type 1 means whether
        candidates, type = sort_alternatives(alternatives, query)
        assert candidates is not None

        answer_id = 0
        for i, candidate in enumerate(candidates):
          if candidate == answer:
            answer_id = i
        assert candidates is not None
        candidates_str = '|'.join(candidates)

        pos = None
        words = m['seg_query'].split('\x09')
        if '|' in words[0]:
          try:
            l = [x.split('|') for x in words]
            words, pos = list(zip(*l))
          except Exception:
            print(m['seg_query'].split('\x09'))
        if FLAGS.add_start_end_:
          words = gezi.add_start_end(words)
        if pos:
          if FLAGS.add_start_end_:
            pos = gezi.add_start_end(pos)
        query_ids = [vocab.id(x) for x in words]
        query_pos_ids = get_pos_ids(pos)
        query_char_ids = get_char_ids(words)

        pos = None
        words = m['seg_passage'].split('\x09')
        if '|' in words[0]:
          try:
            l = [x.split('|') for x in words]
            words, pos = list(zip(*l))
          except Exception:
            print(m['seg_passage'].split('\x09'))
        if FLAGS.add_start_end_:
          words = gezi.add_start_end(words)
        if pos:
          if FLAGS.add_start_end_:
            pos = gezi.add_start_end(pos)
        passage_ids = [vocab.id(x) for x in words]
        passage_pos_ids = get_pos_ids(pos)
        passage_char_ids = get_char_ids(words)

        alternatives_list = alternatives.split('|')
        alternatives_segs = m['seg_alternatives'].split('|')

        for i, candidate in enumerate(candidates):
          index = alternatives_list.index(candidate)
          segs = alternatives_segs[index]
          words = segs.split('\x09')
          pos = None
          if '|' in words[0]:
            l = [x.split('|') for x in words]
            words, pos = list(zip(*l))
          if FLAGS.add_start_end_:
            words = gezi.add_start_end(words)
          if pos:
            if FLAGS.add_start_end_:
              pos = gezi.add_start_end(pos)
          if i == 0:
            candidate_neg_ids = [vocab.id(x) for x in words]
            candidate_neg_pos_ids = get_pos_ids(pos)
            candidate_neg_char_ids = get_char_ids(words)
          elif i == 1:
            candidate_pos_ids = [vocab.id(x) for x in words]
            candidate_pos_pos_ids = get_pos_ids(pos)
            candidate_pos_char_ids = get_char_ids(words)
          else:
            # the 'uncertain' candidate (无法确定, 'cannot be determined')
            candidate_na_ids = [vocab.id(x) for x in words]
            candidate_na_pos_ids = get_pos_ids(pos)
            candidate_na_char_ids = get_char_ids(words)

        if len(candidate_pos_ids) > answer_len:
          answer_len = len(candidate_pos_ids)
          print(answer_len)
        if len(candidate_neg_ids) > answer_len:
          answer_len = len(candidate_neg_ids)
          print(answer_len)

        assert len(query_ids), line
        assert len(passage_ids), line

        limit = FLAGS.limit
        if len(passage_ids) > limit:
          print('long line', len(passage_ids), query_id)
        if len(passage_ids) > max_len:
          max_len = len(passage_ids)
          print('max_len', max_len)
        query_ids = query_ids[:limit]
        passage_ids = passage_ids[:limit]

        feature = {
            'id': melt.bytes_feature(str(query_id)),
            'url': melt.bytes_feature(url),
            'alternatives': melt.bytes_feature(alternatives),
            'candidates': melt.bytes_feature(candidates_str),
            'passage': melt.int64_feature(passage_ids),
            'passage_char': melt.int64_feature(passage_char_ids),
            'passage_pos': melt.int64_feature(passage_pos_ids),
            'passage_str': melt.bytes_feature(passage),
            'query': melt.int64_feature(query_ids),
            'query_char': melt.int64_feature(query_char_ids),
            'query_pos': melt.int64_feature(query_pos_ids),
            'query_str': melt.bytes_feature(query),
            'candidate_neg': melt.int64_feature(candidate_neg_ids),
            'candidate_neg_char': melt.int64_feature(candidate_neg_char_ids),
            'candidate_neg_pos': melt.int64_feature(candidate_neg_pos_ids),
            'candidate_pos': melt.int64_feature(candidate_pos_ids),
            'candidate_pos_char': melt.int64_feature(candidate_pos_char_ids),
            'candidate_pos_pos': melt.int64_feature(candidate_pos_pos_ids),
            'candidate_na': melt.int64_feature(candidate_na_ids),
            'candidate_na_char': melt.int64_feature(candidate_na_char_ids),
            'candidate_na_pos': melt.int64_feature(candidate_na_pos_ids),
            'answer': melt.int64_feature(answer_id),
            'answer_str': melt.bytes_feature(answer),
            'type': melt.int64_feature(type)
        }

        record = tf.train.Example(features=tf.train.Features(feature=feature))

        #if not candidates:
        if num % 1000 == 0:
          print(num, query_id, query, type)
          print(alternatives, candidates)
          print(answer, answer_id)

        writer.write(record)
        num += 1
        if type:
          num_whether += 1
        global counter
        with counter.get_lock():
          counter.value += 1
        global total_words
        with total_words.get_lock():
          total_words.value += len(passage_ids)
        if FLAGS.max_examples and num >= FLAGS.max_examples:
          break
      except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print('-----------', query)
        print(alternatives)
        #break
  print('num_whether:', num_whether)