        continue
      word_ids = text2ids.text2ids(text,
                                   seg_method=FLAGS.seg_method,
                                   feed_single=FLAGS.feed_single,
                                   allow_all_zero=True,
                                   pad=False)
      word_ids_length = len(word_ids)
      if num % 1000 == 0:
        #print(libgezi.gbk2utf8('\t'.join(words)), file=sys.stderr)
        #print('\t'.join(words), file=sys.stderr)
        print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
      if len(word_ids) == 0:
        continue
      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
      if writer is not None:
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'image_name': melt.bytes_feature(img),
                'image_feature': melt.float_feature(img_feature),
                'text': melt.int_feature(word_ids),
                'text_str': melt.bytes_feature(text),
            }))
        writer.write(example)
      else:
        count += 1
  if FLAGS.mode != 1:
    if writer is not None:
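
# The snippet above truncates ids to TEXT_MAX_WORDS and then right-pads with 0.
# A minimal sketch of the gezi.pad contract assumed throughout these scripts
# (gezi is this repo's utility library; this standalone version is illustrative,
# not the actual implementation):
def pad_sketch(ids, max_len, pad_id=0):
  """Right-pad ids with pad_id up to max_len; truncate if already longer."""
  if len(ids) >= max_len:
    return ids[:max_len]
  return ids + [pad_id] * (max_len - len(ids))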
texts = np.load(FLAGS.dir + '/texts.npy')
text_strs = np.load(FLAGS.dir + '/text_strs.npy')

distinct_texts = []
distinct_text_strs = []

maxlen = 0
for text in texts:
  if len(text) > maxlen:
    maxlen = len(text)

text_set = set()
for text, text_str in zip(list(texts), list(text_strs)):
  if text_str not in text_set:
    text_set.add(text_str)
    distinct_texts.append(gezi.pad(text, maxlen))
    distinct_text_strs.append(text_str)
    if len(distinct_texts) == FLAGS.max_texts:
      print('stop at', FLAGS.max_texts, file=sys.stderr)
      break

print('num ori texts:', len(texts))
print('num distinct texts:', len(distinct_texts))

distinct_texts = np.array(distinct_texts)
distinct_text_strs = np.array(distinct_text_strs)
if FLAGS.shuffle:
  distinct_texts, distinct_text_strs = gezi.unison_shuffle(
      distinct_texts, distinct_text_strs)
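
# gezi.unison_shuffle is assumed to shuffle both arrays with the same random
# permutation, so texts and their string forms stay aligned. A hedged numpy
# sketch of that behavior (illustrative, not the repo's implementation):
import numpy as np

def unison_shuffle_sketch(a, b, seed=None):
  assert len(a) == len(b)
  perm = np.random.default_rng(seed).permutation(len(a))  # one permutation for both
  return a[perm], b[perm]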
IMAGE_FEATURE_LEN = 1000

predictor = melt.Predictor('./model.ckpt-12000')

vocabulary.init()
vocab = vocabulary.vocab
#vocab = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)

ids_list = []
text_list = []
for line in open('./test.txt'):
  text = line.strip().split('\t')[-1]
  text_list.append(text)
  ids = [vocab.id(word) for word in text.split(WORDS_SEP)
         if vocab.has(word) or ENCODE_UNK]
  ids = gezi.pad(ids, TEXT_MAX_WORDS)
  ids_list.append(ids)
#ids_list = np.array(ids_list)

def bulk_predict(predictor, images, texts):
  scores = predictor.inference('score', {
      '%s/%s' % (FLAGS.algo, FLAGS.image_feature_place): images,
      '%s/%s' % (FLAGS.algo, FLAGS.text_place): texts
  })
  return scores

def predict():
  for line in sys.stdin:
    l = line.strip().split('\t')
    image_name = l[0]
    #image_feature = np.array([[float(x) for x in l[1:]]])
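
# Hedged usage sketch for bulk_predict (illustrative only): score every
# candidate text against a single image by tiling its feature vector so the
# image and text batches line up. The random image feature is a stand-in, and
# one score per (image, text) pair is assumed.
import numpy as np

image = np.random.rand(1, IMAGE_FEATURE_LEN).astype(np.float32)
images = np.tile(image, (len(ids_list), 1))       # one row per candidate text
scores = bulk_predict(predictor, images, np.array(ids_list))
top = np.argsort(-np.squeeze(scores))[:10]        # indices of the 10 best texts
for i in top:
  print(text_list[i])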
def deal_file(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip('\n').split('\t')
      img = l[0]
      texts = l[FLAGS.text_index].split('\x01')
      image_feature = [
          float(x) for x in l[FLAGS.image_feature_index].strip().split('\x01')
      ]
      #assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (img, len(image_feature))
      if len(image_feature) != IMAGE_FEATURE_LEN:
        print('bad line:', line)
        continue

      is_top_text = True
      for text in texts:
        if text.strip() == '':
          continue
        words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
        word_ids = text2ids.words2ids(words,
                                      feed_single=FLAGS.feed_single,
                                      allow_all_zero=True,
                                      pad=False)
        word_ids_length = len(word_ids)
        if num % 1000 == 0:
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                len(image_feature), file=sys.stderr)
        if len(word_ids) == 0:
          continue
        #luanma (乱码) means garbled/mojibake text
        if is_luanma(words, word_ids):
          print('luanma', img, text, word_ids, text2ids.ids2text(word_ids),
                len(image_feature), file=sys.stderr)
          continue

        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(
              feature={
                  'image_name': melt.bytes_feature(img),
                  'image_feature': melt.float_feature(image_feature),
                  'text_str': melt.bytes_feature(text),
                  'text': melt.int64_feature(word_ids),
              }))
        else:
          example = tf.train.SequenceExample(
              context=melt.features({
                  'image_name': melt.bytes_feature(img),
                  'image_feature': melt.float_feature(image_feature),
                  'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
                  {'text': melt.int64_feature_list(word_ids)}))
        writer.write(example)

        #global counter, max_num_words, sum_words
        with record_counter.get_lock():
          record_counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          assert FLAGS.threads == 1
          gtexts.append(word_ids)
          gtext_strs.append(text)
          if img not in image_labels:
            image_labels[img] = set()
          image_labels[img].add(text)

        if is_top_text:
          is_top_text = False
          with image_counter.get_lock():
            image_counter.value += 1
          if FLAGS.np_save:
            if img not in image_labels:
              image_labels[img] = set()
            image_names.append(img)
            image_features.append(image_feature)
          if FLAGS.num_max_records > 0:
            #if fixed valid, only keep one click per image
            break

      num += 1
      if num == FLAGS.num_max_records:
        break
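
# Hedged reader sketch for the records written above, covering the
# tf.train.Example branch (the SequenceExample branch would use
# tf.io.parse_single_sequence_example instead). Standard TF APIs only; the
# shard filename is hypothetical, and with FLAGS.pad on, 'text' could instead
# be a FixedLenFeature of length TEXT_MAX_WORDS.
import tensorflow as tf

def parse_fn(serialized):
  schema = {
      'image_name': tf.io.FixedLenFeature([], tf.string),
      'image_feature': tf.io.FixedLenFeature([IMAGE_FEATURE_LEN], tf.float32),
      'text': tf.io.VarLenFeature(tf.int64),
      'text_str': tf.io.FixedLenFeature([], tf.string),
  }
  parsed = tf.io.parse_single_example(serialized, schema)
  parsed['text'] = tf.sparse.to_dense(parsed['text'])  # densify the id list
  return parsed

ds = tf.data.TFRecordDataset(['train-00000'])  # hypothetical shard name
ds = ds.map(parse_fn)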
def deal_file(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('file:', file, 'out_file:', out_file, file=sys.stderr)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num, file=sys.stderr)
      l = line.rstrip('\n').split('\t')
      img = l[0]
      texts = l[FLAGS.text_index].split('\x01')

      image_path = os.path.join(FLAGS.image_dir, img.replace('/', '_'))
      encoded_image = melt.read_image(image_path)

      is_top_text = True
      for text in texts:
        if text.strip() == '':
          print('empty line', line, file=sys.stderr)
          continue

        word_ids = _text2ids(text, TEXT_MAX_WORDS)
        word_ids_length = len(word_ids)
        if num % 10000 == 0:
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                file=sys.stderr)
        if len(word_ids) == 0:
          print('empty word ids!', file=sys.stderr)
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                file=sys.stderr)
          continue
        #if is_luanma(words, word_ids):
        #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        #  continue

        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(
              feature={
                  'image_name': melt.bytes_feature(img),
                  'image_data': melt.bytes_feature(encoded_image),
                  'text_str': melt.bytes_feature(text),
                  'text': melt.int64_feature(word_ids),
              }))
        else:
          example = tf.train.SequenceExample(
              context=melt.features({
                  'image_name': melt.bytes_feature(img),
                  'image_data': melt.bytes_feature(encoded_image),
                  'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
                  {'text': melt.int64_feature_list(word_ids)}))
        writer.write(example)

        #global counter, max_num_words, sum_words
        with record_counter.get_lock():
          record_counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          assert FLAGS.threads == 1
          gtexts.append(word_ids)
          gtext_strs.append(text)
          #Deprecated: image_labels is no longer used
          if img not in image_labels:
            image_labels[img] = set()
          image_labels[img].add(text)

        if is_top_text:
          is_top_text = False
          with image_counter.get_lock():
            image_counter.value += 1
          if FLAGS.np_save:
            if img not in image_labels:
              image_labels[img] = set()
            image_names.append(img)
            image_features.append(encoded_image)  # this variant stores encoded image bytes
          if FLAGS.num_max_records > 0:
            #if fixed valid, only keep one click per image
            break

      num += 1
      if num == FLAGS.num_max_records:
        break
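
# This variant stores raw encoded image bytes ('image_data') rather than a
# precomputed feature vector, so the consumer decodes the image itself. A
# hedged sketch with standard TF ops; the target size is an assumption.
import tensorflow as tf

def parse_image_record(serialized):
  schema = {
      'image_name': tf.io.FixedLenFeature([], tf.string),
      'image_data': tf.io.FixedLenFeature([], tf.string),
      'text': tf.io.VarLenFeature(tf.int64),
      'text_str': tf.io.FixedLenFeature([], tf.string),
  }
  parsed = tf.io.parse_single_example(serialized, schema)
  image = tf.io.decode_image(parsed['image_data'], channels=3,
                             expand_animations=False)
  image = tf.image.resize(image, [299, 299])  # e.g. Inception-style input size
  return image, tf.sparse.to_dense(parsed['text'])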
def build_features(index):
  total = len(df)
  start, end = gezi.get_fold(total, FLAGS.num_records, index)
  df_ = df.iloc[start:end]

  num_records = 0
  buffer_size = None if (FLAGS.mark != 'train' or
                         not FLAGS.shuffle_impressions) else FLAGS.shuffle_buffer_size
  ofile = f'{FLAGS.out_dir}/{FLAGS.mark}/record_{index}.TMP'
  folder_name = FLAGS.mark
  if FLAGS.neg_parts > 1:
    folder_name = f'{FLAGS.mark}-{FLAGS.neg_part}'
    os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}')
    ofile = f'{FLAGS.out_dir}/{FLAGS.mark}-{FLAGS.neg_part}/record_{index}.TMP'
  writer = melt.tfrecords.Writer(ofile, buffer_size=buffer_size)

  if FLAGS.mark == 'train' and FLAGS.train_by_day:
    # train days are 2019/11/9 -> 11/14; one writer per day
    num_days = 7
    num_records_list = [0] * num_days
    ofiles = []
    writers = []
    for i in range(num_days):
      os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}-days/{i}')
      ofiles += [f'{FLAGS.out_dir}/{folder_name}-days/{i}/record_{index}.TMP']
      writers += [melt.tfrecords.Writer(ofiles[-1], buffer_size=buffer_size)]

  for _, row in tqdm(df_.iterrows(), total=len(df_), ascii=True):
    time_ = row['time']
    day = int(time_.split()[0].split('/')[1]) - 9
    if FLAGS.day is not None and day != FLAGS.day:
      continue

    x = to_datetime(time_)
    weekday = x.weekday()
    hour = x.hour
    # timestamp = to_timestamp(x)
    timestamp = row['timestamp']

    impressions = row['impressions'].split()
    impression_id = row['impression_id']
    uid = uid_vocab.id(row['uid'])

    try:
      history = [did_vocab.id(x) for x in reversed(row['history'].split())]
    except Exception:
      # print(row['history'], row['impression_id'])
      history = []

    feature = {}
    feature['uid_'] = row['uid']
    feature['uid'] = uid
    feature['day'] = day
    feature['weekday'] = weekday
    feature['hour'] = hour
    feature['impression_id'] = impression_id
    feature['uid_in_train'] = int(uid_vocab2.has(row['uid']))
    feature['impression_len'] = len(impressions)
    feature['hist_len'] = len(history)
    feature['history'] = history
    if FLAGS.record_padded:
      feature['history'] = gezi.pad(feature['history'], FLAGS.max_history)
    else:
      feature['history'] = feature['history'][:FLAGS.max_history]
    if FLAGS.use_impressions:
      impression_dids = [did_vocab.id(x.split('-')[0]) for x in impressions]
      feature['impressions'] = impression_dids

    # cat / sub_cat and entities / entity types of the history docs
    feature['history_title_entities'] = []
    feature['history_title_entity_types'] = []
    feature['history_abstract_entities'] = []
    feature['history_abstract_entity_types'] = []
    for did in history:
      if did == 0:
        break
      did = did_vocab.key(did)
      news = news_info[did]
      try:
        title_entities = json.loads(news['title_entities'])
        for i, m in enumerate(title_entities):
          if i == 2:
            break
          entity = m['WikidataId']
          feature['history_title_entities'] += [entity_vocab.id(entity)]
          feature['history_title_entity_types'] += [entity_type_vocab.id(m['Type'])]
      except Exception:
        pass
      try:
        abstract_entities = json.loads(news['abstract_entities'])
        for i, m in enumerate(abstract_entities):
          if i == 2:
            break
          entity = m['WikidataId']
          feature['history_abstract_entities'] += [entity_vocab.id(entity)]
          feature['history_abstract_entity_types'] += [entity_type_vocab.id(m['Type'])]
      except Exception:
        pass

    if FLAGS.record_padded:
      feature['history_title_entities'] = pad(
          feature['history_title_entities'],
          FLAGS.max_history * FLAGS.max_his_title_entities)
      feature['history_title_entity_types'] = gezi.pad(
          feature['history_title_entity_types'],
          FLAGS.max_history * FLAGS.max_his_title_entities)
      feature['history_abstract_entities'] = pad(
          feature['history_abstract_entities'],
          FLAGS.max_history * FLAGS.max_his_abstract_entities)
      feature['history_abstract_entity_types'] = pad(
          feature['history_abstract_entity_types'],
          FLAGS.max_history * FLAGS.max_his_abstract_entities)
    else:
      feature['history_title_entities'] = feature['history_title_entities'][
          :FLAGS.max_history * FLAGS.max_his_title_entities]
      feature['history_title_entity_types'] = feature['history_title_entity_types'][
          :FLAGS.max_history * FLAGS.max_his_title_entities]
      feature['history_abstract_entities'] = feature['history_abstract_entities'][
          :FLAGS.max_history * FLAGS.max_his_abstract_entities]
      feature['history_abstract_entity_types'] = feature['history_abstract_entity_types'][
          :FLAGS.max_history * FLAGS.max_his_abstract_entities]

    if FLAGS.neg_parts > 1:
      indexes = list(range(len(impressions)))
      np.random.shuffle(indexes)

    # X is a module-level filler id defined elsewhere in this script
    prev_cat, prev_sub_cat = X, X
    recall_cats, recall_sub_cats = defaultdict(int), defaultdict(int)
    for i, impression in enumerate(impressions):
      did_ = impression.split('-')[0]
      news = news_info[did_]
      cat, sub_cat = news['cat'], news['sub_cat']
      recall_cats[cat] += 1
      recall_sub_cats[sub_cat] += 1

    for i, impression in enumerate(impressions):
      if '-' in impression:
        did_, click = impression.split('-')
      else:
        did_, click = impression, '0'
      click = int(click)
      if FLAGS.neg_parts > 1:
        if not click and indexes[i] % FLAGS.neg_parts != FLAGS.neg_part:
          continue

      start_timestamp = start_timestamps[did_]
      fresh = timestamp - start_timestamp
      did = did_vocab.id(did_)

      feature['fresh'] = fresh
      feature['did_in_train'] = int(did_vocab2.has(did_))
      feature['click'] = click
      feature['did_'] = did_
      feature['did'] = did
      feature['id'] = impression_id * 100 + i
      feature['position'] = i

      news = news_info[did_]
      feature['cat'] = cat_vocab.id(news['cat'])
      feature['sub_cat'] = scat_vocab.id(news['sub_cat'])
      feature['title_len'] = len(news['title'].split())
      try:
        feature['abstract_len'] = len(news['abstract'].split())
      except Exception:
        # abstract may be NaN
        feature['abstract_len'] = 0

      feature['title_entities'] = []
      feature['title_entity_types'] = []
      feature['abstract_entities'] = []
      feature['abstract_entity_types'] = []
      try:
        title_entities = json.loads(news['title_entities'])
        for m in title_entities:
          entity = m['WikidataId']
          feature['title_entities'].append(entity_vocab.id(entity))
          feature['title_entity_types'].append(entity_type_vocab.id(m['Type']))
      except Exception:
        pass
      try:
        abstract_entities = json.loads(news['abstract_entities'])
        for m in abstract_entities:
          entity = m['WikidataId']
          feature['abstract_entities'].append(entity_vocab.id(entity))
          feature['abstract_entity_types'].append(entity_type_vocab.id(m['Type']))
      except Exception:
        pass

      if FLAGS.record_padded:
        for key in ['title_entities', 'title_entity_types']:
          feature[key] = pad(feature[key], FLAGS.max_title_entities)
        for key in ['abstract_entities', 'abstract_entity_types']:
          feature[key] = pad(feature[key], FLAGS.max_abstract_entities)

      # feature['impression_prev_cat'] = prev_cat
      # feature['impression_prev_sub_cat'] = prev_sub_cat
      # prev_cat = cat_vocab.id(news['cat'])
      # prev_sub_cat = scat_vocab.id(news['sub_cat'])
      # feature['impression_cat_ratio'] = recall_cats[news['cat']] / len(impressions)
      # feature['impression_sub_cat_ratio'] = recall_sub_cats[news['sub_cat']] / len(impressions)

      if FLAGS.use_impressions:
        # slice the window from the full list so later impressions are not
        # cut from an already-sliced window
        feature['impressions'] = impression_dids[
            max(0, i - 5):min(len(impressions), i + 4)]
        if FLAGS.record_padded:
          feature['impressions'] = gezi.pad(feature['impressions'],
                                            FLAGS.max_impressions)

      feature_ = {}
      for key in feature:
        feature_[key] = feature[key]
        if isinstance(feature[key], (list, tuple)) and not feature[key]:
          feature_[key] = [X]  # filler id for empty lists
      for key in feature_:
        try:
          feature_[key] = melt.gen_feature(feature_[key])
        except Exception:
          print(key, feature[key])
          print(traceback.format_exc())
          exit(0)

      record = tf.train.Example(features=tf.train.Features(feature=feature_))

      if FLAGS.mark == 'train' and FLAGS.train_by_day:
        writer = writers[day]
      writer.write(record)
      if FLAGS.mark == 'train' and FLAGS.train_by_day:
        num_records_list[day] += 1
      else:
        num_records += 1

  if FLAGS.mark == 'train' and FLAGS.train_by_day:
    for i in range(num_days):
      writers[i].close()
      if num_records_list[i] == 0:
        os.system('rm -rf %s' % ofiles[i])
      else:
        ofile2 = ofiles[i].replace('.TMP', f'.{num_records_list[i]}')
        os.system('mv %s %s' % (ofiles[i], ofile2))
  else:
    writer.close()
    if num_records == 0:
      os.system('rm -rf %s' % ofile)
    else:
      ofile2 = ofile.replace('.TMP', f'.{num_records}')
      os.system('mv %s %s' % (ofile, ofile2))
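
# Finished shards are renamed from record_{index}.TMP to
# record_{index}.{num_records}, so record counts can be read from filenames
# alone. A hedged helper (not part of the repo) that totals the records in one
# output folder under that convention:
import glob

def total_records(folder):
  return sum(int(f.rsplit('.', 1)[-1])
             for f in glob.glob(f'{folder}/record_*')
             if not f.endswith('.TMP'))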
def deal_file(file, thread_index):
  out_file = ('{}/{}_{}'.format(FLAGS.output_directory, FLAGS.name, thread_index)
              if FLAGS.threads > 1 else
              '{}/{}'.format(FLAGS.output_directory, FLAGS.name))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip().split('\t')
      img = l[0]

      img_end = IMAGE_FEATURE_LEN + 1
      img_feature = [float(x) for x in l[1:img_end]]
      texts = [x.split('\x01')[0] for x in l[img_end:]]
      for text in texts:
        if text.strip() == '':
          continue
        #@TODO text -> ids should move out so online code can share it for evaluation or feed dict
        #words = segmentor.Segment(text, FLAGS.seg_method)
        #word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word) or ENCODE_UNK]
        word_ids = text2ids.text2ids(text,
                                     seg_method=FLAGS.seg_method,
                                     feed_single=FLAGS.feed_single,
                                     allow_all_zero=True,
                                     pad=False)
        word_ids_length = len(word_ids)
        if num % 1000 == 0:
          print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        if len(word_ids) == 0:
          continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if FLAGS.np_save:
          gtexts[thread_index].append(word_ids)
          gtext_strs[thread_index].append(text)

        #@TODO also add position info, weight info, or click count info
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'image_name': melt.bytes_feature(img),
                'image_feature': melt.float_feature(img_feature),
                'text': melt.int_feature(word_ids),
                'text_str': melt.bytes_feature(text),
            }))
        writer.write(example)

        global counter, max_num_words, sum_words
        with counter.get_lock():
          counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length
      num += 1

  texts_dict[thread_index] = gtexts[thread_index]
  text_strs_dict[thread_index] = gtext_strs[thread_index]
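
# Hedged sketch of the driver this function assumes: the shared counters are
# multiprocessing.Value objects, texts_dict / text_strs_dict are Manager dicts,
# and each worker handles one input shard. input_files and the per-worker
# gtexts / gtext_strs lists are assumptions for illustration.
from multiprocessing import Manager, Process, Value

counter = Value('i', 0)
max_num_words = Value('i', 0)
sum_words = Value('i', 0)
manager = Manager()
texts_dict = manager.dict()
text_strs_dict = manager.dict()

processes = []
for i, f in enumerate(input_files):  # input_files: hypothetical list of shards
  p = Process(target=deal_file, args=(f, i))
  p.start()
  processes.append(p)
for p in processes:
  p.join()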