def tokenize(index):
  comments = df['comment_text']
  start, end = gezi.get_fold(len(comments), FLAGS.threads, index)
  # for i in tqdm(range(start, end)):
  with open('%s/%s_%d.txt' % (FLAGS.out_dir, name, index), 'w') as out:
    for i in range(start, end):
      if i % 1000 == 0:
        print(i, file=sys.stderr)
      sent = gezi.segment.tokenize_filter_empty(comments[i].replace('\n', ' '))
      print(' '.join(sent), file=out)
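# All of the workers in this file shard their input with gezi.get_fold(total, n, index).
# Its implementation is not shown here; a minimal sketch of the contract the callers
# rely on (contiguous, non-overlapping [start, end) ranges that cover the input) is:
def get_fold_sketch(total, num_folds, index):
  per_fold = -(-total // num_folds)  # ceiling division
  start = per_fold * index
  end = min(start + per_fold, total)
  return start, end

assert get_fold_sketch(10, 3, 0) == (0, 4)
assert get_fold_sketch(10, 3, 2) == (8, 10)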
def tokenize(index):
  global context_tokens_list
  comments = df['comment_text']
  start, end = gezi.get_fold(len(comments), FLAGS.threads, index)
  for i in tqdm(range(start, end), ascii=True):
    comment = comments[i]
    if FLAGS.special_tokenizer:
      context_tokens_list[i] = tokenizer.tokenize(comment).tokens
    else:
      if FLAGS.is_twitter:
        comment = glove_twitter_preprocess(comment)
      context_tokens_list[i] = [
          x.lower() for x in gezi.segment.tokenize_filter_empty(comment)
      ]
def get_fold(ids, index):
  # Fold over the unique ids (order preserved), then map the fold boundaries back
  # to positions in the original id list so the cut points fall on id boundaries.
  ids_ = []
  ids = list(ids)
  ids_set = set()
  for id in ids:
    if id not in ids_set:
      ids_.append(id)
      ids_set.add(id)
  start_, end_ = gezi.get_fold(len(ids_), FLAGS.num_records, index)
  ids.append('END')
  ids_.append('END')
  start = None
  end = None
  for i in range(len(ids)):
    if ids[i] == ids_[start_]:
      start = i
    elif ids[i] == ids_[end_]:
      end = i
  return start, end
def run(index):
  df = pd.read_csv(input)
  ids = df['id'].values
  comments = df['comment_text'].values
  start, end = gezi.get_fold(len(comments), num_threads, index)
  output = input.replace('.csv', '.sents.%d.txt' % index)
  print(output)
  num = 0
  with open(output, 'w') as out:
    for id, comment in zip(ids[start:end], comments[start:end]):
      if num % 1000 == 0:
        print(num)
      num += 1
      doc = gezi.segment.doc(comment)
      for sent in doc.sents:
        print(id, sent.text.replace('\n', 'NEWLINE'), sep='\t', file=out)
def build_features(index):
  total = len(df)
  start, end = gezi.get_fold(total, FLAGS.num_records, index)
  df_ = df.iloc[start:end]
  num_records = 0
  buffer_size = None if (FLAGS.mark != 'train' or
                         not FLAGS.shuffle_impressions) else FLAGS.shuffle_buffer_size
  ofile = f'{FLAGS.out_dir}/{FLAGS.mark}/record_{index}.TMP'
  folder_name = FLAGS.mark
  if FLAGS.neg_parts > 1:
    folder_name = f'{FLAGS.mark}-{FLAGS.neg_part}'
    os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}')
    ofile = f'{FLAGS.out_dir}/{FLAGS.mark}-{FLAGS.neg_part}/record_{index}.TMP'
  writer = melt.tfrecords.Writer(ofile, buffer_size=buffer_size)
  if FLAGS.mark == 'train' and FLAGS.train_by_day:
    # 2019/11/9 -> 11/14
    num_days = 7
    num_records_list = [0] * num_days
    ofiles = []
    writers = []
    for i in range(num_days):
      os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}-days/{i}')
      ofiles += [f'{FLAGS.out_dir}/{folder_name}-days/{i}/record_{index}.TMP']
      writers += [melt.tfrecords.Writer(ofiles[-1], buffer_size=buffer_size)]
  for _, row in tqdm(df_.iterrows(), total=len(df_), ascii=True):
    time_ = row['time']
    day = int(time_.split()[0].split('/')[1]) - 9
    if FLAGS.day is not None and day != FLAGS.day:
      continue
    x = to_datetime(time_)
    weekday = x.weekday()
    hour = x.hour
    timestamp = to_timestamp(x)
    impressions = row['impressions'].split()
    impression_id = row['impression_id']
    uid = uid_vocab.id(row['uid'])
    try:
      history = [did_vocab.id(x) for x in reversed(row['history'].split())]
    except Exception:
      # print(row['history'], row['impression_id'])
      history = []
    feature = {}
    feature['uid_'] = row['uid']
    feature['uid'] = uid
    feature['day'] = day
    feature['weekday'] = weekday
    feature['hour'] = hour
    feature['history'] = history
    feature['impression_id'] = impression_id
    feature['uid_in_train'] = int(uid_vocab2.has(row['uid']))
    feature['impression_len'] = len(impressions)
    feature['impressions'] = [did_vocab.id(x.split('-')[0]) for x in impressions]
    if FLAGS.record_padded:
      feature['history'] = gezi.pad(feature['history'], FLAGS.max_history)
      feature['impressions'] = gezi.pad(feature['impressions'], FLAGS.max_impressions)
    if FLAGS.neg_parts > 1:
      indexes = list(range(len(impressions)))
      np.random.shuffle(indexes)
    for i, impression in enumerate(impressions):
      if '-' in impression:
        did_, click = impression.split('-')
      else:
        did_, click = impression, '0'
      click = int(click)
      if FLAGS.neg_parts > 1:
        if not click and indexes[i] % FLAGS.neg_parts != FLAGS.neg_part:
          continue
      start_timestamp = start_timestamps[did_]
      fresh = timestamp - start_timestamp
      did = did_vocab.id(did_)
      feature['fresh'] = fresh
      feature['did_in_train'] = int(did_vocab2.has(did_))
      feature['click'] = click
      feature['did_'] = did_
      feature['did'] = did
      feature['id'] = impression_id * 100 + i
      feature['position'] = i
      feature_ = {}
      for key in feature:
        feature_[key] = feature[key]
        # tf.train.Feature does not accept empty lists, so pad with a placeholder value
        if isinstance(feature[key], (list, tuple)) and not feature[key]:
          feature_[key] = [X]
      for key in feature_:
        try:
          feature_[key] = melt.gen_feature(feature_[key])
        except Exception:
          print(key, feature[key])
          print(traceback.format_exc())
          exit(0)
      record = tf.train.Example(features=tf.train.Features(feature=feature_))
      if FLAGS.mark == 'train' and FLAGS.train_by_day:
        writer = writers[day]
      writer.write(record)
      if FLAGS.mark == 'train' and FLAGS.train_by_day:
        num_records_list[day] += 1
      else:
        num_records += 1
  if FLAGS.mark == 'train' and FLAGS.train_by_day:
    for i in range(num_days):
      writers[i].close()
      if num_records_list[i] == 0:
        os.system('rm -rf %s' % ofiles[i])
      else:
        ofile2 = ofiles[i].replace('.TMP', f'.{num_records_list[i]}')
        os.system('mv %s %s' % (ofiles[i], ofile2))
  else:
    writer.close()
    if num_records == 0:
      os.system('rm -rf %s' % ofile)
    else:
      ofile2 = ofile.replace('.TMP', f'.{num_records}')
      os.system('mv %s %s' % (ofile, ofile2))
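# Hypothetical driver, not part of the original file: build_features(index) is written
# to be run once per shard, and the shared multiprocessing counters used elsewhere in
# this file suggest the shards are produced by worker processes, e.g.:
from multiprocessing import Pool

def write_all_records():
  with Pool(FLAGS.num_records) as pool:
    pool.map(build_features, range(FLAGS.num_records))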
def build_features(index):
  mode = 'train' if 'train' in FLAGS.input else 'test'
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file
  total = len(examples)
  start, end = gezi.get_fold(total, FLAGS.num_records, index)
  ids = examples['id'].values[start:end]
  comments = examples['comment_text'].values[start:end]
  try:
    labels = examples[CLASSES].values[start:end]
  except Exception:
    labels = [[0.] * len(CLASSES)] * len(ids)
  with melt.tfrecords.Writer(out_file) as writer:
    for id, comment, label in tqdm(zip(ids, comments, labels)):
      comment_str = comment
      # TODO use info
      doc = tokenizer.tokenize(comment)
      comment_tokens, tokens_info = doc.tokens, doc.attributes
      for i in range(len(tokens_info)):
        tokens_info[i] = list(map(float, tokens_info[i]))
      if FLAGS.comment_limit:
        comment_tokens = comment_tokens[:FLAGS.comment_limit]
        tokens_info = tokens_info[:FLAGS.comment_limit]
      tokens_info = np.array(tokens_info)
      tokens_info = tokens_info.reshape(-1)
      tokens_info = list(tokens_info)
      assert len(tokens_info) == len(comment_tokens) * len(attribute_names)
      comment_ids = [get_id(token, vocab) for token in comment_tokens]
      comment_tokens_str = '|'.join([vocab.key(id) for id in comment_ids])
      label = list(map(float, label))
      comment_chars = [list(token) for token in comment_tokens]
      char_ids = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      for i, token in enumerate(comment_chars):
        for j, ch in enumerate(token):
          if j == FLAGS.char_limit:
            break
          char_ids[i, j] = get_char_id(ch, char_vocab)
      char_ids = list(char_ids.reshape(-1))
      # print(char_ids)
      simple_char_ids = []
      num_chs = 0
      for ch in list(comment):
        id_ = get_char_id(ch, char_vocab)
        # if id_ == char_vocab.unk_id():
        #   continue
        simple_char_ids.append(id_)
        if len(simple_char_ids) == FLAGS.simple_char_limit:
          break
      simple_chars_str = ''.join([char_vocab.key(id) for id in simple_char_ids])
      # print(simple_char_ids, simple_chars_str)
      record = tf.train.Example(features=tf.train.Features(
          feature={
              "comment": melt.int64_feature(comment_ids),
              "tokens_info": melt.float_feature(tokens_info),
              "comment_chars": melt.int64_feature(char_ids),
              "simple_chars": melt.int64_feature(simple_char_ids),
              "simple_chars_str": melt.bytes_feature(simple_chars_str),
              "classes": melt.float_feature(label),
              "id": melt.bytes_feature(id),
              "comment_str": melt.bytes_feature(comment_str),
              "comment_tokens_str": melt.bytes_feature(comment_tokens_str)
          }))
      writer.write(record)
      global counter
      with counter.get_lock():
        counter.value += 1
    print("Build {} instances of features in total".format(writer.size()))
    writer.close()
def tokenize(index):
  global tokens_list
  comments = df['comment_text']
  start, end = gezi.get_fold(len(comments), FLAGS.threads, index)
  if 'tokens' in df.columns:
    for i in range(start, end):
      # if df['id'][i] == '5bbabc3b14cc1f7f':
      #   sent = tokenizer.full_tokenize(comments[i])
      #   tokens_list[i] = sent.tokens
      #   attributes_list[i] = np.reshape(np.array([list(map(float, x)) for x in sent.attributes]), -1)
      #   poses_list[i] = sent.poses
      #   tags_list[i] = sent.tags
      #   ners_list[i] = sent.ners
      # else:
      tokens_list[i] = df['tokens'][i].split(' ')
      attributes_list[i] = df['attributes'][i].split(' ')
      # if len(attributes_list[i]) != len(attribute_names) * len(tokens_list[i]) or FLAGS.modify_attribute:
      #   sent = tokenizer.tokenize(comments[i])
      #   attributes_list[i] = np.reshape(np.array([list(map(float, x)) for x in sent.attributes]), -1)
      # assert len(attributes_list[i]) == len(attribute_names) * len(tokens_list[i]), '{} {} {} {}'.format(
      #     len(attributes_list[i]) / len(attribute_names), len(tokens_list[i]), i, df['id'][i])
      poses_list[i] = df['poses'][i].split(' ')
      tags_list[i] = df['tags'][i].split(' ')
      ners_list[i] = df['ners'][i].split(' ')
      ori_tokens_list[i] = df['ori_tokens'][i].split(' ')
  else:
    for i in tqdm(range(start, end)):
      # for i in range(start, end):
      #   if i % 1000 == 0:
      #     print(i, file=sys.stderr)
      if FLAGS.full_tokenizer:
        # if FLAGS.simple_tokenizer:
        sent = tokenizer.full_tokenize(comments[i], lemmatization=FLAGS.lemmatization)
        # else:
        #   sent = gezi.segment.tokenize_filter_empty(comments[i])
        # if FLAGS.lower:
        #   sent.tokens = [w.lower() for w in sent.tokens]
        tokens_list[i] = sent.tokens
        ori_tokens_list[i] = sent.ori_tokens
        attributes_list[i] = np.reshape(
            np.array([list(map(float, x)) for x in sent.attributes]), -1)
        poses_list[i] = sent.poses
        tags_list[i] = sent.tags
        ners_list[i] = sent.ners
      else:
        sent = tokenizer.tokenize(comments[i], lemmatization=FLAGS.lemmatization)
        # if FLAGS.lower:
        #   sent.tokens = [w.lower() for w in sent.tokens]
        tokens_list[i] = sent.tokens
        ori_tokens_list[i] = sent.ori_tokens
        # print('----------', sent.attributes)
        try:
          attributes_list[i] = np.reshape(
              np.array([list(map(float, x)) for x in sent.attributes]), -1)
        except Exception:
          print(sent.attributes)
          raise ValueError()
        poses_list[i] = ['NONE'] * len(tokens_list[i])
        tags_list[i] = ['NONE'] * len(tokens_list[i])
        ners_list[i] = ['NONE'] * len(tokens_list[i])
def build_features(index):
  mode = get_mode(FLAGS.input)
  start_index = FLAGS.start_index
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.record'.format(mode, index + start_index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file
  total = len(df)
  num_records = FLAGS.num_records_
  # TODO FIXME why is this still None here? FLAGS.num_records has already been set to 7 in main ...
  # print('---------', num_records, FLAGS.num_records_)
  if not num_records:
    if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'] or 'valid' in FLAGS.input:
      num_records = 1
    else:
      num_records = 1
  # print('------------------', num_records, FLAGS.num_records_)
  start, end = gezi.get_fold(total, num_records, index)
  print('total', total, 'infile', FLAGS.input, 'out_file', out_file,
        'num_records', num_records, 'start', start, 'end', end)
  max_len = 0
  max_num_ids = 0
  num = 0
  with melt.tfrecords.Writer(out_file) as writer:
    for i in tqdm(range(start, end), ascii=True):
      try:
        # row = df.iloc[i]
        row = df[i]
        id = str(row[0])
        words = row[-1].split('\t')
        content = row[2]
        content_ori = content
        content = filter.filter(content)
        label = int(row[1])
        content_ids = [vocab.id(x) for x in words]
        if len(content_ids) > max_len:
          max_len = len(content_ids)
          print('max_len', max_len)
        if len(content_ids) > FLAGS.word_limit and len(content_ids) < 5:
          print('{} {} {}'.format(id, len(content_ids), content_ori))
        content_ids = content_ids[:FLAGS.word_limit]
        words = words[:FLAGS.word_limit]
        # NOTICE unlike tf, pytorch does not allow an all-zero sequence for rnn when using padding mode
        if FLAGS.use_char:
          chars = [list(word) for word in words]
          char_ids = np.zeros([len(content_ids), FLAGS.char_limit], dtype=np.int32)
          vocab_ = char_vocab if char_vocab else vocab
          for i, token in enumerate(chars):
            for j, ch in enumerate(token):
              if j == FLAGS.char_limit:
                break
              char_ids[i, j] = vocab_.id(ch)
          char_ids = list(char_ids.reshape(-1))
          if np.sum(char_ids) == 0:
            print('------------------------bad id', id)
            print(content_ids)
            print(words)
            exit(0)
        else:
          char_ids = [0]
        feature = {
            'id': melt.bytes_feature(id),
            'content': melt.int64_feature(content_ids),
            'content_str': melt.bytes_feature(content_ori),
            'char': melt.int64_feature(char_ids),
            'source': melt.bytes_feature(mode),
        }
        feature['label'] = melt.int64_feature(label)
        # TODO currently we do not know exactly whether 1 image or 3 are shown ...
        record = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(record)
        num += 1
        global counter
        with counter.get_lock():
          counter.value += 1
        global total_words
        with total_words.get_lock():
          total_words.value += len(content_ids)
      except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        pass
def build_features(index):
  mode = get_mode(FLAGS.input)
  start_index = FLAGS.start_index
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.record'.format(mode, index + start_index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file
  total = len(df)
  num_records = FLAGS.num_records_
  if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'] or 'valid' in FLAGS.input:
    num_records = 1
  start, end = gezi.get_fold(total, num_records, index)
  print('total', total, 'infile', FLAGS.input, 'out_file', out_file)
  max_len = 0
  max_num_ids = 0
  num = 0
  with melt.tfrecords.Writer(out_file) as writer:
    for i in tqdm(range(start, end), ascii=True):
      try:
        row = df.iloc[i]
        id = str(row[0])
        if seg_result:
          if id not in seg_result:
            print('id %s not found in seg_result' % id)
            continue
          words = seg_result[id]
          if FLAGS.add_start_end_:
            words = gezi.add_start_end(words, FLAGS.start_mark, FLAGS.end_mark)
        if pos_result:
          pos = pos_result[id]
          if FLAGS.add_start_end_:
            pos = gezi.add_start_end(pos)
        if ner_result:
          ner = ner_result[id]
          if FLAGS.add_start_end_:
            ner = gezi.add_start_end(ner)
        if start_index > 0:
          id = 't' + id
        content = row[1]
        content_ori = content
        content = filter.filter(content)
        # label = list(row[2:])
        label = [-2] * 20
        # label = [x + 2 for x in label]
        # num_labels = len(label)
        if not seg_result:
          content_ids, words = text2ids_(content, preprocess=False, return_words=True)
          assert len(content_ids) == len(words)
        else:
          content_ids = [vocab.id(x) for x in words]
        # print(words, content_ids)
        # exit(0)
        if len(content_ids) > max_len:
          max_len = len(content_ids)
          print('max_len', max_len)
        if len(content_ids) > FLAGS.word_limit and len(content_ids) < 5:
          print('{} {} {}'.format(id, len(content_ids), content_ori))
        # if len(content_ids) > FLAGS.word_limit:
        #   print(id, content)
        #   if mode not in ['test', 'valid']:
        #     continue
        # if len(content_ids) < 5 and mode not in ['test', 'valid']:
        #   continue
        content_ids = content_ids[:FLAGS.word_limit]
        words = words[:FLAGS.word_limit]
        # NOTICE unlike tf, pytorch does not allow an all-zero sequence for rnn when using padding mode
        if FLAGS.use_char:
          chars = [list(word) for word in words]
          char_ids = np.zeros([len(content_ids), FLAGS.char_limit], dtype=np.int32)
          vocab_ = char_vocab if char_vocab else vocab
          for i, token in enumerate(chars):
            for j, ch in enumerate(token):
              if j == FLAGS.char_limit:
                break
              char_ids[i, j] = vocab_.id(ch)
          char_ids = list(char_ids.reshape(-1))
          if np.sum(char_ids) == 0:
            print('------------------------bad id', id)
            print(content_ids)
            print(words)
            exit(0)
        else:
          char_ids = [0]
        if pos_vocab:
          assert pos
          pos = pos[:FLAGS.word_limit]
          pos_ids = [pos_vocab.id(x) for x in pos]
        else:
          pos_ids = [0]
        if ner_vocab:
          assert ner
          if pos_vocab:
            assert len(pos) == len(ner)
          ner = ner[:FLAGS.word_limit]
          ner_ids = [ner_vocab.id(x) for x in ner]
        else:
          ner_ids = [0]
        wlen = [len(word) for word in words]
        feature = {
            'id': melt.bytes_feature(id),
            'label': melt.int64_feature(label),
            'content': melt.int64_feature(content_ids),
            'content_str': melt.bytes_feature(content_ori),
            'char': melt.int64_feature(char_ids),
            'pos': melt.int64_feature(pos_ids),  # might also be position info for mix seg
            'ner': melt.int64_feature(ner_ids),
            'wlen': melt.int64_feature(wlen),
            'source': melt.bytes_feature(mode),
        }
        # TODO currently we do not know exactly whether 1 image or 3 are shown ...
        record = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(record)
        num += 1
        global counter
        with counter.get_lock():
          counter.value += 1
        global total_words
        with total_words.get_lock():
          total_words.value += len(content_ids)
      except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        pass
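# Sketch of the round trip behind the 'char' feature above: each example stores a
# [num_words, FLAGS.char_limit] id matrix flattened to 1-D, so a reader that knows
# char_limit can restore the per-word layout. The values below are made up.
import numpy as np

char_limit = 4                              # stand-in for FLAGS.char_limit
flat = np.array([5, 9, 0, 0, 7, 0, 0, 0])   # parsed 'char' ids for a 2-word example
char_matrix = flat.reshape(-1, char_limit)  # shape (2, 4), one row of char ids per word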
def build_features(index):
  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file
  total = len(examples)
  if not FLAGS.has_dup:
    start, end = gezi.get_fold(total, FLAGS.num_records, index)
  else:
    start, end = get_fold(examples['id'].values, index)
  ids = examples['id'].values[start:end]
  ids = list(map(str, ids))
  comments = examples['comment_text'].values[start:end]
  tokens_list = examples['tokens'].values[start:end]
  tokens_infos = examples['attributes'].values[start:end]
  # TODO change to poses
  poses = examples['poses'].values[start:end]
  tags = examples['tags'].values[start:end]
  ners = examples['ners'].values[start:end]
  ori_tokens_list = examples['ori_tokens'].values[start:end]
  try:
    labels = examples[CLASSES].values[start:end]
  except Exception:
    labels = [[0.] * len(CLASSES)] * len(ids)
  with melt.tfrecords.Writer(out_file) as writer:
    for id, comment, label, comment_tokens, ori_tokens, tokens_info, pos, tag, ner in tqdm(
        zip(ids, comments, labels, tokens_list, ori_tokens_list, tokens_infos, poses, tags, ners)):
      if not isinstance(comment, str):
        comment = 'ok'
      comment_str = comment
      comment_tokens = comment_tokens.split(' ')
      tokens_info = tokens_info.split(' ')
      pos = pos.split(' ')
      tag = tag.split(' ')
      ner = ner.split(' ')
      ori_tokens = ori_tokens.split(' ')
      if FLAGS.comment_limit:
        comment_tokens = comment_tokens[:FLAGS.comment_limit]
        ori_tokens = ori_tokens[:FLAGS.comment_limit]
        tokens_info = tokens_info[:len(attribute_names) * FLAGS.comment_limit]
      pos_ids = [get_char_id(x, pos_vocab) for x in pos]
      tag_ids = [get_char_id(x, tag_vocab) for x in tag]
      ner_ids = [get_char_id(x, ner_vocab) for x in ner]
      # NOTICE comment_ids use vocab (built from all train + test words, so no unk)
      if not FLAGS.lower:
        comment_ids = [get_id(token, vocab) for token in comment_tokens]
        # comment_ids_withunk = [get_id(token, unk_vocab) for token in comment_tokens]
      else:
        comment_ids = [get_id(token.lower(), vocab) for token in comment_tokens]
        # comment_ids_withunk = [get_id(token.lower(), unk_vocab) for token in comment_tokens]
      comment_tokens_str = '|'.join([vocab.key(id) for id in comment_ids])
      label = list(map(float, label))
      tokens_info = list(map(float, tokens_info))
      # print(len(comment_ids), len(tokens_info) / len(attribute_names), len(tokens_info) / len(comment_ids))
      assert len(tokens_info) == len(attribute_names) * len(comment_ids), \
          '%d %f' % (len(comment_ids), len(tokens_info) / len(attribute_names))
      # comment_chars = [list(token) for token in comment_tokens]
      # CHANGE to use ori tokens so fu**ck still encodes **, but when NiggerMan is split
      # into Nigger Man both tokens encode the chars of NiggerMan twice
      chars_list = [list(token) for token in ori_tokens]
      char_ids = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      assert len(comment_ids) == len(chars_list), '{} {} {} {} {}'.format(
          len(comment_ids), len(chars_list), comment, comment_tokens, ori_tokens)
      for i, chars in enumerate(chars_list):
        for j, ch in enumerate(chars):
          if j == FLAGS.char_limit:
            break
          char_ids[i, j] = get_char_id(ch, char_vocab)
      char_ids = list(char_ids.reshape(-1))
      # print(char_ids)

      # -------------- simple char
      simple_char_ids = []
      for ch in list(comment):
        id_ = get_char_id(ch, char_vocab)
        # if id_ == char_vocab.unk_id():
        #   continue
        simple_char_ids.append(id_)
        if len(simple_char_ids) == FLAGS.simple_char_limit:
          break
      simple_chars_str = ''.join([char_vocab.key(id) for id in simple_char_ids])
      # print(simple_char_ids, simple_chars_str)

      # # -------------- simple ngram
      # simple_ngrams = gezi.get_ngrams(comment)
      # simple_ngrams = simple_ngrams[:FLAGS.simple_char_limit * 5]
      # simple_ngram_ids = [get_ngram_id(ngram, ngram_vocab) for ngram in simple_ngrams]

      # -------------- ngram
      ngram_ids_list = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      if not FLAGS.ftngram:
        # ngrams_list = [gezi.get_ngrams(token) for token in ori_tokens]
        if not FLAGS.ngram_lower:
          ngrams_list = [gezi.get_ngrams(token, FLAGS.ngram_min, FLAGS.ngram_max)
                         for token in comment_tokens]
        else:
          ngrams_list = [gezi.get_ngrams(token.lower(), FLAGS.ngram_min, FLAGS.ngram_max)
                         for token in comment_tokens]
        for i, ngrams in enumerate(ngrams_list):
          for j, ngram in enumerate(ngrams):
            if j == FLAGS.char_limit:
              break
            # assert get_ngram_id(ngram, ngram_vocab) < 20003
            ngram_ids_list[i, j] = get_ngram_id(ngram, ngram_vocab)
      else:
        # for i, (token, ori_token) in enumerate(zip(comment_tokens, ori_tokens)):
        for i, (token, ori_token) in enumerate(zip(comment_tokens, comment_tokens)):
          ngram_ids = gezi.fasttext_ids(ori_token, vocab, FLAGS.ngram_buckets,
                                        FLAGS.ngram_min, FLAGS.ngram_max)
          if len(ngram_ids) >= FLAGS.char_limit:
            ngram_ids = gezi.fasttext_ids(token, vocab, FLAGS.ngram_buckets,
                                          FLAGS.ngram_min, FLAGS.ngram_max)
          ngram_ids = ngram_ids[:FLAGS.char_limit]
          for j, ngram_id in enumerate(ngram_ids):
            ngram_ids_list[i, j] = ngram_id
      ngram_ids = list(ngram_ids_list.reshape(-1))

      # # --------------- fngrams (full ngrams)
      # fngrams_list = [gezi.get_ngrams_hash(token, FLAGS.ngram_buckets, 3, 6, reserve=3)
      #                 for token in ori_tokens]
      # fngram_ids = np.zeros([len(comment_ids), FLAGS.ngram_limit], dtype=np.int32)
      # for i, fngrams in enumerate(fngrams_list):
      #   for j, fngram in enumerate(fngrams):
      #     if j == FLAGS.ngram_limit:
      #       break
      #     fngram_ids[i, j] = fngram
      # fngram_ids = list(fngram_ids.reshape(-1))

      # global info per comment: 7 features
      comment_info = []
      comment_info.append(len(ori_tokens))
      comment_info.append(len(comment_tokens))
      # comment_len = sum(len(x) for x in ori_tokens)
      comment_len = len(comment_str)
      comment_info.append(comment_len)
      comment_info.append(comment_len / (len(ori_tokens) + 1))
      num_unks = len([x for x in comment_ids if x == vocab.unk_id()])
      comment_info.append(num_unks)
      comment_info.append(num_unks / len(comment_tokens))
      comment_info.append(enprob_dict[id])

      record = tf.train.Example(features=tf.train.Features(feature={
          "comment": melt.int64_feature(comment_ids),
          # "comment_withunk": melt.int64_feature(comment_ids_withunk),
          "tokens_info": melt.float_feature(tokens_info),
          "comment_info": melt.float_feature(comment_info),
          "pos": melt.int64_feature(pos_ids),
          "tag": melt.int64_feature(tag_ids),
          "ner": melt.int64_feature(ner_ids),
          "comment_chars": melt.int64_feature(char_ids),
          "comment_ngrams": melt.int64_feature(ngram_ids),
          "simple_chars": melt.int64_feature(simple_char_ids),
          # "simple_ngrams": melt.int64_feature(simple_ngram_ids),
          # "comment_fngrams": melt.int64_feature(fngram_ids),
          # "simple_chars_str": melt.bytes_feature(simple_chars_str),
          "classes": melt.float_feature(label),
          "id": melt.bytes_feature(id),
          "weight": melt.float_feature([FLAGS.weight]),
          "comment_str": melt.bytes_feature(comment_str),
          "comment_tokens_str": melt.bytes_feature(comment_tokens_str)
      }))
      writer.write(record)
      global counter
      with counter.get_lock():
        counter.value += 1
    print("Build {} instances of features in total".format(writer.size()))
    writer.close()
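# Sketch only: reading back one of the records written above with plain TensorFlow.
# The melt reader is not shown in this file, the feature spec below covers only a few
# of the keys, and 'train/0.record' is a placeholder path.
import tensorflow as tf

def parse_example(serialized):
  spec = {
      'comment': tf.io.VarLenFeature(tf.int64),
      'classes': tf.io.VarLenFeature(tf.float32),
      'id': tf.io.FixedLenFeature([], tf.string),
  }
  parsed = tf.io.parse_single_example(serialized, spec)
  return {k: tf.sparse.to_dense(v) if isinstance(v, tf.sparse.SparseTensor) else v
          for k, v in parsed.items()}

dataset = tf.data.TFRecordDataset(['train/0.record']).map(parse_example)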
def build_features(index):
  mode = get_mode(FLAGS.input)
  start_index = 0 if not FLAGS.use_fold else 1
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index + start_index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file
  total = len(df)
  num_records = FLAGS.num_records_
  if mode in ['valid', 'test', 'dev', 'pm']:
    num_records = 1
  start, end = gezi.get_fold(total, num_records, index)
  print('infile', FLAGS.input, 'out_file', out_file)
  max_len = 0
  max_num_ids = 0
  num = 0
  with melt.tfrecords.Writer(out_file) as writer:
    for i in range(start, end):
      try:
        row = df.iloc[i]
        id = row[0]
        content = row[1]
        # print(content, type(content))
        if len(content) > max_len:
          max_len = len(content)
          print('max_len', max_len)
        if len(content) > 3000:
          print(id, content)
          if mode not in ['test', 'valid']:
            continue
        label = list(row[2:])
        # label = [x + 2 for x in label]
        # num_labels = len(label)
        content_ids = text2ids_(content)
        if len(content_ids) < 5 and mode not in ['test', 'valid']:
          continue
        limit = FLAGS.limit
        if len(content_ids) > max_num_ids:
          max_num_ids = len(content_ids)
          print('max_num_ids', max_num_ids)
        content_ids = content_ids[:limit]
        feature = {
            'id': melt.bytes_feature(str(id)),
            'label': melt.int64_feature(label),
            'content': melt.int64_feature(content_ids),
            'content_str': melt.bytes_feature(content),
            'source': melt.bytes_feature(mode),
        }
        # TODO currently we do not know exactly whether 1 image or 3 are shown ...
        record = tf.train.Example(features=tf.train.Features(feature=feature))
        if num % 1000 == 0:
          print(num)
        writer.write(record)
        num += 1
        global counter
        with counter.get_lock():
          counter.value += 1
        global total_words
        with total_words.get_lock():
          total_words.value += len(content_ids)
      except Exception:
        # print(traceback.format_exc(), file=sys.stderr)
        pass
def build_features(index):
  total = len(df)
  start, end = gezi.get_fold(total, FLAGS.num_records, index)
  df_ = df.iloc[start:end]
  num_records = 0
  buffer_size = None if (FLAGS.mark != 'train' or
                         not FLAGS.shuffle_impressions) else FLAGS.shuffle_buffer_size
  ofile = f'{FLAGS.out_dir}/{FLAGS.mark}/record_{index}.TMP'
  folder_name = FLAGS.mark
  if FLAGS.neg_parts > 1:
    folder_name = f'{FLAGS.mark}-{FLAGS.neg_part}'
    os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}')
    ofile = f'{FLAGS.out_dir}/{FLAGS.mark}-{FLAGS.neg_part}/record_{index}.TMP'
  writer = melt.tfrecords.Writer(ofile, buffer_size=buffer_size)
  if FLAGS.mark == 'train' and FLAGS.train_by_day:
    # 2019/11/9 -> 11/14
    num_days = 7
    num_records_list = [0] * num_days
    ofiles = []
    writers = []
    for i in range(num_days):
      os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}-days/{i}')
      ofiles += [f'{FLAGS.out_dir}/{folder_name}-days/{i}/record_{index}.TMP']
      writers += [melt.tfrecords.Writer(ofiles[-1], buffer_size=buffer_size)]
  for _, row in tqdm(df_.iterrows(), total=len(df_), ascii=True):
    time_ = row['time']
    day = int(time_.split()[0].split('/')[1]) - 9
    if FLAGS.day is not None and day != FLAGS.day:
      continue
    x = to_datetime(time_)
    weekday = x.weekday()
    hour = x.hour
    # timestamp = to_timestamp(x)
    timestamp = row['timestamp']
    impressions = row['impressions'].split()
    impression_id = row['impression_id']
    uid = uid_vocab.id(row['uid'])
    try:
      history = [did_vocab.id(x) for x in reversed(row['history'].split())]
    except Exception:
      # print(row['history'], row['impression_id'])
      history = []
    feature = {}
    feature['uid_'] = row['uid']
    feature['uid'] = uid
    feature['day'] = day
    feature['weekday'] = weekday
    feature['hour'] = hour
    feature['impression_id'] = impression_id
    feature['uid_in_train'] = int(uid_vocab2.has(row['uid']))
    feature['impression_len'] = len(impressions)
    feature['hist_len'] = len(history)
    feature['history'] = history
    if FLAGS.record_padded:
      feature['history'] = gezi.pad(feature['history'], FLAGS.max_history)
    else:
      feature['history'] = feature['history'][:FLAGS.max_history]
    if FLAGS.use_impressions:
      feature['impressions'] = [did_vocab.id(x.split('-')[0]) for x in impressions]

    # entities and entity types from the title / abstract of each doc in the history
    feature['history_title_entities'] = []
    feature['history_title_entity_types'] = []
    feature['history_abstract_entities'] = []
    feature['history_abstract_entity_types'] = []
    for did in history:
      if did == 0:
        break
      did = did_vocab.key(did)
      news = news_info[did]
      try:
        title_entities = json.loads(news['title_entities'])
        for i, m in enumerate(title_entities):
          if i == 2:
            break
          entity = m['WikidataId']
          feature['history_title_entities'] += [entity_vocab.id(entity)]
          feature['history_title_entity_types'] += [entity_type_vocab.id(m['Type'])]
      except Exception:
        pass
      try:
        abstract_entities = json.loads(news['abstract_entities'])
        for i, m in enumerate(abstract_entities):
          if i == 2:
            break
          entity = m['WikidataId']
          feature['history_abstract_entities'] += [entity_vocab.id(entity)]
          feature['history_abstract_entity_types'] += [entity_type_vocab.id(m['Type'])]
      except Exception:
        pass
    if FLAGS.record_padded:
      feature['history_title_entities'] = pad(
          feature['history_title_entities'],
          FLAGS.max_history * FLAGS.max_his_title_entities)
      feature['history_title_entity_types'] = gezi.pad(
          feature['history_title_entity_types'],
          FLAGS.max_history * FLAGS.max_his_title_entities)
      feature['history_abstract_entities'] = pad(
          feature['history_abstract_entities'],
          FLAGS.max_history * FLAGS.max_his_abstract_entities)
      feature['history_abstract_entity_types'] = pad(
          feature['history_abstract_entity_types'],
          FLAGS.max_history * FLAGS.max_his_abstract_entities)
    else:
      feature['history_title_entities'] = feature['history_title_entities'][
          :FLAGS.max_history * FLAGS.max_his_title_entities]
      feature['history_title_entity_types'] = feature['history_title_entity_types'][
          :FLAGS.max_history * FLAGS.max_his_title_entities]
      feature['history_abstract_entities'] = feature['history_abstract_entities'][
          :FLAGS.max_history * FLAGS.max_his_abstract_entities]
      feature['history_abstract_entity_types'] = feature['history_abstract_entity_types'][
          :FLAGS.max_history * FLAGS.max_his_abstract_entities]
    if FLAGS.neg_parts > 1:
      indexes = list(range(len(impressions)))
      np.random.shuffle(indexes)
    prev_cat, prev_sub_cat = X, X
    recall_cats, recall_sub_cats = defaultdict(int), defaultdict(int)
    for i, impression in enumerate(impressions):
      did_ = impression.split('-')[0]
      news = news_info[did_]
      cat, sub_cat = news['cat'], news['sub_cat']
      recall_cats[cat] += 1
      recall_sub_cats[sub_cat] += 1
    for i, impression in enumerate(impressions):
      if '-' in impression:
        did_, click = impression.split('-')
      else:
        did_, click = impression, '0'
      click = int(click)
      if FLAGS.neg_parts > 1:
        if not click and indexes[i] % FLAGS.neg_parts != FLAGS.neg_part:
          continue
      start_timestamp = start_timestamps[did_]
      fresh = timestamp - start_timestamp
      did = did_vocab.id(did_)
      feature['fresh'] = fresh
      feature['did_in_train'] = int(did_vocab2.has(did_))
      feature['click'] = click
      feature['did_'] = did_
      feature['did'] = did
      feature['id'] = impression_id * 100 + i
      feature['position'] = i
      news = news_info[did_]
      feature['cat'] = cat_vocab.id(news['cat'])
      feature['sub_cat'] = scat_vocab.id(news['sub_cat'])
      feature['title_len'] = len(news['title'].split())
      try:
        feature['abstract_len'] = len(news['abstract'].split())
      except Exception:
        # NaN abstract
        feature['abstract_len'] = 0
      feature['title_entities'] = []
      feature['title_entity_types'] = []
      feature['abstract_entities'] = []
      feature['abstract_entity_types'] = []
      try:
        title_entities = json.loads(news['title_entities'])
        for m in title_entities:
          entity = m['WikidataId']
          feature['title_entities'].append(entity_vocab.id(entity))
          feature['title_entity_types'].append(entity_type_vocab.id(m['Type']))
      except Exception:
        pass
      try:
        abstract_entities = json.loads(news['abstract_entities'])
        for m in abstract_entities:
          entity = m['WikidataId']
          feature['abstract_entities'].append(entity_vocab.id(entity))
          feature['abstract_entity_types'].append(entity_type_vocab.id(m['Type']))
      except Exception:
        pass
      if FLAGS.record_padded:
        for key in ['title_entities', 'title_entity_types']:
          feature[key] = pad(feature[key], FLAGS.max_title_entities)
        for key in ['abstract_entities', 'abstract_entity_types']:
          feature[key] = pad(feature[key], FLAGS.max_abstract_entities)
      # feature['impression_prev_cat'] = prev_cat
      # feature['impression_prev_sub_cat'] = prev_sub_cat
      # prev_cat = cat_vocab.id(news['cat'])
      # prev_sub_cat = scat_vocab.id(news['sub_cat'])
      # feature['impression_cat_ratio'] = recall_cats[news['cat']] / len(impressions)
      # feature['impression_sub_cat_ratio'] = recall_sub_cats[news['sub_cat']] / len(impressions)
      if FLAGS.use_impressions:
        feature['impressions'] = feature['impressions'][max(0, i - 5):min(len(impressions), i + 4)]
        if FLAGS.record_padded:
          feature['impressions'] = gezi.pad(feature['impressions'], FLAGS.max_impressions)
      feature_ = {}
      for key in feature:
        feature_[key] = feature[key]
        # tf.train.Feature does not accept empty lists, so pad with a placeholder value
        if isinstance(feature[key], (list, tuple)) and not feature[key]:
          feature_[key] = [X]
      for key in feature_:
        try:
          feature_[key] = melt.gen_feature(feature_[key])
        except Exception:
          print(key, feature[key])
          print(traceback.format_exc())
          exit(0)
      record = tf.train.Example(features=tf.train.Features(feature=feature_))
      if FLAGS.mark == 'train' and FLAGS.train_by_day:
        writer = writers[day]
      writer.write(record)
      if FLAGS.mark == 'train' and FLAGS.train_by_day:
        num_records_list[day] += 1
      else:
        num_records += 1
  if FLAGS.mark == 'train' and FLAGS.train_by_day:
    for i in range(num_days):
      writers[i].close()
      if num_records_list[i] == 0:
        os.system('rm -rf %s' % ofiles[i])
      else:
        ofile2 = ofiles[i].replace('.TMP', f'.{num_records_list[i]}')
        os.system('mv %s %s' % (ofiles[i], ofile2))
  else:
    writer.close()
    if num_records == 0:
      os.system('rm -rf %s' % ofile)
    else:
      ofile2 = ofile.replace('.TMP', f'.{num_records}')
      os.system('mv %s %s' % (ofile, ofile2))
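# The writers above rename finished shards from record_<index>.TMP to
# record_<index>.<num_records>, so a folder's total example count can be recovered
# from file names alone. A small helper sketch built on that convention:
import glob
import os

def count_records(folder):
  return sum(int(os.path.basename(f).split('.')[-1])
             for f in glob.glob(f'{folder}/record_*.*')
             if not f.endswith('.TMP'))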