def check_sents():
    type_vocab_file = os.path.join(config.DATA_DIR, 'ultrafine/uf_data/ontology/types.txt')
    type_sents_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents.txt.gz')
    # output_file is defined here but this check function does not write to it yet
    output_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents-filter.txt')
    type_vocab, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    type_set = get_type_str_dict(type_vocab)

    f = gzip.open(type_sents_file, 'rt', encoding='utf-8')
    for i, line in enumerate(f):
        sent = line.strip()
        words = sent.split(' ')
        n_words = len(words)
        keep = False
        for j in range(n_words):
            cur_word = words[j].lower()
            if cur_word not in type_set:
                continue
            # keep the sentence when the type word appears in a hypernym-like pattern:
            # preceded by "a", "the", "and other", "and some other", or followed by "such as"
            if ends_with(words, j - 1, ['a']) or ends_with(words, j - 1, ['the']) or ends_with(
                    words, j - 1, ['and', 'other']) or ends_with(words, j - 1, ['and', 'some', 'other']):
                keep = True
            if starts_with(words, j + 1, ['such', 'as']):
                keep = True
        if i > 10:
            break
    f.close()
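# The helpers below are referenced by check_sents and filter_not_noun_sents but are not defined
# in this section. These are minimal sketches of the assumed behavior, not the original
# implementations: get_type_str_dict maps the space-separated surface form of each type to the
# type itself, and ends_with / starts_with test whether the words just before / after a position
# match a given word pattern.
def get_type_str_dict(type_vocab):
    return {t.replace('_', ' ').lower(): t for t in type_vocab}


def ends_with(words, pos, pattern):
    # True if words[pos - len(pattern) + 1 .. pos] equals pattern (case-insensitive)
    if pos - len(pattern) + 1 < 0:
        return False
    return [w.lower() for w in words[pos - len(pattern) + 1:pos + 1]] == pattern


def starts_with(words, pos, pattern):
    # True if words[pos .. pos + len(pattern) - 1] equals pattern (case-insensitive)
    if pos + len(pattern) > len(words):
        return False
    return [w.lower() for w in words[pos:pos + len(pattern)]] == pattern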
def __init__(self, type_vocab_file, word_vecs_file):
    self.type_vocab, self.type_id_dict = datautils.load_vocab_file(type_vocab_file)
    self.parent_type_ids_dict = fetutils.get_parent_type_ids_dict(self.type_id_dict)
    self.n_types = len(self.type_vocab)

    if word_vecs_file is not None:
        import config

        print('loading {} ...'.format(word_vecs_file), end=' ', flush=True)
        self.token_vocab, self.token_vecs = datautils.load_pickle_data(word_vecs_file)
        self.token_id_dict = {t: i for i, t in enumerate(self.token_vocab)}
        print('done', flush=True)
        self.zero_pad_token_id = self.token_id_dict[config.TOKEN_ZERO_PAD]
        self.mention_token_id = self.token_id_dict[config.TOKEN_MENTION]
        self.unknown_token_id = self.token_id_dict[config.TOKEN_UNK]
def blocks_from_webisa():
    import tensorflow as tf
    import inflect

    output_tfr_file = os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter.tfr')
    output_labels_file = os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter_labels.txt')
    wia_file = os.path.join(config.DATA_DIR, 'weakz/webisa_context_full.txt')
    uf_type_vocab_file = os.path.join(config.DATA_DIR, 'ultrafine/uf_data/ontology/types.txt')
    type_vocab, type_id_dict = datautils.load_vocab_file(uf_type_vocab_file)

    # overly generic hypernym terms that are skipped even though they match a type
    filter_types = {'person', 'people', 'man', 'thing', 'stuff', 'location', 'organization',
                    'men', 'things', 'locations', 'organizations'}

    # map both the singular and the plural surface form of each type to the type itself
    inflect_eng = inflect.engine()
    all_type_terms_dict = {t.replace('_', ' '): t for t in type_vocab}
    for t in type_vocab:
        t = t.replace('_', ' ')
        tp = inflect_eng.plural(t)
        if tp not in all_type_terms_dict:
            all_type_terms_dict[tp] = t

    cnt, filter_cnt = 0, 0
    keep_cnt = 0
    f = open(wia_file, encoding='utf-8')
    foutl = open(output_labels_file, 'w', encoding='utf-8')
    with tf.io.TFRecordWriter(output_tfr_file) as file_writer:
        for i, line in enumerate(f):
            cnt += 1
            parts = line.strip().split('\t')
            hyp_term = parts[1].strip()
            label = all_type_terms_dict.get(hyp_term, None)
            if label is None:
                continue
            if hyp_term in filter_types:
                filter_cnt += 1
                continue
            keep_cnt += 1
            # write the context (last column) as one TFRecord and its type label to the labels file
            file_writer.write(tf.constant(parts[-1].strip()).numpy())
            foutl.write('{}\n'.format(label))
    f.close()
    foutl.close()
    print(keep_cnt, cnt)
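# Optional sanity check (a sketch, not part of the original pipeline): read the TFRecord file
# written by blocks_from_webisa back with tf.data.TFRecordDataset and confirm that the number of
# serialized context records matches the number of lines in the labels file.
def check_webisa_blocks_output():
    import tensorflow as tf

    output_tfr_file = os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter.tfr')
    output_labels_file = os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter_labels.txt')
    n_records = sum(1 for _ in tf.data.TFRecordDataset(output_tfr_file))
    with open(output_labels_file, encoding='utf-8') as f:
        n_labels = sum(1 for _ in f)
    print(n_records, 'records,', n_labels, 'labels')
    assert n_records == n_labels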
def input_fn():
    import json
    from locbert import tokenization

    batch_size = 4
    data_file = '/data/hldai/data/ultrafine/uf_data/crowd/test.json'
    type_vocab_file = '/data/hldai/data/ultrafine/uf_data/ontology/types.txt'
    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/bert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)

    types, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    n_types = len(types)

    texts = list()
    all_labels = list()
    with open(data_file, encoding='utf-8') as f:
        for i, line in enumerate(f):
            x = json.loads(line)
            # reconstruct the full sentence: left context + mention span + right context
            text = '{} {} {}'.format(
                ' '.join(x['left_context_token']), x['mention_span'], ' '.join(x['right_context_token']))
            texts.append(text)
            labels = x['y_str']
            tids = [type_id_dict.get(t, -1) for t in labels]
            tids = [tid for tid in tids if tid > -1]
            all_labels.append(tids)
            if len(texts) >= 8:
                break
    print(len(texts), 'texts')

    def tok_id_seq_gen():
        tok_id_seqs = list()
        y_vecs = list()
        for i, text in enumerate(texts):
            tokens = tokenizer.tokenize(text)
            tokens_full = ['[CLS]'] + tokens + ['[SEP]']
            tok_id_seq = tokenizer.convert_tokens_to_ids(tokens_full)
            tok_id_seqs.append(tok_id_seq)
            y_vecs.append(to_one_hot(all_labels[i], n_types))
            if len(tok_id_seqs) >= batch_size:
                tok_id_seq_batch = tf.ragged.constant(tok_id_seqs)
                yield {'tok_id_seq_batch': tok_id_seq_batch,
                       'vals': np.random.uniform(-1, 1, (3, 5))}, y_vecs
                tok_id_seqs = list()
                y_vecs = list()
        # yield the last, possibly smaller, batch
        if len(tok_id_seqs) > 0:
            tok_id_seq_batch = tf.ragged.constant(tok_id_seqs)
            yield {'tok_id_seq_batch': tok_id_seq_batch,
                   'vals': np.random.uniform(-1, 1, (3, 5))}, y_vecs

    dataset = tf.data.Dataset.from_generator(
        tok_id_seq_gen,
        output_signature=(
            {
                'tok_id_seq_batch': tf.RaggedTensorSpec(dtype=tf.int32, ragged_rank=1),
                'vals': tf.TensorSpec(shape=None, dtype=tf.float32)
            },
            tf.TensorSpec(shape=None, dtype=tf.float32)))
    return dataset
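# to_one_hot is called by input_fn but not defined in this section; this is a minimal sketch of
# the assumed behavior: turn a list of type ids into a dense float vector of length n_types.
def to_one_hot(type_ids, n_types):
    vec = np.zeros(n_types, np.float32)
    for tid in type_ids:
        vec[tid] = 1.0
    return vec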
def train_fet(block_records_path, block_emb_file, block_labels_file, model_dir, mode, log_file_name):
    __setup_logging(log_file_name, mode == 'train')
    logging.info(block_records_path)
    logging.info(block_emb_file)
    logging.info(model_dir)
    logging.info(mode)

    batch_size = 1
    retriever_beam_size = 5
    num_train_steps = 100000
    n_train_repeat = 100
    save_checkpoints_steps = 1000
    log_step_count_steps = 100
    tf_random_seed = 1355
    embedder_module_path = os.path.join(data_dir, 'realm_data/cc_news_pretrained/embedder')
    reader_module_path = os.path.join(data_dir, 'realm_data/cc_news_pretrained/bert')
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    type_vocab_file = os.path.join(config.DATA_DIR, 'ultrafine/uf_data/ontology/types.txt')

    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)
    sep_tok_id = tokenizer.convert_tokens_to_ids(['[SEP]'])[0]
    print('sep token id', sep_tok_id)
    types, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    n_types = len(types)

    params = {
        'lr': 1e-5,
        'batch_size': batch_size,
        'max_seq_len': 256,
        'bert_dim': 768,
        'retriever_beam_size': retriever_beam_size,
        'n_types': n_types,
        'sep_tok_id': sep_tok_id,
        'embedder_module_path': embedder_module_path,
        'reader_module_path': reader_module_path,
        'num_train_steps': num_train_steps,
        'train_log_steps': 100,
        'eval_log_steps': 500,
        'num_block_records': 2000000,
        'block_records_path': block_records_path,
    }

    assert batch_size == 1

    init_pre_load_data(block_emb_file, block_labels_file, type_id_dict)
    # the actual number of blocks comes from the pre-loaded block embedding matrix
    params['num_block_records'] = pre_load_data['np_db'].shape[0]

    input_data = InputData(batch_size, tokenizer, types, type_id_dict, retriever_beam_size, n_train_repeat)
    # use the model_fn variant that consumes block labels when a labels file is provided
    model_fn_use = model_fn if block_labels_file is None else model_fn_zlabels

    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        log_step_count_steps=log_step_count_steps,
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=None,
        tf_random_seed=tf_random_seed)
    estimator = tf.estimator.Estimator(
        config=run_config, model_fn=model_fn_use, params=params, model_dir=model_dir)

    train_spec = tf.estimator.TrainSpec(input_fn=input_data.input_fn_train, max_steps=num_train_steps)
    eval_spec = tf.estimator.EvalSpec(name="default", input_fn=input_data.input_fn_test)

    if mode == 'train':
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    elif mode == 'predict':
        predict_results(estimator, input_data.input_fn_test)
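# Hypothetical invocation of train_fet, shown only to illustrate the expected arguments. The
# specific block embedding file name, model directory, and log file name below are assumptions,
# not paths confirmed by this code.
# train_fet(
#     block_records_path=os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter.tfr'),
#     block_emb_file=os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter_emb.pkl'),
#     block_labels_file=os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter_labels.txt'),
#     model_dir=os.path.join(config.OUTPUT_DIR, 'etdmodels'),
#     mode='train',
#     log_file_name='realm_et.log')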
def filter_not_noun_sents():
    # output paths (not written by the loop below yet)
    output_types_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents-s01-filter-types.txt')
    output_sents_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents-s01-filter.txt')
    type_vocab_file = os.path.join(config.DATA_DIR, 'ultrafine/uf_data/ontology/types.txt')
    type_sents_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents.txt.gz')
    pos_tags_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents-postag-s01.txt')
    type_vocab, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    type_dict = get_type_str_dict(type_vocab)

    # context patterns that suggest the matched word is used as a type mention
    ends_with_words = [['a'], ['the'], ['and', 'other'], ['and', 'some', 'other'], ['and', 'any', 'other']]
    starts_with_words = [['such', 'as']]

    sent_id = -1
    sent = None
    keep_cnt = 0
    f = gzip.open(type_sents_file, 'rt', encoding='utf-8')
    f_pos = open(pos_tags_file, encoding='utf-8')
    for i, line in enumerate(f_pos):
        pos_tags = line.strip().split(' ')
        cur_sent_id = int(pos_tags[0])
        pos_tags = pos_tags[1:]

        # advance the sentence file until it is aligned with the current POS-tag line
        while sent_id < cur_sent_id:
            sent = next(f).strip()
            sent_id += 1

        words = sent.split(' ')
        n_words = len(words)
        assert n_words == len(pos_tags)

        keep = False
        for j in range(n_words):
            cur_word = words[j].lower()
            t = type_dict.get(cur_word, None)
            if t is None:
                continue
            # only count the match if the word is tagged as a noun
            if pos_tags[j] not in {'NN', 'NNP', 'NNPS'}:
                continue
            for e_words in ends_with_words:
                if ends_with(words, j - 1, e_words):
                    keep = True
                    break
            if keep:
                break
            for s_words in starts_with_words:
                if starts_with(words, j + 1, s_words):
                    keep = True
                    break
            if keep:
                break

        if keep:
            keep_cnt += 1
        if i % 10000 == 0:
            print(i, keep_cnt)
    f.close()
    f_pos.close()