def doc_texts_to_token_ids():
    """Tokenize the text blocks in a TFRecord file and save the token-id sequences as a pickle."""
    max_seq_len = 128
    output_file = os.path.join(config.DATA_DIR, 'realm_data/blocks_tok_id_seqs.pkl')
    tfr_text_docs_file = os.path.join(config.DATA_DIR, 'realm_data/blocks.tfr')
    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/bert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)

    blocks_dataset = tf.data.TFRecordDataset(
        tfr_text_docs_file, buffer_size=512 * 1024 * 1024)

    tok_id_seqs = list()
    for i, v in enumerate(blocks_dataset):
        # Each record is a UTF-8 encoded text block.
        v = v.numpy().decode('utf-8')
        tokens = tokenizer.tokenize(v)
        token_ids = np.array(tokenizer.convert_tokens_to_ids(tokens), dtype=np.int32)
        # Truncate to the maximum sequence length.
        if len(token_ids) > max_seq_len:
            token_ids = token_ids[:max_seq_len]
        tok_id_seqs.append(token_ids)
        if i % 10000 == 0:
            print(i)
    datautils.save_pickle_data(tok_id_seqs, output_file)
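
# Illustrative sketch (not part of the original pipeline): reading the saved token-id
# sequences back, assuming datautils.save_pickle_data writes a standard pickle file.
# The function name below is hypothetical; the path mirrors output_file above.
def _load_token_id_seqs_example():
    import pickle
    path = os.path.join(config.DATA_DIR, 'realm_data/blocks_tok_id_seqs.pkl')
    with open(path, 'rb') as f:
        tok_id_seqs = pickle.load(f)
    print(len(tok_id_seqs), 'sequences; first sequence length:', len(tok_id_seqs[0]))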
def __init__(self, vocab_path, do_lower_case):
    if isinstance(vocab_path, bytes):
        vocab_path = vocab_path.decode()
    self.vocab_path = vocab_path
    self.do_lower_case = do_lower_case
    self._base_tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=do_lower_case)
    # Look up special tokens.
    self.cls_id, self.sep_id, self.mask_id = (
        self._base_tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]', '[MASK]']))
def train_fet_sess():
    retriever_beam_size = 5
    embedder_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/embedder'
    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/locbert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)

    tokens = tokenizer.tokenize('He is a teacher.')
    print(tokens)
    tokens_full = ['[CLS]'] + tokens + ['[SEP]']
    print(tokenizer.convert_tokens_to_ids(tokens_full))

    token_ids = tf.constant([[101, 2002, 2003, 1037, 3836, 1012, 102]], dtype=tf.int32)
    # Run retrieval on the CPU.
    with tf.device("/cpu:0"):
        retriever_outputs = retrieve(
            token_ids, embedder_module_path, True, retriever_beam_size)

    sess = tf.compat.v1.Session()
    init = tf.compat.v1.global_variables_initializer()
    sess.run(init)
    print(sess.run(retriever_outputs))

    # Print the names of all global variables in the graph.
    for i in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES):
        print(i.name)
def tf_test():
    import tensorflow as tf
    import tensorflow_hub as hub
    from locbert import tokenization

    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/locbert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)

    tokens = tokenizer.tokenize('He is a teacher.')
    print(tokens)
    tokens_full = ['[CLS]'] + tokens + ['[SEP]']
    print(tokenizer.convert_tokens_to_ids(tokens_full))

    mode = tf.estimator.ModeKeys.TRAIN
    reader_module = hub.Module(
        reader_module_path,
        tags={"train"} if mode == tf.estimator.ModeKeys.TRAIN else {},
        trainable=True)

    token_ids = tf.constant([[101, 2002, 2003, 1037, 3836, 1012, 102]], dtype=tf.int32)
    mask = tf.constant([[1, 1, 1, 1, 1, 1, 1]], dtype=tf.int32)
    segment_ids = tf.constant([[0, 0, 0, 0, 0, 0, 0]], dtype=tf.int32)

    concat_outputs = reader_module(
        dict(input_ids=token_ids, input_mask=mask, segment_ids=segment_ids),
        signature="tokens",
        as_dict=True)
    concat_token_emb = concat_outputs["sequence_output"]

    a = tf.constant(3)
    b = tf.constant(4)
    c = a + b

    sess = tf.compat.v1.Session()
    init = tf.compat.v1.global_variables_initializer()
    sess.run(init)
    # Evaluate the tensor `c`.
    # print(sess.run(concat_token_emb))
    print(sess.run(c))  # prints 7
def input_fn():
    import json
    from locbert import tokenization

    batch_size = 4
    data_file = '/data/hldai/data/ultrafine/uf_data/crowd/test.json'
    type_vocab_file = '/data/hldai/data/ultrafine/uf_data/ontology/types.txt'
    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/bert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)

    types, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    n_types = len(types)

    texts = list()
    all_labels = list()
    with open(data_file, encoding='utf-8') as f:
        for i, line in enumerate(f):
            x = json.loads(line)
            # Reconstruct the full sentence: left context, mention span, right context.
            text = '{} {} {}'.format(
                ' '.join(x['left_context_token']),
                x['mention_span'],
                ' '.join(x['right_context_token']))
            texts.append(text)
            labels = x['y_str']
            # Map label strings to type ids, dropping labels not in the vocabulary.
            tids = [type_id_dict.get(t, -1) for t in labels]
            tids = [tid for tid in tids if tid > -1]
            all_labels.append(tids)
            if len(texts) >= 8:
                break
    print(len(texts), 'texts')

    def tok_id_seq_gen():
        tok_id_seqs = list()
        y_vecs = list()
        for i, text in enumerate(texts):
            tokens = tokenizer.tokenize(text)
            tokens_full = ['[CLS]'] + tokens + ['[SEP]']
            tok_id_seq = tokenizer.convert_tokens_to_ids(tokens_full)
            tok_id_seqs.append(tok_id_seq)
            y_vecs.append(to_one_hot(all_labels[i], n_types))
            if len(tok_id_seqs) >= batch_size:
                tok_id_seq_batch = tf.ragged.constant(tok_id_seqs)
                yield {'tok_id_seq_batch': tok_id_seq_batch,
                       'vals': np.random.uniform(-1, 1, (3, 5))}, y_vecs
                tok_id_seqs = list()
                y_vecs = list()
        # Yield the final, possibly smaller, batch.
        if len(tok_id_seqs) > 0:
            tok_id_seq_batch = tf.ragged.constant(tok_id_seqs)
            yield {'tok_id_seq_batch': tok_id_seq_batch,
                   'vals': np.random.uniform(-1, 1, (3, 5))}, y_vecs

    dataset = tf.data.Dataset.from_generator(
        tok_id_seq_gen,
        output_signature=(
            {
                'tok_id_seq_batch': tf.RaggedTensorSpec(dtype=tf.int32, ragged_rank=1),
                'vals': tf.TensorSpec(shape=None, dtype=tf.float32)
            },
            tf.TensorSpec(shape=None, dtype=tf.float32)))
    return dataset
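
# Illustrative usage sketch (assumes eager execution, as in TF 2.x): iterate the
# generator-backed dataset returned by input_fn and inspect one batch. The function
# name below is hypothetical.
def _input_fn_usage_example():
    dataset = input_fn()
    for features, y_vecs in dataset.take(1):
        # 'tok_id_seq_batch' is a ragged batch of [CLS] ... [SEP] token-id sequences.
        print(features['tok_id_seq_batch'].shape)
        print(features['vals'].shape)
        print(y_vecs.shape)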
def train_fet(block_records_path, block_emb_file, block_labels_file, model_dir, mode, log_file_name):
    __setup_logging(log_file_name, mode == 'train')
    logging.info(block_records_path)
    logging.info(block_emb_file)
    logging.info(model_dir)
    logging.info(mode)

    batch_size = 1
    retriever_beam_size = 5
    num_train_steps = 100000
    n_train_repeat = 100
    save_checkpoints_steps = 1000
    log_step_count_steps = 100
    tf_random_seed = 1355
    embedder_module_path = os.path.join(data_dir, 'realm_data/cc_news_pretrained/embedder')
    reader_module_path = os.path.join(data_dir, 'realm_data/cc_news_pretrained/bert')
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    type_vocab_file = os.path.join(config.DATA_DIR, 'ultrafine/uf_data/ontology/types.txt')

    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)
    sep_tok_id = tokenizer.convert_tokens_to_ids(['[SEP]'])[0]
    print('sep token id', sep_tok_id)

    types, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    n_types = len(types)

    params = {
        'lr': 1e-5,
        'batch_size': batch_size,
        'max_seq_len': 256,
        'bert_dim': 768,
        'retriever_beam_size': retriever_beam_size,
        'n_types': n_types,
        'sep_tok_id': sep_tok_id,
        'embedder_module_path': embedder_module_path,
        'reader_module_path': reader_module_path,
        'num_train_steps': num_train_steps,
        'train_log_steps': 100,
        'eval_log_steps': 500,
        'num_block_records': 2000000,
        'block_records_path': block_records_path,
    }
    assert batch_size == 1

    # Load the precomputed block embeddings (and optional block labels) into memory.
    init_pre_load_data(block_emb_file, block_labels_file, type_id_dict)
    params['num_block_records'] = pre_load_data['np_db'].shape[0]

    input_data = InputData(batch_size, tokenizer, types, type_id_dict,
                           retriever_beam_size, n_train_repeat)
    model_fn_use = model_fn if block_labels_file is None else model_fn_zlabels

    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        log_step_count_steps=log_step_count_steps,
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=None,
        tf_random_seed=tf_random_seed)
    estimator = tf.estimator.Estimator(
        config=run_config, model_fn=model_fn_use, params=params, model_dir=model_dir)

    train_spec = tf.estimator.TrainSpec(
        input_fn=input_data.input_fn_train, max_steps=num_train_steps)
    eval_spec = tf.estimator.EvalSpec(
        name="default",
        input_fn=input_data.input_fn_test)

    if mode == 'train':
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    elif mode == 'predict':
        predict_results(estimator, input_data.input_fn_test)
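
# Illustrative call (hypothetical paths; the real block/embedding files are produced
# elsewhere in the pipeline): train_fet wires the TFRecord block file, the precomputed
# block embeddings, and optional block labels into an Estimator train/evaluate or predict run.
def _train_fet_example():
    train_fet(
        block_records_path='/path/to/blocks.tfr',        # hypothetical path
        block_emb_file='/path/to/block_embeddings.pkl',  # hypothetical path
        block_labels_file=None,                          # None selects model_fn over model_fn_zlabels
        model_dir='/path/to/model_dir',                  # hypothetical path
        mode='train',
        log_file_name='train_fet.log')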
def get_tokenizer(module_handle):
    tokenization_info = get_tokenization_info(module_handle)
    return tokenization.FullTokenizer(
        vocab_file=tokenization_info["vocab_file"],
        do_lower_case=tokenization_info["do_lower_case"])
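
# Illustrative usage sketch: build a tokenizer from the reader module used elsewhere in
# this file. The module path is taken from the other functions above; the function name
# below is hypothetical.
def _get_tokenizer_example():
    tokenizer = get_tokenizer('/data/hldai/data/realm_data/cc_news_pretrained/bert')
    print(tokenizer.tokenize('He is a teacher.'))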