Example #1
File: runprep.py  Project: hldai/realmcp
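Reads text blocks from a TFRecord file, tokenizes each block with a BERT FullTokenizer, truncates each token-id sequence to max_seq_len, and pickles the list of sequences.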
def doc_texts_to_token_ids():
    max_seq_len = 128
    output_file = os.path.join(config.DATA_DIR, 'realm_data/blocks_tok_id_seqs.pkl')
    tfr_text_docs_file = os.path.join(config.DATA_DIR, 'realm_data/blocks.tfr')
    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/bert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)

    blocks_dataset = tf.data.TFRecordDataset(tfr_text_docs_file, buffer_size=512 * 1024 * 1024)

    tok_id_seqs = list()
    for i, v in enumerate(blocks_dataset):
        # print(v)
        v = v.numpy()
        v = v.decode('utf-8')
        tokens = tokenizer.tokenize(v)
        # print(tokens)
        # print(len(tokens))
        token_ids = np.array(tokenizer.convert_tokens_to_ids(tokens), dtype=np.int32)
        # print(type(token_ids))
        if len(token_ids) > max_seq_len:
            token_ids = token_ids[:max_seq_len]
        tok_id_seqs.append(token_ids)
        # if i > 3:
        #     break
        if i % 10000 == 0:
            print(i)

    datautils.save_pickle_data(tok_id_seqs, output_file)
Example #2
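The constructor of a tokenizer wrapper class: it builds a FullTokenizer and caches the vocabulary ids of the [CLS], [SEP], and [MASK] special tokens.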
    def __init__(self, vocab_path, do_lower_case):
        if isinstance(vocab_path, bytes):
            vocab_path = vocab_path.decode()

        self.vocab_path = vocab_path
        self.do_lower_case = do_lower_case

        self._base_tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)

        # Look up special tokens.
        self.cls_id, self.sep_id, self.mask_id = (
            self._base_tokenizer.convert_tokens_to_ids(
                ['[CLS]', '[SEP]', '[MASK]']))
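For reference, a minimal sketch of the same special-token lookup done directly on a FullTokenizer; the vocab path here is an assumption, not taken from the project:

from locbert import tokenization

vocab_file = '/path/to/assets/vocab.txt'  # assumed path, substitute your module's vocab
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
cls_id, sep_id, mask_id = tokenizer.convert_tokens_to_ids(
    ['[CLS]', '[SEP]', '[MASK]'])
print(cls_id, sep_id, mask_id)  # 101 102 103 with the standard uncased BERT vocab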
Example #3
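Tokenizes a sample sentence, builds a retrieval graph over a fixed batch of token ids, and evaluates it in a TF1-style session, printing the retriever outputs and the names of all global variables.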
def train_fet_sess():
    # run_train()
    retriever_beam_size = 5
    embedder_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/embedder'
    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/locbert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')

    # tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(reader_module_path)
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)
    tokens = tokenizer.tokenize('He is a teacher.')
    print(tokens)
    tokens_full = ['[CLS]'] + tokens + ['[SEP]']
    print(tokenizer.convert_tokens_to_ids(tokens_full))

    token_ids = tf.constant([[101, 2002, 2003, 1037, 3836, 1012, 102]],
                            dtype=tf.int32)
    with tf.device("/cpu:0"):
        retriever_outputs = retrieve(token_ids, embedder_module_path, True,
                                     retriever_beam_size)

    # # mode = tf.estimator.ModeKeys.TRAIN
    # mode = tf.estimator.ModeKeys.PREDICT
    # reader_module = hub.Module(
    #     reader_module_path,
    #     tags={"train"} if mode == tf.estimator.ModeKeys.TRAIN else {},
    #     trainable=True)
    #
    # mask = tf.constant([[1, 1, 1, 1, 1, 1, 1]], dtype=tf.int32)
    # segment_ids = tf.constant([[0, 0, 0, 0, 0, 0, 0]], dtype=tf.int32)
    #
    # concat_outputs = reader_module(
    #     dict(
    #         input_ids=token_ids,
    #         input_mask=mask,
    #         segment_ids=segment_ids),
    #     signature="tokens",
    #     as_dict=True)
    #
    # concat_token_emb = concat_outputs["sequence_output"]
    # trainable_vars = tf.trainable_variables()
    sess = tf.compat.v1.Session()
    init = tf.compat.v1.global_variables_initializer()
    sess.run(init)
    # print(sess.run(concat_token_emb))
    print(sess.run(retriever_outputs))
    for var in tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES):
        print(var.name)  # print the name of each global variable
Example #4
File: debug.py  Project: hldai/realmcp
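A debugging routine: tokenizes a sample sentence, loads a BERT TF-Hub module to build token embeddings, and evaluates a trivial constant expression in a TF1-style session.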
def tf_test():
    import os
    import tensorflow as tf
    import tensorflow_hub as hub
    from locbert import tokenization

    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/locbert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')

    # tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(reader_module_path)
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)
    tokens = tokenizer.tokenize('He is a teacher.')
    print(tokens)
    tokens_full = ['[CLS]'] + tokens + ['[SEP]']
    print(tokenizer.convert_tokens_to_ids(tokens_full))

    mode = tf.estimator.ModeKeys.TRAIN
    reader_module = hub.Module(
        reader_module_path,
        tags={"train"} if mode == tf.estimator.ModeKeys.TRAIN else {},
        trainable=True)

    token_ids = tf.constant([[101, 2002, 2003, 1037, 3836, 1012, 102]],
                            dtype=tf.int32)
    mask = tf.constant([[1, 1, 1, 1, 1, 1, 1]], dtype=tf.int32)
    segment_ids = tf.constant([[0, 0, 0, 0, 0, 0, 0]], dtype=tf.int32)

    concat_outputs = reader_module(dict(input_ids=token_ids,
                                        input_mask=mask,
                                        segment_ids=segment_ids),
                                   signature="tokens",
                                   as_dict=True)

    concat_token_emb = concat_outputs["sequence_output"]

    # Trivial constant graph to sanity-check session execution.
    a = tf.constant(3)
    b = tf.constant(4)
    c = a + b
    sess = tf.compat.v1.Session()
    init = tf.compat.v1.global_variables_initializer()
    sess.run(init)
    # print(sess.run(concat_token_emb))
    print(sess.run(c))  # prints 7
Example #5
File: tftest.py  Project: hldai/realmcp
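An Estimator input function: it reads mention texts and their type labels from a JSON-lines file, tokenizes them, batches the token-id sequences as ragged tensors, and wraps a Python generator in a tf.data.Dataset.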
def input_fn():
    import json
    import os
    from locbert import tokenization

    batch_size = 4
    data_file = '/data/hldai/data/ultrafine/uf_data/crowd/test.json'
    type_vocab_file = '/data/hldai/data/ultrafine/uf_data/ontology/types.txt'
    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/bert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)

    types, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    n_types = len(types)

    # texts = ['He is a teacher.',
    #          'He teaches his students.',
    #          'He is a lawyer.']
    texts = list()

    all_labels = list()
    with open(data_file, encoding='utf-8') as f:
        for i, line in enumerate(f):
            x = json.loads(line)
            text = '{} {} {}'.format(
                ' '.join(x['left_context_token']), x['mention_span'], ' '.join(x['right_context_token']))
            # print(text)
            texts.append(text)
            labels = x['y_str']
            tids = [type_id_dict.get(t, -1) for t in labels]
            tids = [tid for tid in tids if tid > -1]
            # if i > 5:
            all_labels.append(tids)
            if len(texts) >= 8:
                break
    print(len(texts), 'texts')

    def tok_id_seq_gen():
        tok_id_seqs = list()
        y_vecs = list()
        for i, text in enumerate(texts):
            tokens = tokenizer.tokenize(text)
            # print(tokens)
            tokens_full = ['[CLS]'] + tokens + ['[SEP]']
            tok_id_seq = tokenizer.convert_tokens_to_ids(tokens_full)
            # tok_id_seq = np.array([len(text)], np.float32)
            tok_id_seqs.append(tok_id_seq)
            y_vecs.append(to_one_hot(all_labels[i], n_types))
            if len(tok_id_seqs) >= batch_size:
                # tok_id_seq_batch, input_mask = get_padded_bert_input(tok_id_seqs)
                tok_id_seq_batch = tf.ragged.constant(tok_id_seqs)
                # y_vecs_tensor = tf.concat(y_vecs)
                yield {'tok_id_seq_batch': tok_id_seq_batch,
                       # 'input_mask': input_mask,
                       # Cast to float32 to match the dataset's output_signature.
                       'vals': np.random.uniform(-1, 1, (3, 5)).astype(np.float32)}, y_vecs
                tok_id_seqs = list()
                y_vecs = list()
        if len(tok_id_seqs) > 0:
            # tok_id_seq_batch, input_mask = get_padded_bert_input(tok_id_seqs)
            # y_vecs_tensor = tf.concat(y_vecs)
            tok_id_seq_batch = tf.ragged.constant(tok_id_seqs)
            yield {'tok_id_seq_batch': tok_id_seq_batch,
                   # 'input_mask': input_mask,
                   'vals': np.random.uniform(-1, 1, (3, 5)).astype(np.float32)}, y_vecs

    # for v in iter(tok_id_seq_gen()):
    #     print(v)
    dataset = tf.data.Dataset.from_generator(
        tok_id_seq_gen,
        output_signature=(
            {
                'tok_id_seq_batch': tf.RaggedTensorSpec(dtype=tf.int32, ragged_rank=1),
                # 'tok_id_seq_batch': tf.TensorSpec(shape=None, dtype=tf.int32),
                # 'input_mask': tf.TensorSpec(shape=None, dtype=tf.int32),
                'vals': tf.TensorSpec(shape=None, dtype=tf.float32)
            },
            tf.TensorSpec(shape=None, dtype=tf.float32)))

    return dataset
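A minimal sketch of consuming this dataset, assuming TF2 eager execution; the feature names match the ones yielded above:

dataset = input_fn()
for features, y_vecs in dataset.take(2):
    # tok_id_seq_batch is a RaggedTensor of shape [batch, None].
    print(features['tok_id_seq_batch'].bounding_shape())
    print(features['vals'].shape)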
Example #6
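Configures a tf.estimator.Estimator for retrieval-augmented entity typing: it builds the tokenizer, loads the type vocabulary, preloads the block embeddings, and then runs either train_and_evaluate or prediction, depending on mode.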
def train_fet(block_records_path, block_emb_file, block_labels_file, model_dir,
              mode, log_file_name):
    __setup_logging(log_file_name, mode == 'train')
    logging.info(block_records_path)
    logging.info(block_emb_file)
    logging.info(model_dir)
    logging.info(mode)
    # logfile = os.path.join(output_dir, 'log/realm_et.log')
    # logger = tf.get_logger()
    # # logger.setLevel('ERROR')
    # logger.setLevel('INFO')
    # logger.addHandler(logging.FileHandler(logfile, mode='a'))
    # logger.propagate = False

    # run_train()

    batch_size = 1
    retriever_beam_size = 5
    num_train_steps = 100000
    n_train_repeat = 100
    save_checkpoints_steps = 1000
    log_step_count_steps = 100
    tf_random_seed = 1355
    embedder_module_path = os.path.join(
        data_dir, 'realm_data/cc_news_pretrained/embedder')
    reader_module_path = os.path.join(data_dir,
                                      'realm_data/cc_news_pretrained/bert')
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    # model_dir = os.path.join(config.OUTPUT_DIR, 'tmp/tmpmodels')
    # model_dir = os.path.join(output_dir, 'etdmodels')
    type_vocab_file = os.path.join(config.DATA_DIR,
                                   'ultrafine/uf_data/ontology/types.txt')

    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)
    sep_tok_id = tokenizer.convert_tokens_to_ids(['[SEP]'])[0]
    print('sep token id', sep_tok_id)
    types, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    n_types = len(types)

    params = {
        'lr': 1e-5,
        'batch_size': batch_size,
        'max_seq_len': 256,
        'bert_dim': 768,
        'retriever_beam_size': retriever_beam_size,
        'n_types': n_types,
        'sep_tok_id': sep_tok_id,
        'embedder_module_path': embedder_module_path,
        'reader_module_path': reader_module_path,
        'num_train_steps': num_train_steps,
        'train_log_steps': 100,
        'eval_log_steps': 500,
        'num_block_records': 2000000,
        'block_records_path': block_records_path,
    }

    assert batch_size == 1
    init_pre_load_data(block_emb_file, block_labels_file, type_id_dict)
    # print(pre_load_data['np_db'].shape)
    params['num_block_records'] = pre_load_data['np_db'].shape[0]
    # exit()
    input_data = InputData(batch_size, tokenizer, types, type_id_dict,
                           retriever_beam_size, n_train_repeat)

    model_fn_use = model_fn if block_labels_file is None else model_fn_zlabels

    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        log_step_count_steps=log_step_count_steps,
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=None,
        tf_random_seed=tf_random_seed)
    estimator = tf.estimator.Estimator(config=run_config,
                                       model_fn=model_fn_use,
                                       params=params,
                                       model_dir=model_dir)
    # estimator.train(input_fn)
    # estimator.evaluate(input_fn)

    train_spec = tf.estimator.TrainSpec(input_fn=input_data.input_fn_train,
                                        max_steps=num_train_steps)
    eval_spec = tf.estimator.EvalSpec(
        name="default",
        input_fn=input_data.input_fn_test,
        # exporters=exporters,
        # start_delay_secs=FLAGS.eval_start_delay_secs,
        # throttle_secs=FLAGS.eval_throttle_secs
    )

    # estimator.evaluate(input_data.input_fn_test)
    if mode == 'train':
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    elif mode == 'predict':
        predict_results(estimator, input_data.input_fn_test)
Example #7
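Builds a FullTokenizer from the tokenization info (vocab file and casing) exposed by a TF-Hub module.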
def get_tokenizer(module_handle):
    tokenization_info = get_tokenization_info(module_handle)
    return tokenization.FullTokenizer(
        vocab_file=tokenization_info["vocab_file"],
        do_lower_case=tokenization_info["do_lower_case"])
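A minimal usage sketch, assuming module_handle is the path of a BERT TF-Hub module like the ones used in the examples above:

tokenizer = get_tokenizer('/data/hldai/data/realm_data/cc_news_pretrained/bert')
tokens = tokenizer.tokenize('He is a teacher.')
print(tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]']))
# Prints [101, 2002, 2003, 1037, 3836, 1012, 102] with the standard uncased vocab.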