Example #1
def check_sents():
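    """Scan the gzipped type-sentence file and flag sentences in which a type
    word occurs inside a Hearst-style pattern ('a/the TYPE', '... and (some)
    other TYPE', 'TYPE such as ...'). The loop below only inspects the first
    few lines (debug break at the end)."""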
    type_vocab_file = os.path.join(config.DATA_DIR, 'ultrafine/uf_data/ontology/types.txt')
    type_sents_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents.txt.gz')
    output_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents-filter.txt')
    type_vocab, type_id_dict = datautils.load_vocab_file(type_vocab_file)

    type_set = get_type_str_dict(type_vocab)

    f = gzip.open(type_sents_file, 'rt', encoding='utf-8')
    for i, line in enumerate(f):
        # print(line.strip())
        sent = line.strip()

        words = sent.split(' ')
        n_words = len(words)
        keep = False
        for j in range(n_words):
            cur_word = words[j].lower()
            if cur_word not in type_set:
                continue
            if (ends_with(words, j - 1, ['a']) or ends_with(words, j - 1, ['the'])
                    or ends_with(words, j - 1, ['and', 'other'])
                    or ends_with(words, j - 1, ['and', 'some', 'other'])):
                keep = True
            if starts_with(words, j + 1, ['such', 'as']):
                keep = True
                # print(cur_word, '&', sent)
                # exit()

        if i > 10:
            break
    f.close()
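
The pattern checks above rely on ends_with and starts_with helpers that are not included in these snippets. The sketch below is a hypothetical implementation that matches how they are called here (a token-sequence match ending at / starting from the given index), not the project's own code:

def ends_with(words, end_idx, pattern):
    # True if the token sequence ending at end_idx (inclusive) equals pattern,
    # e.g. ends_with(words, j - 1, ['and', 'other']) checks for '... and other TYPE'.
    start = end_idx - len(pattern) + 1
    if start < 0:
        return False
    return [w.lower() for w in words[start:end_idx + 1]] == pattern


def starts_with(words, start_idx, pattern):
    # True if the token sequence beginning at start_idx equals pattern,
    # e.g. starts_with(words, j + 1, ['such', 'as']) checks for 'TYPE such as ...'.
    end = start_idx + len(pattern)
    if end > len(words):
        return False
    return [w.lower() for w in words[start_idx:end]] == pattern
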
Example #2
    def __init__(self, type_vocab_file, word_vecs_file):
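        """Load the type vocabulary and its parent-type index and, if
        word_vecs_file is given, the pretrained token vocabulary/vectors
        along with the ids of the zero-pad, mention, and unknown tokens."""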
        self.type_vocab, self.type_id_dict = datautils.load_vocab_file(
            type_vocab_file)
        self.parent_type_ids_dict = fetutils.get_parent_type_ids_dict(
            self.type_id_dict)
        self.n_types = len(self.type_vocab)

        if word_vecs_file is not None:
            import config

            print('loading {} ...'.format(word_vecs_file), end=' ', flush=True)
            self.token_vocab, self.token_vecs = datautils.load_pickle_data(
                word_vecs_file)
            self.token_id_dict = {t: i for i, t in enumerate(self.token_vocab)}
            print('done', flush=True)
            self.zero_pad_token_id = self.token_id_dict[config.TOKEN_ZERO_PAD]
            self.mention_token_id = self.token_id_dict[config.TOKEN_MENTION]
            self.unknown_token_id = self.token_id_dict[config.TOKEN_UNK]
Example #3
def blocks_from_webisa():
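    """Convert WebIsA hypernym contexts into a TFRecord file of retrieval
    blocks. Keep only rows whose hypernym term (singular or plural) maps to an
    ultra-fine type, skip the overly generic terms in filter_types, and write
    the matching type label of each kept block to a separate text file."""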
    import tensorflow as tf
    import inflect

    output_tfr_file = os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter.tfr')
    output_labels_file = os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter_labels.txt')
    wia_file = os.path.join(config.DATA_DIR, 'weakz/webisa_context_full.txt')
    uf_type_vocab_file = os.path.join(config.DATA_DIR, 'ultrafine/uf_data/ontology/types.txt')
    type_vocab, type_id_dict = datautils.load_vocab_file(uf_type_vocab_file)
    filter_types = {'person', 'people', 'man', 'thing', 'stuff', 'location', 'organization',
                    'men', 'things', 'locations', 'organizations'}

    inflect_eng = inflect.engine()
    all_type_terms_dict = {t.replace('_', ' '): t for t in type_vocab}
    for t in type_vocab:
        t = t.replace('_', ' ')
        tp = inflect_eng.plural(t)
        if tp not in all_type_terms_dict:
            all_type_terms_dict[tp] = t

    cnt, filter_cnt = 0, 0
    keep_cnt = 0
    f = open(wia_file, encoding='utf-8')
    foutl = open(output_labels_file, 'w', encoding='utf-8')
    with tf.io.TFRecordWriter(output_tfr_file) as file_writer:
        for i, line in enumerate(f):
            cnt += 1
            # print(line.strip())
            parts = line.strip().split('\t')
            hyp_term = parts[1].strip()
            label = all_type_terms_dict.get(hyp_term, None)
            if label is None:
                continue
            if hyp_term in filter_types:
                filter_cnt += 1
                continue

            # print(hyp_term, '*', parts[-1])
            keep_cnt += 1
            file_writer.write(tf.constant(parts[-1].strip()).numpy())
            foutl.write('{}\n'.format(label))
    f.close()
    foutl.close()
    print(keep_cnt, cnt)
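
Since each kept context string is written as a raw TFRecord entry above, the file can be read back with tf.data.TFRecordDataset. A minimal sketch (the function name is illustrative, not part of the project):

import tensorflow as tf

def read_webisa_blocks(tfr_file, labels_file):
    # Each record is the raw UTF-8 context string written by blocks_from_webisa;
    # labels_file holds one type label per kept block, in the same order.
    blocks = [rec.numpy().decode('utf-8')
              for rec in tf.data.TFRecordDataset(tfr_file)]
    with open(labels_file, encoding='utf-8') as f:
        labels = [line.strip() for line in f]
    assert len(blocks) == len(labels)
    return blocks, labels
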
Example #4
File: tftest.py Project: hldai/realmcp
def input_fn():
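    """Build a small tf.data.Dataset for testing: read a few crowd-annotated
    examples, tokenize them with the BERT tokenizer, and yield ragged batches
    of token ids together with multi-hot type label vectors."""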
    import json
    from locbert import tokenization

    batch_size = 4
    data_file = '/data/hldai/data/ultrafine/uf_data/crowd/test.json'
    type_vocab_file = '/data/hldai/data/ultrafine/uf_data/ontology/types.txt'
    reader_module_path = '/data/hldai/data/realm_data/cc_news_pretrained/bert'
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)

    types, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    n_types = len(types)

    # texts = ['He is a teacher.',
    #          'He teaches his students.',
    #          'He is a lawyer.']
    texts = list()

    all_labels = list()
    with open(data_file, encoding='utf-8') as f:
        for i, line in enumerate(f):
            x = json.loads(line)
            text = '{} {} {}'.format(
                ' '.join(x['left_context_token']), x['mention_span'], ' '.join(x['right_context_token']))
            # print(text)
            texts.append(text)
            labels = x['y_str']
            tids = [type_id_dict.get(t, -1) for t in labels]
            tids = [tid for tid in tids if tid > -1]
            # if i > 5:
            all_labels.append(tids)
            if len(texts) >= 8:
                break
    print(len(texts), 'texts')

    def tok_id_seq_gen():
        tok_id_seqs = list()
        y_vecs = list()
        for i, text in enumerate(texts):
            tokens = tokenizer.tokenize(text)
            # print(tokens)
            tokens_full = ['[CLS]'] + tokens + ['[SEP]']
            tok_id_seq = tokenizer.convert_tokens_to_ids(tokens_full)
            # tok_id_seq = np.array([len(text)], np.float32)
            tok_id_seqs.append(tok_id_seq)
            y_vecs.append(to_one_hot(all_labels[i], n_types))
            if len(tok_id_seqs) >= batch_size:
                # tok_id_seq_batch, input_mask = get_padded_bert_input(tok_id_seqs)
                tok_id_seq_batch = tf.ragged.constant(tok_id_seqs)
                # y_vecs_tensor = tf.concat(y_vecs)
                yield {'tok_id_seq_batch': tok_id_seq_batch,
                       # 'input_mask': input_mask,
                       'vals': np.random.uniform(-1, 1, (3, 5))}, y_vecs
                tok_id_seqs = list()
                y_vecs = list()
        if len(tok_id_seqs) > 0:
            # tok_id_seq_batch, input_mask = get_padded_bert_input(tok_id_seqs)
            # y_vecs_tensor = tf.concat(y_vecs)
            tok_id_seq_batch = tf.ragged.constant(tok_id_seqs)
            yield {'tok_id_seq_batch': tok_id_seq_batch,
                   # 'input_mask': input_mask,
                   'vals': np.random.uniform(-1, 1, (3, 5))}, y_vecs

    # for v in iter(tok_id_seq_gen()):
    #     print(v)
    dataset = tf.data.Dataset.from_generator(
        tok_id_seq_gen,
        output_signature=(
            {
                'tok_id_seq_batch': tf.RaggedTensorSpec(dtype=tf.int32, ragged_rank=1),
                # 'tok_id_seq_batch': tf.TensorSpec(shape=None, dtype=tf.int32),
                # 'input_mask': tf.TensorSpec(shape=None, dtype=tf.int32),
                'vals': tf.TensorSpec(shape=None, dtype=tf.float32)
            },
            tf.TensorSpec(shape=None, dtype=tf.float32)))

    return dataset
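
to_one_hot is not shown in this snippet; a minimal sketch of the assumed behavior (a multi-hot float32 vector over the type vocabulary):

import numpy as np

def to_one_hot(label_ids, n_types):
    # Multi-hot encoding: 1.0 at each gold type id, 0.0 elsewhere.
    vec = np.zeros(n_types, dtype=np.float32)
    for tid in label_ids:
        vec[tid] = 1.0
    return vec
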
Example #5
def train_fet(block_records_path, block_emb_file, block_labels_file, model_dir,
              mode, log_file_name):
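    """Train or run prediction for the REALM-based entity typing model with
    tf.estimator: load the BERT tokenizer and type vocabulary, build the
    estimator params, pre-load the block embeddings/labels, and then either
    train_and_evaluate or predict depending on mode."""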
    __setup_logging(log_file_name, mode == 'train')
    logging.info(block_records_path)
    logging.info(block_emb_file)
    logging.info(model_dir)
    logging.info(mode)
    # logfile = os.path.join(output_dir, 'log/realm_et.log')
    # logger = tf.get_logger()
    # # logger.setLevel('ERROR')
    # logger.setLevel('INFO')
    # logger.addHandler(logging.FileHandler(logfile, mode='a'))
    # logger.propagate = False

    # run_train()

    batch_size = 1
    retriever_beam_size = 5
    num_train_steps = 100000
    n_train_repeat = 100
    save_checkpoints_steps = 1000
    log_step_count_steps = 100
    tf_random_seed = 1355
    embedder_module_path = os.path.join(
        data_dir, 'realm_data/cc_news_pretrained/embedder')
    reader_module_path = os.path.join(data_dir,
                                      'realm_data/cc_news_pretrained/bert')
    vocab_file = os.path.join(reader_module_path, 'assets/vocab.txt')
    # model_dir = os.path.join(config.OUTPUT_DIR, 'tmp/tmpmodels')
    # model_dir = os.path.join(output_dir, 'etdmodels')
    type_vocab_file = os.path.join(config.DATA_DIR,
                                   'ultrafine/uf_data/ontology/types.txt')

    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)
    sep_tok_id = tokenizer.convert_tokens_to_ids(['[SEP]'])[0]
    print('sep token id', sep_tok_id)
    types, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    n_types = len(types)

    params = {
        'lr': 1e-5,
        'batch_size': batch_size,
        'max_seq_len': 256,
        'bert_dim': 768,
        'retriever_beam_size': retriever_beam_size,
        'n_types': n_types,
        'sep_tok_id': sep_tok_id,
        'embedder_module_path': embedder_module_path,
        'reader_module_path': reader_module_path,
        'num_train_steps': num_train_steps,
        'train_log_steps': 100,
        'eval_log_steps': 500,
        'num_block_records': 2000000,
        'block_records_path': block_records_path,
    }

    assert batch_size == 1
    init_pre_load_data(block_emb_file, block_labels_file, type_id_dict)
    # print(pre_load_data['np_db'].shape)
    params['num_block_records'] = pre_load_data['np_db'].shape[0]
    # exit()
    input_data = InputData(batch_size, tokenizer, types, type_id_dict,
                           retriever_beam_size, n_train_repeat)

    model_fn_use = model_fn if block_labels_file is None else model_fn_zlabels

    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        log_step_count_steps=log_step_count_steps,
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=None,
        tf_random_seed=tf_random_seed)
    estimator = tf.estimator.Estimator(config=run_config,
                                       model_fn=model_fn_use,
                                       params=params,
                                       model_dir=model_dir)
    # estimator.train(input_fn)
    # estimator.evaluate(input_fn)

    train_spec = tf.estimator.TrainSpec(input_fn=input_data.input_fn_train,
                                        max_steps=num_train_steps)
    eval_spec = tf.estimator.EvalSpec(
        name="default",
        input_fn=input_data.input_fn_test,
        # exporters=exporters,
        # start_delay_secs=FLAGS.eval_start_delay_secs,
        # throttle_secs=FLAGS.eval_throttle_secs
    )

    # estimator.evaluate(input_data.input_fn_test)
    if mode == 'train':
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    elif mode == 'predict':
        predict_results(estimator, input_data.input_fn_test)
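
A hypothetical invocation; every path below is a placeholder except the records/labels pair, which reuses the output file names from blocks_from_webisa above:

train_fet(
    block_records_path=os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter.tfr'),
    block_emb_file=os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter_emb.pkl'),  # placeholder
    block_labels_file=os.path.join(config.DATA_DIR, 'ultrafine/zoutput/webisa_full_uffilter_labels.txt'),
    model_dir=os.path.join(config.OUTPUT_DIR, 'etdmodels'),  # placeholder
    mode='train',
    log_file_name='realm_et.log')  # placeholder
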
Example #6
def filter_not_noun_sents():
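    """Like check_sents, but additionally require that the matched type word
    is tagged as a noun (using the pre-computed POS tag file) before counting
    a sentence that matches one of the Hearst-style patterns."""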
    output_types_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents-s01-filter-types.txt')
    output_sents_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents-s01-filter.txt')
    type_vocab_file = os.path.join(config.DATA_DIR, 'ultrafine/uf_data/ontology/types.txt')
    type_sents_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents.txt.gz')
    pos_tags_file = os.path.join(config.DATA_DIR, 'ultrafine/res/enwiki-20151002-type-sents-postag-s01.txt')

    type_vocab, type_id_dict = datautils.load_vocab_file(type_vocab_file)
    type_dict = get_type_str_dict(type_vocab)

    ends_with_words = [['a'], ['the'], ['and', 'other'], ['and', 'some', 'other'], ['and', 'any', 'other']]
    starts_with_words = [['such', 'as']]
    sent_id = -1
    sent = None
    keep_cnt = 0
    f = gzip.open(type_sents_file, 'rt', encoding='utf-8')
    f_pos = open(pos_tags_file, encoding='utf-8')
    for i, line in enumerate(f_pos):
        pos_tags = line.strip().split(' ')
        cur_sent_id = int(pos_tags[0])
        pos_tags = pos_tags[1:]
        # print(line.strip())

        while sent_id < cur_sent_id:
            sent = next(f).strip()
            # sent = line.strip()
            sent_id += 1

        words = sent.split(' ')
        n_words = len(words)
        assert n_words == len(pos_tags)
        # print(words)
        # print(pos_tags)
        # print()
        keep = False
        for j in range(n_words):
            cur_word = words[j].lower()
            t = type_dict.get(cur_word, None)
            if t is None:
                continue
            if pos_tags[j] not in {'NN', 'NNP', 'NNPS'}:
                continue

            # if any(ends_with(words, j - 1, e_words) for e_words in ends_with_words):
            #     keep = True
            # if not keep and any(starts_with(words, j + 1, s_words) for s_words in starts_with_words):
            #     keep = True

            for e_words in ends_with_words:
                if ends_with(words, j - 1, e_words):
                    # print(cur_word, '&', sent)
                    keep = True
                    break

            if keep:
                break

            for s_words in starts_with_words:
                if starts_with(words, j + 1, s_words):
                    # print(cur_word, '&', sent)
                    keep = True
                    break

            if keep:
                # print(t, '&', sent)
                break

        #     # if ends_with(words, j - 1, ['a']) or ends_with(words, j - 1, ['the']) or ends_with(
        #     #         words, j - 1, ['and', 'other']) or ends_with(words, j - 1, ['and', 'some', 'other']):
        #     #     keep = True
        #     # if starts_with(words, j + 1, ['such', 'as']):
        #     #     keep = True
        #         # print(cur_word, '&', sent)
        #         # exit()

        if keep:
            keep_cnt += 1

        if i % 10000 == 0:
            print(i, keep_cnt)

        # if i > 1000:
        #     break
        # if i > 10:
        #     break
    f.close()
    f_pos.close()