Example #1
def bulid_dataset(args, reader, vocab, debug=False):
    char2idx, char_emb = None, None
    train_src = args.input + "/train_data.json"
    dev_src = args.input + "/dev_data.json"

    train_examples_file = args.cache_data + "/train-examples.pkl"
    dev_examples_file = args.cache_data + "/dev-examples.pkl"

    char_emb_file = args.cache_data + "/char_emb.pkl"
    char_dictionary = args.cache_data + "/char_dict.pkl"

    if not os.path.exists(train_examples_file):

        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')

        if not args.use_bert:
            # todo : min_word_count=3 ?
            vocab.build_vocab_only_with_char(train_examples, min_char_count=1)
            if args.use_word2vec and args.embedding_file:
                char_emb = vocab.make_embedding(
                    vocab=vocab.char_vocab,
                    embedding_file=args.embedding_file,
                    emb_size=args.word_emb_size)
                save(char_emb_file, char_emb, message="char embedding")
            save(char_dictionary, vocab.char2idx, message="char dictionary")
            char2idx = vocab.char2idx
        save(train_examples_file, train_examples, message="train examples")
        save(dev_examples_file, dev_examples, message="dev examples")
    else:
        if not args.use_bert:
            if args.use_word2vec and args.embedding_file:
                char_emb = load(char_emb_file)
            char2idx = load(char_dictionary)
            logging.info("total char vocabulary size is {} ".format(
                len(char2idx)))
        train_examples, dev_examples = load(train_examples_file), load(
            dev_examples_file)

        logging.info('train examples size is {}'.format(len(train_examples)))
        logging.info('dev examples size is {}'.format(len(dev_examples)))

    if not args.use_bert:
        args.vocab_size = len(char2idx)
    convert_examples_features = Feature(args, token2idx_dict=char2idx)

    train_examples = train_examples[:2] if debug else train_examples
    dev_examples = dev_examples[:2] if debug else dev_examples

    train_data_set = convert_examples_features(train_examples,
                                               data_type='train')
    dev_data_set = convert_examples_features(dev_examples, data_type='dev')
    train_data_loader = train_data_set.get_dataloader(
        args.train_batch_size, shuffle=True, pin_memory=args.pin_memory)
    dev_data_loader = dev_data_set.get_dataloader(args.train_batch_size)

    data_loaders = train_data_loader, dev_data_loader
    eval_examples = train_examples, dev_examples

    return eval_examples, data_loaders, char_emb
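A minimal caller sketch for the tuple this version returns; the args/reader/vocab setup is assumed to exist elsewhere and is not part of the original example.

# Hypothetical caller: unpack the (examples, loaders, embedding) tuple.
eval_examples, data_loaders, char_emb = bulid_dataset(args, reader, vocab, debug=False)
train_examples, dev_examples = eval_examples
train_data_loader, dev_data_loader = data_loaders
# char_emb stays None when args.use_bert is set or no pretrained embedding file is configured.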
Example #2
def bulid_dataset(args, spo_config, reader, tokenizer, debug=False):
    train_src = args.input + "/train_data.json"
    dev_src = args.input + "/test2_data.json"


    train_examples_file = args.cache_data + "/train-examples.pkl"
    dev_examples_file = args.cache_data + "/dev-examples.pkl"

    if not os.path.exists(train_examples_file):
        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')
        save(train_examples_file, train_examples, message="train examples")
        save(dev_examples_file, dev_examples, message="dev examples")
    else:
        logging.info('loading train cache_data {}'.format(train_examples_file))
        logging.info('loading dev cache_data {}'.format(dev_examples_file))
        train_examples, dev_examples = load(train_examples_file), load(dev_examples_file)

        logging.info('train examples size is {}'.format(len(train_examples)))
        logging.info('dev examples size is {}'.format(len(dev_examples)))

    convert_examples_features = Feature(max_len=args.max_len, spo_config=spo_config, tokenizer=tokenizer)

    train_examples = train_examples[:2] if debug else train_examples
    dev_examples = dev_examples[:2] if debug else dev_examples

    train_data_set = convert_examples_features(train_examples, data_type='train')
    dev_data_set = convert_examples_features(dev_examples, data_type='dev')
    train_data_loader = train_data_set.get_dataloader(args.train_batch_size, shuffle=True, pin_memory=args.pin_memory)
    dev_data_loader = dev_data_set.get_dataloader(args.train_batch_size)

    data_loaders = train_data_loader, dev_data_loader
    eval_examples = train_examples, dev_examples

    return eval_examples, data_loaders, tokenizer
Example #3
def update_entity_details(folder_name, file_regex, output_path):
    file_names = file_util.get_file_name_in_dir_regex(folder_name, file_regex)
    link_data = {}
    parent_of_leaf = []
    all_entities_from_mention = {}
    for file_name in file_names:
        print("file_name", file_name)
        entity_dict = file_util.load(file_name)
        # print(entity_dict)
        for entity_id in entity_dict:
            all_entities_from_mention[entity_id] = entity_dict[entity_id]
            linkto_infos = entity_dict[entity_id]["parents"]
            for linkto_info in linkto_infos:
                source_id = linkto_info['id']
                dest_id = linkto_info['link_to']
                if source_id == entity_id:
                    parent_of_leaf.append(dest_id)
                else:
                    parent_of_leaf.append(source_id)
                    parent_of_leaf.append(dest_id)
                link_data[source_id] = link_data.get(source_id, [])
                link_data[dest_id] = link_data.get(dest_id, [])
                if dest_id not in link_data[source_id] and dest_id != '':
                    link_data[source_id].append(dest_id)
    file_util.dump(link_data,
                   output_path + ".pck")  # "iteration3_data_dumped.pck"
    file_util.dump(parent_of_leaf, output_path + "_parent_leaf.pck")
    file_util.dump_json(link_data, output_path + ".json")
    des_short_name_dict = update_entity_description_shortname(
        link_data, all_entities_from_mention)
    file_util.dump_json(des_short_name_dict, output_path + "_brief.json")
    wiki_graph_util.convert_to_tree(link_data, des_short_name_dict)
    file_util.dump_json(all_entities_from_mention,
                        output_path + "_patent_entity_relations.json")
    excel_tree_level_export.demo(file_util.load_json("all_entity_level.json"))
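A hedged invocation sketch for the function above; the folder name, regex, and output prefix are illustrative placeholders, not values from the original project.

# Hypothetical call: merge per-file entity dicts and write the .pck/.json artifacts listed above.
update_entity_details(
    folder_name="crawl_output",        # directory holding pickled entity dicts (assumption)
    file_regex=r"entities_.*\.pck",    # pattern passed to file_util.get_file_name_in_dir_regex (assumption)
    output_path="merged/iteration3")   # prefix for the dumped .pck / .json files (assumption)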
Example #4
def bulid_dataset(args, debug=False):
    train_src = args.input + "/train.txt"
    dev_src = args.input + "/dev.txt"
    cache_data_file = args.cache_data + "/cache_data.pkl"
    cache_data = {}
    if not os.path.exists(cache_data_file):

        reader = Reader(bi_char=args.bi_char)
        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')

        char_vocab = Vocabulary(min_char_count=1)
        char_vocab.build_vocab(train_examples+dev_examples)
        char_emb, bichar_emb, bichar_vocab = None, None, None
        if args.use_static_emb:
            char_emb = StaticEmbedding(char_vocab, model_path='cpt/gigaword/uni.ite50.vec',
                                       only_norm_found_vector=True).emb_vectors
            if args.bi_char:
                bichar_vocab = Vocabulary(char_type='bichar', min_char_count=1)
                bichar_vocab.build_vocab(train_examples+dev_examples)
                bichar_emb = StaticEmbedding(bichar_vocab, model_path='cpt/gigaword/bi.ite50.vec',
                                             only_norm_found_vector=True).emb_vectors

        cache_data['train_data'] = train_examples
        cache_data['dev_data'] = dev_examples
        cache_data['char_emb'] = char_emb
        cache_data['bichar_emb'] = bichar_emb
        cache_data['char_vocab'] = char_vocab.word2idx
        cache_data['bichar_vocab'] = bichar_vocab.word2idx if bichar_vocab is not None else []
        cache_data['entity_type'] = reader.ent_type

        with open(cache_data_file, 'wb') as f:
            pickle.dump(cache_data, f)
    else:
        logging.info('loading file {}'.format(cache_data_file))
        cache_data = load(cache_data_file)
    logging.info('train examples size is {}'.format(len(cache_data['train_data'])))
    logging.info('dev examples size is {}'.format(len(cache_data['dev_data'])))
    logging.info("total char vocabulary size is {} ".format(len(cache_data['char_vocab'])))
    logging.info("total bichar vocabulary size is {} ".format(len(cache_data['bichar_vocab'])))
    logging.info("entity type dict is {} ".format(cache_data['entity_type']))

    convert_examples_features = Feature(args, char_vocab=cache_data['char_vocab'],
                                        bichar_vocab=cache_data['bichar_vocab'], entity_type=cache_data['entity_type'])

    train_examples = cache_data['train_data'][:20] if debug else cache_data['train_data']
    dev_examples = cache_data['dev_data'][:20] if debug else cache_data['dev_data']

    train_data_set = convert_examples_features(train_examples, entity_type=args.entity_type, data_type='train')
    dev_data_set = convert_examples_features(dev_examples, entity_type=args.entity_type, data_type='dev')
    train_data_loader = train_data_set.get_dataloader(args.train_batch_size, shuffle=True, pin_memory=args.pin_memory)
    dev_data_loader = dev_data_set.get_dataloader(args.dev_batch_size)

    data_loaders = train_data_loader, dev_data_loader
    eval_examples = train_examples, dev_examples
    model_conf = {'char_vocab': cache_data['char_vocab'], 'bichar_vocab': cache_data['bichar_vocab'],
                  'char_emb': cache_data['char_emb'], 'bichar_emb': cache_data['bichar_emb'],
                  'entity_type': cache_data['entity_type']}

    return eval_examples, data_loaders, model_conf
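Example #5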
def merge_crawled_data(folder_name, file_type, output_path):
    # "id": parent_id, "label": parent_labels[ii], "link_to": parent_link_tos[ii]
    file_names = file_util.get_file_name_in_dir_regex(folder_name, file_type)
    data_dumped = {}
    for file_name in file_names:
        # print("file_name", file_name)
        entity_dict = file_util.load(file_name)
        print(entity_dict)
        for entity_id in entity_dict:
            linkto_infos = entity_dict[entity_id]["parents"]
            for linkto_info in linkto_infos:
                source_id = str(linkto_info['id']).split('/')[-1]
                dest_id = linkto_info['link_to']
                data_dumped[source_id] = data_dumped.get(source_id, [])
                data_dumped[dest_id] = data_dumped.get(dest_id, [])
                if dest_id not in data_dumped[source_id] and dest_id != '':
                    data_dumped[source_id].append(dest_id)
    file_util.dump(data_dumped,
                   output_path + '.pck')  # "iteration3_data_dumped.pck"
    with open(output_path + '.json', 'w') as outfile:
        json.dump(data_dumped, outfile)
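For reference, a sketch of the adjacency-list shape this merge dumps; the entity ids are invented for illustration.

# Shape of data_dumped / the JSON written to output_path + '.json' (hypothetical ids):
# {
#     "Q123": ["Q456", "Q789"],   # source entity -> the ids it links to
#     "Q456": []                  # entity seen only as a link target
# }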
Example #6
        # res_list.append({'text': raw_text[index], 'rel': rel_})
        res_list.append({
            '方案': '+'.join(list(set(ent_candidate))),  # '方案' = treatment regimen (unique candidates joined with '+')
            '药物名称': ent,  # '药物名称' = drug name
            '属性': list(set(rel_))  # '属性' = attributes (deduplicated relations)
        })
    return res_list


if __name__ == '__main__':
    args = get_args()
    file_input = False
    if args.train_mode == 'predict':

        char2idx = load(args.cache_data + "/char_dict.pkl")
        args.char_vocab_size = len(char2idx)
        model = load_model(args)
        if file_input:
            file_path = 'data/DrugSPOData/cgywzl.json'
            data_json = read_json(file_path)
            for data_ in tqdm(data_json):
                for data_ans in data_['ans']:
                    if len(data_ans[0]) <= 5:
                        # Text too short to predict: emit an empty regimen / drug name / attribute record.
                        res = [{"方案": "", "药物名称": [], "属性": []}]
                    else:
                        res = model_predict(args,
                                            model,
                                            char2idx,
                                            raw_text=[data_ans[0]])
                    data_ans[0] = {