def bulid_dataset(args, reader, vocab, debug=False):
    """Build train/dev examples, char vocabulary/embeddings and dataloaders,
    caching intermediate results under args.cache_data."""
    char2idx, char_emb = None, None
    train_src = args.input + "/train_data.json"
    dev_src = args.input + "/dev_data.json"
    train_examples_file = args.cache_data + "/train-examples.pkl"
    dev_examples_file = args.cache_data + "/dev-examples.pkl"
    char_emb_file = args.cache_data + "/char_emb.pkl"
    char_dictionary = args.cache_data + "/char_dict.pkl"
    if not os.path.exists(train_examples_file):
        # first run: read the raw data and build the caches
        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')
        if not args.use_bert:
            # todo: min_word_count=3 ?
            vocab.build_vocab_only_with_char(train_examples, min_char_count=1)
            if args.use_word2vec and args.embedding_file:
                char_emb = vocab.make_embedding(vocab=vocab.char_vocab,
                                                embedding_file=args.embedding_file,
                                                emb_size=args.word_emb_size)
                save(char_emb_file, char_emb, message="char embedding")
            save(char_dictionary, vocab.char2idx, message="char dictionary")
            char2idx = vocab.char2idx
        save(train_examples_file, train_examples, message="train examples")
        save(dev_examples_file, dev_examples, message="dev examples")
    else:
        # cached run: restore examples (and vocabulary/embeddings for non-BERT models)
        if not args.use_bert:
            if args.use_word2vec and args.embedding_file:
                char_emb = load(char_emb_file)
            char2idx = load(char_dictionary)
            logging.info("total char vocabulary size is {} ".format(len(char2idx)))
        train_examples, dev_examples = load(train_examples_file), load(dev_examples_file)
    logging.info('train examples size is {}'.format(len(train_examples)))
    logging.info('dev examples size is {}'.format(len(dev_examples)))
    if not args.use_bert:
        args.vocab_size = len(char2idx)
    convert_examples_features = Feature(args, token2idx_dict=char2idx)
    train_examples = train_examples[:2] if debug else train_examples
    dev_examples = dev_examples[:2] if debug else dev_examples
    train_data_set = convert_examples_features(train_examples, data_type='train')
    dev_data_set = convert_examples_features(dev_examples, data_type='dev')
    train_data_loader = train_data_set.get_dataloader(args.train_batch_size,
                                                      shuffle=True,
                                                      pin_memory=args.pin_memory)
    # dev loader reuses the train batch size
    dev_data_loader = dev_data_set.get_dataloader(args.train_batch_size)
    data_loaders = train_data_loader, dev_data_loader
    eval_examples = train_examples, dev_examples
    return eval_examples, data_loaders, char_emb
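# The save/load helpers used above are not shown in this excerpt. A minimal
# sketch of what they might look like, assuming they are thin pickle wrappers
# and that the `message` argument is only used for logging (the real helpers
# in the repository may differ):
import logging
import pickle


def save(path, obj, message=None):
    if message is not None:
        logging.info("saving {} to {}".format(message, path))
    with open(path, 'wb') as fw:
        pickle.dump(obj, fw)


def load(path):
    with open(path, 'rb') as fr:
        return pickle.load(fr)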
def bulid_dataset(args, spo_config, reader, tokenizer, debug=False):
    """SPO-extraction variant: build (and cache) examples, then create
    dataloaders from features produced with the given tokenizer and spo_config."""
    train_src = args.input + "/train_data.json"
    # note: the dev split is read from test2_data.json
    dev_src = args.input + "/test2_data.json"
    train_examples_file = args.cache_data + "/train-examples.pkl"
    dev_examples_file = args.cache_data + "/dev-examples.pkl"
    if not os.path.exists(train_examples_file):
        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')
        save(train_examples_file, train_examples, message="train examples")
        save(dev_examples_file, dev_examples, message="dev examples")
    else:
        logging.info('loading train cache_data {}'.format(train_examples_file))
        logging.info('loading dev cache_data {}'.format(dev_examples_file))
        train_examples, dev_examples = load(train_examples_file), load(dev_examples_file)
    logging.info('train examples size is {}'.format(len(train_examples)))
    logging.info('dev examples size is {}'.format(len(dev_examples)))
    convert_examples_features = Feature(max_len=args.max_len,
                                        spo_config=spo_config,
                                        tokenizer=tokenizer)
    train_examples = train_examples[:2] if debug else train_examples
    dev_examples = dev_examples[:2] if debug else dev_examples
    train_data_set = convert_examples_features(train_examples, data_type='train')
    dev_data_set = convert_examples_features(dev_examples, data_type='dev')
    train_data_loader = train_data_set.get_dataloader(args.train_batch_size,
                                                      shuffle=True,
                                                      pin_memory=args.pin_memory)
    # dev loader reuses the train batch size
    dev_data_loader = dev_data_set.get_dataloader(args.train_batch_size)
    data_loaders = train_data_loader, dev_data_loader
    eval_examples = train_examples, dev_examples
    return eval_examples, data_loaders, tokenizer
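# Hypothetical usage sketch, not part of the original code: how the values
# returned by the SPO variant of bulid_dataset above might be unpacked by a
# training entry point. The function name and its parameters are assumptions;
# args, spo_config, reader and tokenizer are constructed elsewhere.
def run_training_sketch(args, spo_config, reader, tokenizer):
    eval_examples, data_loaders, tokenizer = bulid_dataset(args, spo_config, reader, tokenizer)
    train_data_loader, dev_data_loader = data_loaders
    train_eval_examples, dev_eval_examples = eval_examples
    for batch in train_data_loader:
        pass  # model forward/backward over one batch would go here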
def update_entity_details(folder_name, file_regex, output_path):
    """Merge per-file entity dictionaries into a link graph, collect parent ids,
    and export several views (pickle, JSON, brief descriptions, tree, Excel)."""
    file_names = file_util.get_file_name_in_dir_regex(folder_name, file_regex)
    link_data = {}
    parent_of_leaf = []
    all_entities_from_mention = {}
    for file_name in file_names:
        print("file_name", file_name)
        entity_dict = file_util.load(file_name)
        # print(entity_dict)
        for entity_id in entity_dict:
            all_entities_from_mention[entity_id] = entity_dict[entity_id]
            linkto_infos = entity_dict[entity_id]["parents"]
            for linkto_info in linkto_infos:
                source_id = linkto_info['id']
                dest_id = linkto_info['link_to']
                if source_id == entity_id:
                    parent_of_leaf.append(dest_id)
                else:
                    parent_of_leaf.append(source_id)
                    parent_of_leaf.append(dest_id)
                # record the edge source_id -> dest_id once
                link_data[source_id] = link_data.get(source_id, [])
                link_data[dest_id] = link_data.get(dest_id, [])
                if dest_id not in link_data[source_id] and dest_id != '':
                    link_data[source_id].append(dest_id)
    file_util.dump(link_data, output_path + ".pck")  # e.g. "iteration3_data_dumped.pck"
    file_util.dump(parent_of_leaf, output_path + "_parent_leaf.pck")
    file_util.dump_json(link_data, output_path + ".json")
    des_short_name_dict = update_entity_description_shortname(link_data, all_entities_from_mention)
    file_util.dump_json(des_short_name_dict, output_path + "_brief.json")
    wiki_graph_util.convert_to_tree(link_data, des_short_name_dict)
    file_util.dump_json(all_entities_from_mention, output_path + "_patent_entity_relations.json")
    excel_tree_level_export.demo(file_util.load_json("all_entity_level.json"))
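# file_util is project-internal and not included in this excerpt. A minimal
# sketch of a hypothetical file_util module with the helpers used above,
# inferred only from the call sites (the actual implementation may differ,
# e.g. the pattern argument could be a real regex rather than a glob):
import glob
import json
import os
import pickle


def get_file_name_in_dir_regex(folder_name, file_pattern):
    # e.g. get_file_name_in_dir_regex("crawled_entities", "*.pck")
    return glob.glob(os.path.join(folder_name, file_pattern))


def load(path):
    with open(path, 'rb') as fr:
        return pickle.load(fr)


def dump(obj, path):
    with open(path, 'wb') as fw:
        pickle.dump(obj, fw)


def load_json(path):
    with open(path) as fr:
        return json.load(fr)


def dump_json(obj, path):
    with open(path, 'w') as fw:
        json.dump(obj, fw, ensure_ascii=False, indent=2)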
def bulid_dataset(args, debug=False):
    """NER variant: build (and cache) examples, char/bichar vocabularies and
    static embeddings, then return dataloaders plus a model configuration dict."""
    train_src = args.input + "/train.txt"
    dev_src = args.input + "/dev.txt"
    cache_data_file = args.cache_data + "/cache_data.pkl"
    cache_data = {}
    if not os.path.exists(cache_data_file):
        reader = Reader(bi_char=args.bi_char)
        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')
        char_vocab = Vocabulary(min_char_count=1)
        char_vocab.build_vocab(train_examples + dev_examples)
        char_emb, bichar_emb, bichar_vocab = None, None, None
        if args.use_static_emb:
            char_emb = StaticEmbedding(char_vocab,
                                       model_path='cpt/gigaword/uni.ite50.vec',
                                       only_norm_found_vector=True).emb_vectors
            if args.bi_char:
                bichar_vocab = Vocabulary(char_type='bichar', min_char_count=1)
                bichar_vocab.build_vocab(train_examples + dev_examples)
                bichar_emb = StaticEmbedding(bichar_vocab,
                                             model_path='cpt/gigaword/bi.ite50.vec',
                                             only_norm_found_vector=True).emb_vectors
        cache_data['train_data'] = train_examples
        cache_data['dev_data'] = dev_examples
        cache_data['char_emb'] = char_emb
        cache_data['bichar_emb'] = bichar_emb
        cache_data['char_vocab'] = char_vocab.word2idx
        cache_data['bichar_vocab'] = bichar_vocab.word2idx if bichar_vocab is not None else []
        cache_data['entity_type'] = reader.ent_type
        with open(cache_data_file, 'wb') as fw:
            pickle.dump(cache_data, fw)
    else:
        logging.info('loading file {}'.format(cache_data_file))
        cache_data = load(cache_data_file)
    logging.info('train examples size is {}'.format(len(cache_data['train_data'])))
    logging.info('dev examples size is {}'.format(len(cache_data['dev_data'])))
    logging.info("total char vocabulary size is {} ".format(len(cache_data['char_vocab'])))
    logging.info("total bichar vocabulary size is {} ".format(len(cache_data['bichar_vocab'])))
    logging.info("entity type dict is {} ".format(cache_data['entity_type']))
    convert_examples_features = Feature(args,
                                        char_vocab=cache_data['char_vocab'],
                                        bichar_vocab=cache_data['bichar_vocab'],
                                        entity_type=cache_data['entity_type'])
    train_examples = cache_data['train_data'][:20] if debug else cache_data['train_data']
    dev_examples = cache_data['dev_data'][:20] if debug else cache_data['dev_data']
    train_data_set = convert_examples_features(train_examples, entity_type=args.entity_type, data_type='train')
    dev_data_set = convert_examples_features(dev_examples, entity_type=args.entity_type, data_type='dev')
    train_data_loader = train_data_set.get_dataloader(args.train_batch_size,
                                                      shuffle=True,
                                                      pin_memory=args.pin_memory)
    dev_data_loader = dev_data_set.get_dataloader(args.dev_batch_size)
    data_loaders = train_data_loader, dev_data_loader
    eval_examples = train_examples, dev_examples
    model_conf = {'char_vocab': cache_data['char_vocab'],
                  'bichar_vocab': cache_data['bichar_vocab'],
                  'char_emb': cache_data['char_emb'],
                  'bichar_emb': cache_data['bichar_emb'],
                  'entity_type': cache_data['entity_type']}
    return eval_examples, data_loaders, model_conf
def merge_crawled_data(folder_name, file_type, output_path):
    # each parent record: {"id": parent_id, "label": parent_labels[ii], "link_to": parent_link_tos[ii]}
    file_names = file_util.get_file_name_in_dir_regex(folder_name, file_type)
    data_dumped = {}
    for file_name in file_names:
        # print("file_name", file_name)
        entity_dict = file_util.load(file_name)
        print(entity_dict)
        for entity_id in entity_dict:
            linkto_infos = entity_dict[entity_id]["parents"]
            for linkto_info in linkto_infos:
                source_id = str(linkto_info['id']).split('/')[-1]
                dest_id = linkto_info['link_to']
                data_dumped[source_id] = data_dumped.get(source_id, [])
                data_dumped[dest_id] = data_dumped.get(dest_id, [])
                if dest_id not in data_dumped[source_id] and dest_id != '':
                    data_dumped[source_id].append(dest_id)
    file_util.dump(data_dumped, output_path + '.pck')  # e.g. "iteration3_data_dumped.pck"
    with open(output_path + '.json', 'w') as outfile:
        json.dump(data_dumped, outfile)
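# Illustrative shape of the merged link data produced by merge_crawled_data
# above. The ids are hypothetical Wikidata-style ids, not taken from the
# repository; each key is an entity id mapped to the de-duplicated list of
# ids it links to.
example_link_data = {
    "Q42": ["Q5", "Q36180"],
    "Q5": ["Q7239"],
    "Q36180": [],
}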
        # res_list.append({'text': raw_text[index], 'rel': rel_})
        res_list.append({
            '方案': '+'.join(list(set(ent_candidate))),  # 方案: treatment plan/regimen
            '药物名称': ent,                              # 药物名称: drug name
            '属性': list(set(rel_))                       # 属性: attributes/relations
        })
    return res_list


if __name__ == '__main__':
    args = get_args()
    file_input = False
    if args.train_mode == 'predict':
        char2idx = load(args.cache_data + "/char_dict.pkl")
        args.char_vocab_size = len(char2idx)
        model = load_model(args)
        if file_input:
            file_path = 'data/DrugSPOData/cgywzl.json'
            data_json = read_json(file_path)
            for data_ in tqdm(data_json):
                for data_ans in data_['ans']:
                    if len(data_ans[0]) <= 5:
                        # text too short to predict on; emit an empty result
                        res = [{"方案": "", "药物名称": [], "属性": []}]
                    else:
                        res = model_predict(args, model, char2idx, raw_text=[data_ans[0]])
                    data_ans[0] = {