def __init__(self, path, args): """Inits tokenized sentence and positive pair for MTB. Args: path: path to your dataset. args: args from command line. Returns: No returns Raises: If the dataset in `path` is not the same format as described in file 'prepare_data.py', there may raise: - `key nor found` - `integer can't be indexed` and so on. """ self.path = path self.args = args data = json.load(open(os.path.join(path, "mtbdata.json"))) entityMarker = EntityMarker() # Important Configures tot_sentence = len(data) # Converts tokens to ids and meanwhile `BLANK` some entities randomly. self.tokens = np.zeros((tot_sentence, args.max_length), dtype=int) self.mask = np.zeros((tot_sentence, args.max_length), dtype=int) self.h_pos = np.zeros((tot_sentence), dtype=int) self.t_pos = np.zeros((tot_sentence), dtype=int) for i, sentence in enumerate(data): h_flag = random.random() > args.alpha t_flag = random.random() > args.alpha h_p = sentence["h"]["pos"][0] t_p = sentence["t"]["pos"][0] ids, ph, pt = entityMarker.tokenize(sentence["tokens"], [h_p[0], h_p[-1] + 1], [t_p[0], t_p[-1] + 1], None, None, h_flag, t_flag) length = min(len(ids), args.max_length) self.tokens[i][0:length] = ids[0:length] self.mask[i][0:length] = 1 self.h_pos[i] = min(args.max_length - 1, ph) self.t_pos[i] = min(args.max_length - 1, pt) print( "The number of sentence in which tokenizer can't find head/tail entity is %d" % entityMarker.err) entpair2scope = json.load( open(os.path.join(path, "entpair2scope.json"))) entpair2negpair = json.load( open(os.path.join(path, "entpair2negpair.json"))) self.pos_pair = [] # Generates all positive pair. for key in entpair2scope.keys(): self.pos_pair.extend(self.__pos_pair__(entpair2scope[key])) print("Positive pairs' number is %d" % len(self.pos_pair)) # Samples negative pairs dynamically. self.__sample__()
def __init__(self, path, args): """Inits tokenized sentence and positive pair for CP. Args: path: path to your dataset. args: args from command line. Returns: No returns Raises: If the dataset in `path` is not the same format as described in file 'prepare_data.py', there may raise: - `key nor found` - `integer can't be indexed` and so on. """ self.path = path self.args = args data = json.load(open(os.path.join(path, "cpdata_dedup.json"))) rel2scope = json.load(open(os.path.join(path, "rel2scope.json"))) entityMarker = EntityMarker(args) self.tokens = np.zeros((len(data), args.max_length), dtype=int) self.mask = np.zeros((len(data), args.max_length), dtype=int) self.label = np.zeros((len(data)), dtype=int) self.h_pos = np.zeros((len(data)), dtype=int) self.t_pos = np.zeros((len(data)), dtype=int) # Distant supervised label for sentence. # Sentences whose label are the same in a batch # is positive pair, otherwise negative pair. for i, rel in tqdm(enumerate(rel2scope.keys()), "relation types"): scope = rel2scope[rel] for j in range(scope[0], scope[1]): self.label[j] = i for i, sentence in tqdm(enumerate(data), "sentences"): h_flag = random.random() > args.alpha t_flag = random.random() > args.alpha h_p = sentence["h"]["pos"][0] t_p = sentence["t"]["pos"][0] ids, ph, pt = entityMarker.tokenize(sentence["tokens"], [h_p[0], h_p[-1] + 1], [t_p[0], t_p[-1] + 1], None, None, h_flag, t_flag) length = min(len(ids), args.max_length) self.tokens[i][:length] = ids[:length] self.mask[i][:length] = 1 self.h_pos[i] = min(args.max_length - 1, ph) self.t_pos[i] = min(args.max_length - 1, pt) print( "The number of sentence in which tokenizer can't find head/tail entity is %d" % entityMarker.err) # Samples positive pair dynamically. self.__sample__()
def __init__(self, path, mode, args):
    data = []
    with open(os.path.join(path, mode)) as f:
        all_lines = f.readlines()
        for line in all_lines:
            ins = json.loads(line)
            data.append(ins)

    entityMarker = EntityMarker(args)
    tot_instance = len(data)

    # Load rel2id and type2id.
    if os.path.exists(os.path.join(path, "rel2id.json")):
        rel2id = json.load(open(os.path.join(path, "rel2id.json")))
    else:
        raise Exception("Error: There is no `rel2id.json` in " + path + ".")
    if os.path.exists(os.path.join(path, "type2id.json")):
        type2id = json.load(open(os.path.join(path, "type2id.json")))
    else:
        print("Warning: There is no `type2id.json` in " + path +
              ". If you want to train the model with the `OT` or `CT` setting, "
              "please run `utils.py` first to get `type2id.json`.")

    print("Start preprocessing file " + mode)

    # Preprocess data.
    self.input_ids = np.zeros((tot_instance, args.max_length), dtype=int)
    self.mask = np.zeros((tot_instance, args.max_length), dtype=int)
    self.h_pos = np.zeros((tot_instance), dtype=int)
    self.t_pos = np.zeros((tot_instance), dtype=int)
    self.label = np.zeros((tot_instance), dtype=int)

    print_num = 5
    for i, ins in enumerate(data):
        self.label[i] = rel2id[ins["relation"]]
        # Tokenize: context + entity mentions.
        if args.mode == "CM":
            ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                data[i]['h']['pos'],
                                                data[i]['t']['pos'])
        elif args.mode == "OC":
            ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                data[i]['h']['pos'],
                                                data[i]['t']['pos'],
                                                None, None, True, True)
        elif args.mode == "CT":
            h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] + 10)
            t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] + 10)
            ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                data[i]['h']['pos'],
                                                data[i]['t']['pos'],
                                                h_type, t_type)
        elif args.mode == "OM":
            head = entityMarker.tokenizer.tokenize(ins['h']['name'])
            tail = entityMarker.tokenizer.tokenize(ins['t']['name'])
            h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
            ids, ph, pt = entityMarker.tokenize_OMOT(head, tail, h_first)
        elif args.mode == "OT":
            h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] + 10)
            t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] + 10)
            h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
            ids, ph, pt = entityMarker.tokenize_OMOT([h_type], [t_type], h_first)
        else:
            raise Exception("No such mode! Please make sure that `mode` takes a value in {CM, OC, CT, OM, OT}.")

        length = min(len(ids), args.max_length)
        self.input_ids[i][0:length] = ids[0:length]
        self.mask[i][0:length] = 1
        self.h_pos[i] = min(ph, args.max_length - 1)
        self.t_pos[i] = min(pt, args.max_length - 1)

        # Print the first few instances as examples.
        if i < print_num:
            print(f"Printing sample {i + 1}")
            print(f'Sample tokens: {data[i]["token"]}')
            print(f"Sample input_ids: {self.input_ids[i]}")
            print('The input pattern is: ... "[unused0] " + entity1 + " [unused1]" ... "[unused2] " + entity2 + " [unused3]" ...')
            print(f"Sample input_ids converted back to tokens: {entityMarker.tokenizer.convert_ids_to_tokens(self.input_ids[i])}")
            print(f"Sample mask: {self.mask[i]}")
            print(f"Sample head entity: {data[i]['h']['name']}")
            print(f"Position of the head entity as fed to the model; this [unused0] position is used to predict the relation: {self.h_pos[i]}")
            print(f"Sample tail entity: {data[i]['t']['name']}")
            print(f"Position of the tail entity as fed to the model; this [unused2] position is used to predict the relation: {self.t_pos[i]}")
            print(f"Sample label id: {self.label[i]}")
            print(f"Sample label (relation): {data[i]['relation']}")
            print()

    print("The number of sentences in which the tokenizer can't find the head/tail entity is %d" % entityMarker.err)
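# A minimal sketch of how the `CT`/`OT` settings above build entity-type markers:
# each subject/object type in `type2id.json` is shifted by 10 and mapped onto a
# reserved `[unusedN]` token of the BERT vocabulary (the +10 offset presumably
# leaves the lower [unusedN] tokens for the plain entity markers). The `type2id`
# contents below are a toy example.
type2id = {"subj_PERSON": 0, "obj_ORGANIZATION": 1}   # hypothetical contents
h_type = "[unused%d]" % (type2id["subj_PERSON"] + 10)
t_type = "[unused%d]" % (type2id["obj_ORGANIZATION"] + 10)
print(h_type, t_type)  # [unused10] [unused11]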
def __init__(self, path, mode, args):
    data = []
    with open(os.path.join(path, mode)) as f:
        all_lines = f.readlines()
        for line in all_lines:
            ins = json.loads(line)
            data.append(ins)

    entityMarker = EntityMarker(args)
    tot_instance = len(data)

    # Load rel2id and type2id.
    if os.path.exists(os.path.join(path, "rel2id.json")):
        rel2id = json.load(open(os.path.join(path, "rel2id.json")))
    else:
        raise Exception("Error: There is no `rel2id.json` in " + path + ".")
    if os.path.exists(os.path.join(path, "type2id.json")):
        type2id = json.load(open(os.path.join(path, "type2id.json")))
    else:
        print("Warning: There is no `type2id.json` in " + path +
              ". If you want to train the model with the `OT` or `CT` setting, "
              "please run `utils.py` first to get `type2id.json`.")

    print("pre process " + mode)

    # Preprocess data.
    self.input_ids = np.zeros((tot_instance, args.max_length), dtype=int)
    self.mask = np.zeros((tot_instance, args.max_length), dtype=int)
    self.h_pos = np.zeros((tot_instance), dtype=int)
    self.t_pos = np.zeros((tot_instance), dtype=int)
    self.label = np.zeros((tot_instance), dtype=int)

    for i, ins in enumerate(data):
        self.label[i] = rel2id[ins["relation"]]
        # Tokenize according to the chosen input format.
        if args.mode == "CM":
            ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                data[i]['h']['pos'],
                                                data[i]['t']['pos'])
        elif args.mode == "OC":
            ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                data[i]['h']['pos'],
                                                data[i]['t']['pos'],
                                                None, None, True, True)
        elif args.mode == "CT":
            h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] + 10)
            t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] + 10)
            ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                data[i]['h']['pos'],
                                                data[i]['t']['pos'],
                                                h_type, t_type)
        elif args.mode == "OM":
            head = entityMarker.tokenizer.tokenize(ins['h']['name'])
            tail = entityMarker.tokenizer.tokenize(ins['t']['name'])
            h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
            ids, ph, pt = entityMarker.tokenize_OMOT(head, tail, h_first)
        elif args.mode == "OT":
            h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] + 10)
            t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] + 10)
            h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
            ids, ph, pt = entityMarker.tokenize_OMOT([h_type], [t_type], h_first)
        else:
            raise Exception("No such mode! Please make sure that `mode` takes a value in {CM, OC, CT, OM, OT}.")

        length = min(len(ids), args.max_length)
        self.input_ids[i][0:length] = ids[0:length]
        self.mask[i][0:length] = 1
        self.h_pos[i] = min(ph, args.max_length - 1)
        self.t_pos[i] = min(pt, args.max_length - 1)

    print("The number of sentences in which the tokenizer can't find the head/tail entity is %d" % entityMarker.err)
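# A minimal sketch of how the preprocessed arrays above could be served to a
# model, assuming the surrounding class is (or wraps) a torch Dataset. The real
# class may expose a different __getitem__; this only illustrates the tensor
# shapes the preprocessing produces. `ToyREDataset` is a hypothetical stand-in.
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class ToyREDataset(Dataset):
    def __init__(self, input_ids, mask, h_pos, t_pos, label):
        self.input_ids, self.mask = input_ids, mask
        self.h_pos, self.t_pos, self.label = h_pos, t_pos, label

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        return (torch.tensor(self.input_ids[idx]), torch.tensor(self.mask[idx]),
                torch.tensor(self.h_pos[idx]), torch.tensor(self.t_pos[idx]),
                torch.tensor(self.label[idx]))

# Two dummy instances with max_length = 8.
ds = ToyREDataset(np.zeros((2, 8), dtype=int), np.zeros((2, 8), dtype=int),
                  np.zeros(2, dtype=int), np.zeros(2, dtype=int), np.zeros(2, dtype=int))
for input_ids, mask, h_pos, t_pos, label in DataLoader(ds, batch_size=2):
    print(input_ids.shape, mask.shape, label.shape)  # torch.Size([2, 8]) torch.Size([2, 8]) torch.Size([2])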
def __init__(self, path, args): """ 初始化MTB的tokenized sentence和positive pair。 Args: path: path to your dataset. args: args from command line. Returns: No returns Raises: If the dataset in `path` is not the same format as described in file 'prepare_data.py', there may raise: - `key nor found` - `integer can't be indexed` and so on. """ self.path = path self.args = args data = json.load(open(os.path.join(path, "mtbdata.json"))) # 将原始文本转换为BERT输入的ID,并找到实体位置。 entityMarker = EntityMarker() # Important Configures, 句子总数 tot_sentence = len(data) # 将token转换为ID,同时将某些实体随机化为“BLANK”。 # 初始化 tokens, mask , h_pos, t_pos self.tokens = np.zeros((tot_sentence, args.max_length), dtype=int) self.mask = np.zeros((tot_sentence, args.max_length), dtype=int) self.h_pos = np.zeros((tot_sentence), dtype=int) self.t_pos = np.zeros((tot_sentence), dtype=int) #迭代数据 for i, sentence in enumerate(data): # token被替换成BLANK的概率 h_flag = random.random() > args.alpha t_flag = random.random() > args.alpha # 实体1和实体2的位置, eg: h_p: [8, 9, 10, 11] h_p = sentence["h"]["pos"][0] t_p = sentence["t"]["pos"][0] # 将原始文本转换为BERT输入的ID,并找到实体位置。ids是句子tokenizer后的id,ph是 头实体位置(头实体标记的起始位置)。这里是[unused0]的位置, # pt 是尾部实体位置(尾部实体标记的起始位置)。这里是[unused2]的位置 ids, ph, pt = entityMarker.tokenize(sentence["tokens"], [h_p[0], h_p[-1] + 1], [t_p[0], t_p[-1] + 1], None, None, h_flag, t_flag) # 为了计算mask, length = min(len(ids), args.max_length) #把默认为0的token替换成实际的tokenid,其余部分相当于padding了 self.tokens[i][0:length] = ids[0:length] # 只有对应有tokenid的位置时1,其它默认为0了 self.mask[i][0:length] = 1 #表明第一个实体的位置, self.h_pos[i] = min(args.max_length - 1, ph) self.t_pos[i] = min(args.max_length - 1, pt) print("tokenizer找不到头/尾实体的句子数%d" % entityMarker.err) entpair2scope = json.load( open(os.path.join(path, "entpair2scope.json"))) entpair2negpair = json.load( open(os.path.join(path, "entpair2negpair.json"))) self.pos_pair = [] # 生成所有正样本对。 eg: self.pos_pair: [[0, 1], [2, 3], [4, 5]] for key in entpair2scope.keys(): self.pos_pair.extend(self.__pos_pair__(entpair2scope[key])) print("Positive pairs 数量是 %d" % len(self.pos_pair)) # 负样本动态采样 self.__sample__()