def collate(examples):
    p_ids, examples = zip(*examples)
    p_ids = torch.tensor([p_id for p_id in p_ids], dtype=torch.long)
    batch_token_ids, batch_segment_ids = [], []
    batch_token_type_ids, batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [], []
    for example in examples:
        # todo maxlen
        token_ids, segment_ids = self.tokenizer.encode(
            example.context, max_length=self.max_len)
        example.bert_tokens = self.tokenizer.tokenize(example.context)
        example.token_ids = token_ids
        if self.is_train:
            spoes = {}
            for s, p, o in example.gold_answer:
                # encode() returns (token_ids, segment_ids); [1:-1] strips [CLS]/[SEP]
                s = self.tokenizer.encode(s)[0][1:-1]
                p = BAIDU_RELATION[p]
                o = self.tokenizer.encode(o)[0][1:-1]
                s_idx = search(s, token_ids)
                o_idx = search(o, token_ids)
                if s_idx != -1 and o_idx != -1:
                    s = (s_idx, s_idx + len(s) - 1)
                    o = (o_idx, o_idx + len(o) - 1, p)
                    if s not in spoes:
                        spoes[s] = []
                    spoes[s].append(o)
            if spoes:
                # subject labels
                token_type_ids = np.zeros(len(token_ids), dtype=np.int64)
                subject_labels = np.zeros((len(token_ids), 2), dtype=np.float32)
                for s in spoes:
                    subject_labels[s[0], 0] = 1
                    subject_labels[s[1], 1] = 1
                # randomly sample one subject
                start, end = np.array(list(spoes.keys())).T
                start = np.random.choice(start)
                end = np.random.choice(end[end >= start])
                token_type_ids[start:end + 1] = 1
                subject_ids = (start, end)
                # object labels conditioned on the sampled subject
                object_labels = np.zeros(
                    (len(token_ids), len(BAIDU_RELATION), 2), dtype=np.float32)
                for o in spoes.get(subject_ids, []):
                    object_labels[o[0], o[2], 0] = 1
                    object_labels[o[1], o[2], 1] = 1
                batch_token_ids.append(token_ids)
                batch_token_type_ids.append(token_type_ids)
                batch_segment_ids.append(segment_ids)
                batch_subject_labels.append(subject_labels)
                batch_subject_ids.append(subject_ids)
                batch_object_labels.append(object_labels)
        else:
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
    batch_token_ids = sequence_padding(batch_token_ids, is_float=False)
    batch_segment_ids = sequence_padding(batch_segment_ids, is_float=False)
    if not self.is_train:
        return p_ids, batch_token_ids, batch_segment_ids
    else:
        batch_token_type_ids = sequence_padding(batch_token_type_ids, is_float=False)
        batch_subject_ids = torch.tensor(batch_subject_ids)
        batch_subject_labels = sequence_padding(batch_subject_labels,
                                                padding=np.zeros(2),
                                                is_float=True)
        batch_object_labels = sequence_padding(
            batch_object_labels,
            padding=np.zeros((len(BAIDU_RELATION), 2)),
            is_float=True)
        return batch_token_ids, batch_segment_ids, batch_token_type_ids, batch_subject_ids, batch_subject_labels, batch_object_labels
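
# The collate functions in this file rely on a `search` helper that is not
# shown in this section. A minimal sketch of its assumed behaviour (first
# occurrence of a sub-sequence, -1 when absent):
def search(pattern, sequence):
    """Return the start index of the first occurrence of `pattern` (a list
    of token ids or tokens) inside `sequence`, or -1 if it does not occur."""
    n = len(pattern)
    for i in range(len(sequence) - n + 1):
        if sequence[i:i + n] == pattern:
            return i
    return -1
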
def _read(self, filename, data_type):
    examples = []
    with codecs.open(filename, 'r') as f:
        gold_num = 0
        p_id = 0
        for line in tqdm(f):
            p_id += 1
            data_json = json.loads(line.strip())
            text_raw = data_json['text'].lower()
            tokens = []
            for token in text_raw:
                tokens.append(token)
                if len(tokens) >= self.max_seq_length - 2:
                    break
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            sub_po_dict, sub_ent_list, spo_list = dict(), list(), list()
            spoes = {}
            for spo in data_json['spo_list']:
                for spo_object in spo['object'].keys():
                    # assign relation label
                    if spo['predicate'] in self.spo_conf:
                        # simple relation
                        predicate_label = self.spo_conf[spo['predicate']]
                        subject_sub_tokens = list(spo['subject'])
                        object_sub_tokens = list(spo['object']['@value'])
                        # todo: add handling for the spo_v2 format
                        sub_ent_list.append(spo['subject'].lower())
                        spo_list.append(
                            (spo['subject'].lower(), spo['predicate'],
                             spo['object']['@value'].lower()))
                    else:
                        # complex relation
                        predicate_label = self.spo_conf[spo['predicate'] + '_' + spo_object]
                        subject_sub_tokens = list(spo['subject'])
                        object_sub_tokens = list(spo['object'][spo_object])
                    subject_start, object_start = search_spo_index(
                        tokens, subject_sub_tokens, object_sub_tokens)
                    if subject_start == -1:
                        subject_start = search(subject_sub_tokens, tokens)
                    if object_start == -1:
                        object_start = search(object_sub_tokens, tokens)
                    if subject_start != -1 and object_start != -1:
                        s = (subject_start, subject_start + len(subject_sub_tokens) - 1)
                        o = (object_start, object_start + len(object_sub_tokens) - 1,
                             predicate_label)
                        if s not in spoes:
                            spoes[s] = []
                        spoes[s].append(o)
            examples.append(
                Example(p_id=p_id,
                        context=text_raw,
                        bert_tokens=tokens,
                        sub_entity_list=list(set(sub_ent_list)),
                        gold_answer=spo_list,
                        spoes=spoes))
            gold_num += len(set(spo_list))
        print('total gold num is {}'.format(gold_num))
    logging.info("{} total size is {} ".format(data_type, len(examples)))
    return examples
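
# `search_spo_index` is likewise assumed, not shown in this section. Judging
# from the fallback to two independent `search` calls above, it plausibly
# looks for a non-overlapping (subject, object) pair of spans; this is a
# sketch of that assumption, not the repo's definitive implementation:
def search_spo_index(tokens, subject_sub_tokens, object_sub_tokens):
    """Return start indices (subject_start, object_start) of the first
    non-overlapping pair of matches, or (-1, -1) if none exists."""
    s_len, o_len = len(subject_sub_tokens), len(object_sub_tokens)
    for i in range(len(tokens) - s_len + 1):
        if tokens[i:i + s_len] != subject_sub_tokens:
            continue
        for j in range(len(tokens) - o_len + 1):
            if tokens[j:j + o_len] != object_sub_tokens:
                continue
            # accept only spans that do not overlap
            if j + o_len <= i or i + s_len <= j:
                return i, j
    return -1, -1
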
def collate(examples):
    p_ids, examples = zip(*examples)
    p_ids = torch.tensor([p_id for p_id in p_ids], dtype=torch.long)
    batch_char_ids, batch_word_ids = [], []
    batch_token_type_ids, batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [], []
    for example in examples:
        # todo maxlen
        char_ids = [self.char2idx.get(char, 1) for char in example.context]
        word_ids = [
            self.word2idx.get(word, 0) for word in example.text_word
            for _ in word
        ]
        if len(char_ids) != len(word_ids):
            print(example.context)
            print(char_ids)
            print(len(char_ids))
            print(example.text_word)
            print(word_ids)
            print(len(word_ids))
        assert len(char_ids) == len(word_ids)
        char_ids = char_ids[:self.max_len]
        word_ids = word_ids[:self.max_len]
        # example.context = example.context[:self.max_len]
        if self.is_train:
            spoes = {}
            for s, p, o in example.gold_answer:
                s = [self.char2idx.get(s_, 1) for s_ in s]
                p = BAIDU_RELATION[p]
                o = [self.char2idx.get(o_, 1) for o_ in o]
                s_idx = search(s, char_ids)
                o_idx = search(o, char_ids)
                if s_idx != -1 and o_idx != -1:
                    s = (s_idx, s_idx + len(s) - 1)
                    o = (o_idx, o_idx + len(o) - 1, p)
                    if s not in spoes:
                        spoes[s] = []
                    spoes[s].append(o)
            if spoes:
                # subject labels
                token_type_ids = np.zeros(len(char_ids), dtype=np.int64)
                subject_labels = np.zeros((len(char_ids), 2), dtype=np.float32)
                for s in spoes:
                    subject_labels[s[0], 0] = 1
                    subject_labels[s[1], 1] = 1
                # randomly sample one subject
                start, end = np.array(list(spoes.keys())).T
                start = np.random.choice(start)
                end = np.random.choice(end[end >= start])
                token_type_ids[start:end + 1] = 1
                subject_ids = (start, end)
                # object labels conditioned on the sampled subject
                object_labels = np.zeros(
                    (len(char_ids), len(BAIDU_RELATION), 2), dtype=np.float32)
                for o in spoes.get(subject_ids, []):
                    object_labels[o[0], o[2], 0] = 1
                    object_labels[o[1], o[2], 1] = 1
                batch_char_ids.append(char_ids)
                batch_word_ids.append(word_ids)
                batch_token_type_ids.append(token_type_ids)
                batch_subject_labels.append(subject_labels)
                batch_subject_ids.append(subject_ids)
                batch_object_labels.append(object_labels)
        else:
            batch_char_ids.append(char_ids)
            batch_word_ids.append(word_ids)
    batch_char_ids = sequence_padding(batch_char_ids, is_float=False)
    batch_word_ids = sequence_padding(batch_word_ids, is_float=False)
    if not self.is_train:
        return p_ids, batch_char_ids, batch_word_ids
    else:
        batch_token_type_ids = sequence_padding(batch_token_type_ids, is_float=False)
        batch_subject_ids = torch.tensor(batch_subject_ids)
        batch_subject_labels = sequence_padding(batch_subject_labels,
                                                padding=np.zeros(2),
                                                is_float=True)
        batch_object_labels = sequence_padding(
            batch_object_labels,
            padding=np.zeros((len(BAIDU_RELATION), 2)),
            is_float=True)
        return batch_char_ids, batch_word_ids, batch_token_type_ids, batch_subject_ids, batch_subject_labels, batch_object_labels
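
# Every collate above pads ragged batches with `sequence_padding`. A minimal
# sketch, assuming it right-pads to the batch maximum length and stacks the
# result into a torch tensor; `padding` is the per-position fill value
# (scalar or array) and `is_float` selects the dtype:
import numpy as np
import torch

def sequence_padding(seqs, padding=0, is_float=False):
    """Right-pad variable-length sequences and stack them into one tensor."""
    max_len = max(len(seq) for seq in seqs)
    padded = []
    for seq in seqs:
        seq = np.array(seq)
        if len(seq) < max_len:
            pad = np.stack([np.array(padding)] * (max_len - len(seq)))
            seq = np.concatenate([seq, pad], axis=0)
        padded.append(seq)
    batch = np.stack(padded)
    return torch.tensor(batch, dtype=torch.float32 if is_float else torch.long)
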
def _read(self, filename, data_type):
    '''Build the training set, handling complex relations.

    For a complex sample such as
        [{subject: '张智尧', 'predicate': '饰演', 'object': {'@value': '楚留香', 'inWork': '楚留香传奇'}},
         {subject: '楚留香传奇', 'predicate': '主演', 'object': {'@value': '张智尧'}}]
    we rewrite it as
        {('楚留香传奇'): ('张智尧', '主演'),
         ('楚留香'): ('楚留香传奇', '饰演_inWork'),
         ('张智尧'): ('楚留香', '饰演_@value')}
    and then insert the resulting Example.
    '''
    examples = []
    with open(filename, 'r') as fr:
        p_id = 0
        for line in tqdm(fr.readlines()):
            p_id += 1
            src_data = json.loads(line)
            text_raw = src_data['text']
            text_raw = text_raw.replace('®', '')
            text_raw = text_raw.replace('◆', '')
            tokens, tok_to_orig_start_index, tok_to_orig_end_index = covert_to_tokens(
                text_raw,
                self.tokenizer,
                self.max_seq_length,
                return_orig_index=True)
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            sub_po_dict, sub_ent_list, spo_list = dict(), list(), list()
            if 'spo_list' not in src_data:
                # unlabeled (test) data: keep the example without gold spans
                examples.append(
                    Example(p_id=p_id,
                            raw_text=src_data['text'],
                            context=text_raw,
                            tok_to_orig_start_index=tok_to_orig_start_index,
                            tok_to_orig_end_index=tok_to_orig_end_index,
                            bert_tokens=tokens,
                            sub_entity_list=None,
                            gold_answer=None,
                            spoes=None))
            else:
                spoes = {}
                for spo in src_data['spo_list']:
                    spo_dict = dict()
                    for spo_object in spo['object'].keys():
                        if spo['predicate'] in self.spo_conf:
                            label = spo['predicate']
                        else:
                            label = spo['predicate'] + '_' + spo_object
                        spo_dict[self.spo_conf[label]] = spo['object'][spo_object]
                    for spo_object in spo['object'].keys():
                        # assign relation label
                        if spo['predicate'] in self.spo_conf:
                            # simple relation
                            predicate_label = self.spo_conf[spo['predicate']]
                            subject_sub_tokens = covert_to_tokens(
                                spo['subject'], self.tokenizer, self.max_seq_length)
                            object_sub_tokens = covert_to_tokens(
                                spo['object']['@value'], self.tokenizer,
                                self.max_seq_length)
                            sub_ent_list.append(spo['subject'])
                        else:
                            # complex relation
                            complex_relation_label = [6, 8, 24, 30, 44]
                            complex_relation_affi_label = [7, 9, 25, 26, 27, 31, 45]
                            predicate_label = self.spo_conf[spo['predicate'] + '_' + spo_object]
                            if predicate_label in complex_relation_affi_label:
                                # for affiliated slots the '@value' object acts as the subject
                                subject_sub_tokens = covert_to_tokens(
                                    spo['object']['@value'], self.tokenizer,
                                    self.max_seq_length)
                                sub_ent_list.append(spo['object']['@value'])
                            else:
                                subject_sub_tokens = covert_to_tokens(
                                    spo['subject'], self.tokenizer,
                                    self.max_seq_length)
                                sub_ent_list.append(spo['subject'])
                            object_sub_tokens = covert_to_tokens(
                                spo['object'][spo_object], self.tokenizer,
                                self.max_seq_length)
                        subject_start, object_start = search_spo_index(
                            tokens, subject_sub_tokens, object_sub_tokens)
                        if subject_start == -1:
                            subject_start = search(subject_sub_tokens, tokens)
                        if object_start == -1:
                            object_start = search(object_sub_tokens, tokens)
                        if subject_start != -1 and object_start != -1:
                            s = (subject_start, subject_start + len(subject_sub_tokens) - 1)
                            o = (object_start, object_start + len(object_sub_tokens) - 1,
                                 predicate_label)
                            if s not in spoes:
                                spoes[s] = []
                            spoes[s].append(o)
                examples.append(
                    Example(p_id=p_id,
                            raw_text=src_data['text'],
                            context=text_raw,
                            tok_to_orig_start_index=tok_to_orig_start_index,
                            tok_to_orig_end_index=tok_to_orig_end_index,
                            bert_tokens=tokens,
                            sub_entity_list=sub_ent_list,
                            gold_answer=src_data['spo_list'],
                            spoes=spoes))
    # print('total gold num is {}'.format(gold_num))
    logging.info("{} total size is {} ".format(data_type, len(examples)))
    return examples
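
# `covert_to_tokens` (name kept as in the repo) is not defined in this
# section. Based on the inline tokenization in the last `_read` below, a
# minimal sketch of the assumed behaviour: wordpiece-tokenize the text,
# truncate to max_seq_length - 2 sub-tokens, and optionally return a map
# from each sub-token back to its character offsets in the raw text. This
# sketch works per character for simplicity; the repo version first groups
# consecutive non-Chinese characters into one chunk.
def covert_to_tokens(text, tokenizer, max_seq_length, return_orig_index=False):
    tokens, tok_to_orig_start_index, tok_to_orig_end_index = [], [], []
    for pos, char in enumerate(text):
        for sub_token in (tokenizer.tokenize(char) if char != ' ' else []):
            tokens.append(sub_token)
            tok_to_orig_start_index.append(pos)
            tok_to_orig_end_index.append(pos)
        if len(tokens) >= max_seq_length - 2:
            break
    tokens = tokens[:max_seq_length - 2]
    tok_to_orig_start_index = tok_to_orig_start_index[:max_seq_length - 2]
    tok_to_orig_end_index = tok_to_orig_end_index[:max_seq_length - 2]
    if return_orig_index:
        return tokens, tok_to_orig_start_index, tok_to_orig_end_index
    return tokens
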
def _read(self, filename, data_type):
    complex_relation_label = [6, 8, 24, 30, 44]
    complex_relation_affi_label = [7, 9, 25, 26, 27, 31, 45]
    examples = []
    with open(filename, 'r') as fr:
        p_id = 0
        for line in tqdm(fr.readlines()):
            p_id += 1
            src_data = json.loads(line)
            text_raw = src_data['text']
            text_raw = text_raw.replace('®', '')
            text_raw = text_raw.replace('◆', '')
            tokens, tok_to_orig_start_index, tok_to_orig_end_index = covert_to_tokens(
                text_raw,
                self.tokenizer,
                self.max_seq_length,
                return_orig_index=True)
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            sub_po_dict, sub_ent_list, spo_list = dict(), list(), list()
            spoes = {}
            for spo in src_data['spo_list']:
                spo_dict = dict()
                for spo_object in spo['object'].keys():
                    if spo['predicate'] in self.spo_conf:
                        label = spo['predicate']
                    else:
                        label = spo['predicate'] + '_' + spo_object
                    spo_dict[self.spo_conf[label]] = spo['object'][spo_object]
                for spo_object in spo['object'].keys():
                    # assign relation label
                    if spo['predicate'] in self.spo_conf:
                        # simple relation
                        predicate_label = self.spo_conf[spo['predicate']]
                        subject_sub_tokens = covert_to_tokens(
                            spo['subject'], self.tokenizer, self.max_seq_length)
                        object_sub_tokens = covert_to_tokens(
                            spo['object']['@value'], self.tokenizer,
                            self.max_seq_length)
                        sub_ent_list.append(spo['subject'])
                    else:
                        # complex relation
                        predicate_label = self.spo_conf[spo['predicate'] + '_' + spo_object]
                        if predicate_label in complex_relation_affi_label:
                            # for affiliated slots the '@value' object acts as the subject
                            subject_sub_tokens = covert_to_tokens(
                                spo['object']['@value'], self.tokenizer,
                                self.max_seq_length)
                            sub_ent_list.append(spo['object']['@value'])
                        else:
                            subject_sub_tokens = covert_to_tokens(
                                spo['subject'], self.tokenizer,
                                self.max_seq_length)
                            sub_ent_list.append(spo['subject'])
                        object_sub_tokens = covert_to_tokens(
                            spo['object'][spo_object], self.tokenizer,
                            self.max_seq_length)
                    subject_start, object_start = search_spo_index(
                        tokens, subject_sub_tokens, object_sub_tokens)
                    if subject_start == -1:
                        subject_start = search(subject_sub_tokens, tokens)
                    if object_start == -1:
                        object_start = search(object_sub_tokens, tokens)
                    if subject_start != -1 and object_start != -1:
                        s = (subject_start, subject_start + len(subject_sub_tokens) - 1)
                        o = (object_start, object_start + len(object_sub_tokens) - 1,
                             predicate_label)
                        if s not in spoes:
                            spoes[s] = []
                        spoes[s].append(o)
            examples.append(
                Example(p_id=p_id,
                        context=text_raw,
                        tok_to_orig_start_index=tok_to_orig_start_index,
                        tok_to_orig_end_index=tok_to_orig_end_index,
                        bert_tokens=tokens,
                        sub_entity_list=sub_ent_list,
                        gold_answer=src_data['spo_list'],
                        spoes=spoes))
            if data_type == 'train':
                # oversample sentences containing affiliated complex relations:
                # add one extra single-subject example per such subject
                flag = False
                for s, o in spoes.items():
                    for (o1, o2, p) in o:
                        if p in complex_relation_affi_label:
                            flag = True
                            continue
                    if not flag:
                        continue
                    tmp_spoes = {s: spoes[s]}
                    examples.append(
                        Example(p_id=p_id,
                                context=text_raw,
                                tok_to_orig_start_index=tok_to_orig_start_index,
                                tok_to_orig_end_index=tok_to_orig_end_index,
                                bert_tokens=tokens,
                                sub_entity_list=sub_ent_list,
                                gold_answer=src_data['spo_list'],
                                spoes=tmp_spoes))
    # print('total gold num is {}'.format(gold_num))
    logging.info("{} total size is {} ".format(data_type, len(examples)))
    return examples
def collate(examples):
    p_ids, examples = zip(*examples)
    p_ids = torch.tensor([p_id for p_id in p_ids], dtype=torch.long)
    batch_char_ids, batch_word_ids = [], []
    batch_token_type_ids, batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [], []
    for example in examples:
        # todo maxlen
        char_ids = [self.char2idx.get(char, 1) for char in example.context]
        word_ids = [
            self.word2idx.get(word, 0) for word in example.text_word
            for _ in word
        ]
        if len(char_ids) != len(word_ids):
            print(example.context)
            print(char_ids)
            print(len(char_ids))
            print(example.text_word)
            print(word_ids)
            print(len(word_ids))
        assert len(char_ids) == len(word_ids)
        char_ids = char_ids[:self.max_len]
        word_ids = word_ids[:self.max_len]
        # example.context = example.context[:self.max_len]
        if self.is_train:
            spoes = {}
            for s, p, o in example.gold_answer:
                s = [self.char2idx.get(s_, 1) for s_ in s]
                # p = BAIDU_RELATION[p]
                o = [self.char2idx.get(o_, 1) for o_ in o]
                s_idx = search(s, char_ids)
                o_idx = search(o, char_ids)
                if s_idx != -1 and o_idx != -1:
                    s = (s_idx, s_idx + len(s) - 1)
                    o = (o_idx, o_idx + len(o) - 1, p)
                    if s not in spoes:
                        spoes[s] = []
                    spoes[s].append(o)
            if spoes:
                # subject labels (BIO tags over chars)
                token_type_ids = np.zeros(len(char_ids), dtype=np.int64)
                subject_labels = np.zeros(len(char_ids), dtype=np.int64)
                for s in spoes:
                    subject_labels[s[0]] = BAIDU_ENTITY['B']
                    for index in range(s[0] + 1, s[1] + 1):
                        subject_labels[index] = BAIDU_ENTITY['I']
                # randomly pick one subject
                subject_ids = random.choice(list(spoes.keys()))
                token_type_ids[subject_ids[0]:subject_ids[1] + 1] = 1
                # object labels conditioned on the picked subject
                object_labels = np.zeros(len(char_ids), dtype=np.int64)
                for o in spoes.get(subject_ids, []):
                    object_labels[o[0]] = BAIDU_BIES['B' + '-' + o[2]]
                    for index in range(o[0] + 1, o[1] + 1):
                        object_labels[index] = BAIDU_BIES['I' + '-' + o[2]]
                batch_char_ids.append(char_ids)
                batch_word_ids.append(word_ids)
                batch_token_type_ids.append(token_type_ids)
                batch_subject_labels.append(subject_labels)
                batch_subject_ids.append(subject_ids)
                batch_object_labels.append(object_labels)
        else:
            batch_char_ids.append(char_ids)
            batch_word_ids.append(word_ids)
    batch_char_ids = sequence_padding(batch_char_ids, is_float=False)
    batch_word_ids = sequence_padding(batch_word_ids, is_float=False)
    if not self.is_train:
        return p_ids, batch_char_ids, batch_word_ids
    else:
        batch_token_type_ids = sequence_padding(batch_token_type_ids, is_float=False)
        batch_subject_ids = torch.tensor(batch_subject_ids)
        batch_subject_labels = sequence_padding(batch_subject_labels, is_float=False)
        batch_object_labels = sequence_padding(batch_object_labels, is_float=False)
        return batch_char_ids, batch_word_ids, batch_token_type_ids, batch_subject_ids, batch_subject_labels, batch_object_labels
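
# With `p = BAIDU_RELATION[p]` commented out above, the predicate stays a
# string, so BAIDU_BIES must be keyed by 'B-<predicate>' / 'I-<predicate>'.
# A plausible (hypothetical) construction of both label vocabularies:
BAIDU_ENTITY = {'O': 0, 'B': 1, 'I': 2}

# toy stand-in; the real BAIDU_RELATION maps every predicate string to an id
BAIDU_RELATION = {'主演': 0, '饰演_@value': 1, '饰演_inWork': 2}

BAIDU_BIES = {'O': 0}
for rel in BAIDU_RELATION:
    BAIDU_BIES['B-' + rel] = len(BAIDU_BIES)
    BAIDU_BIES['I-' + rel] = len(BAIDU_BIES)
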
def _read(self, filename, data_type):
    examples = []
    with codecs.open(filename, 'r') as f:
        gold_num = 0
        p_id = 0
        for line in tqdm(f):
            p_id += 1
            data_json = json.loads(line.strip())
            text_raw = data_json['text'].lower()
            # split the raw text into single Chinese chars / punctuation and
            # whole runs of other characters
            sub_text = []
            buff = ""
            for char in text_raw:
                if chineseandpunctuationextractor.is_chinese_or_punct(char):
                    if buff != "":
                        sub_text.append(buff)
                        buff = ""
                    sub_text.append(char)
                else:
                    buff += char
            if buff != "":
                sub_text.append(buff)
            # todo note: [CLS] and [SEP] should be stripped at inference time
            tok_to_orig_start_index = []
            tok_to_orig_end_index = []
            tokens = []
            text_tmp = ''
            for (i, token) in enumerate(sub_text):
                sub_tokens = self.tokenizer.tokenize(token) if token != ' ' else []
                text_tmp += token
                for sub_token in sub_tokens:
                    tok_to_orig_start_index.append(len(text_tmp) - len(token))
                    tok_to_orig_end_index.append(len(text_tmp) - 1)
                    tokens.append(sub_token)
                    if len(tokens) >= self.max_seq_length - 2:
                        break
                else:
                    continue
                break
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            sub_po_dict, sub_ent_list, spo_list = dict(), list(), list()
            spoes = {}
            for spo in data_json['spo_list']:
                for spo_object in spo['object'].keys():
                    # assign relation label
                    if spo['predicate'] in self.spo_conf:
                        # simple relation
                        predicate_label = self.spo_conf[spo['predicate']]
                        subject_sub_tokens = self.tokenizer.tokenize(spo['subject'])
                        object_sub_tokens = self.tokenizer.tokenize(
                            spo['object']['@value'])
                        # todo: add handling for the spo_v2 format
                        sub_ent_list.append(spo['subject'].lower())
                        spo_list.append(
                            (spo['subject'].lower(), spo['predicate'],
                             spo['object']['@value'].lower()))
                    else:
                        # complex relation
                        predicate_label = self.spo_conf[spo['predicate'] + '_' + spo_object]
                        subject_sub_tokens = self.tokenizer.tokenize(spo['subject'])
                        object_sub_tokens = self.tokenizer.tokenize(
                            spo['object'][spo_object])
                    subject_start, object_start = search_spo_index(
                        tokens, subject_sub_tokens, object_sub_tokens)
                    if subject_start == -1:
                        subject_start = search(subject_sub_tokens, tokens)
                    if object_start == -1:
                        object_start = search(object_sub_tokens, tokens)
                    if subject_start != -1 and object_start != -1:
                        s = (subject_start, subject_start + len(subject_sub_tokens) - 1)
                        o = (object_start, object_start + len(object_sub_tokens) - 1,
                             predicate_label)
                        if s not in spoes:
                            spoes[s] = []
                        spoes[s].append(o)
            examples.append(
                Example(p_id=p_id,
                        context=text_raw,
                        tok_to_orig_start_index=tok_to_orig_start_index,
                        tok_to_orig_end_index=tok_to_orig_end_index,
                        bert_tokens=tokens,
                        sub_entity_list=list(set(sub_ent_list)),
                        gold_answer=spo_list,
                        spoes=spoes))
            gold_num += len(set(spo_list))
        print('total gold num is {}'.format(gold_num))
    logging.info("{} total size is {} ".format(data_type, len(examples)))
    return examples
def collate(examples):
    p_ids, examples = zip(*examples)
    p_ids = torch.tensor([p_id for p_id in p_ids], dtype=torch.long)
    batch_char_ids, batch_word_ids = [], []
    batch_ent_labels, batch_rel_labels = [], []
    for example in examples:
        # todo maxlen
        char_ids = [self.char2idx.get(char, 1) for char in example.context]
        # The char sequence and the word sequence differ in length. To align
        # them, every char inside a word is mapped to that word's idx, so
        # char_ids and word_ids end up equally long; in the model each char
        # then shares the embedding of the word it belongs to.
        word_ids = [
            self.word2idx.get(word, 0) for word in example.text_word
            for _ in word
        ]
        # word_ids = [self.word2idx.get(word, 0) for word in example.text_word]
        if len(char_ids) != len(word_ids):
            print(example.context)
            print(char_ids)
            print(len(char_ids))
            print(example.text_word)
            print(word_ids)
            print(len(word_ids))
        assert len(char_ids) == len(word_ids)
        char_ids = char_ids[:self.max_len]
        word_ids = word_ids[:self.max_len]
        example.raw_context = example.context[:self.max_len]
        if self.is_train:
            rel_labels = []
            bio = ['O'] * len(char_ids)
            for s, p, o in example.gold_answer:
                s = [self.char2idx.get(s_, 1) for s_ in s]
                p = BAIDU_RELATION[p]
                o = [self.char2idx.get(o_, 1) for o_ in o]
                s_idx = search(s, char_ids)
                o_idx = search(o, char_ids)
                if s_idx != -1 and o_idx != -1:
                    bio[s_idx] = 'B'
                    bio[s_idx + 1:s_idx + len(s)] = 'I' * (len(s) - 1)
                    bio[o_idx] = 'B'
                    bio[o_idx + 1:o_idx + len(o)] = 'I' * (len(o) - 1)
                    s = (s_idx, s_idx + len(s) - 1)
                    o = (o_idx, o_idx + len(o) - 1, p)
                    # keep (subject_end, object_end, predicate) for the selection matrix
                    rel_labels.append((s[1], o[1], o[2]))
            if rel_labels:
                ent_labels = np.zeros((len(char_ids)), dtype=np.int64)
                for index, label_ in enumerate(bio):
                    ent_labels[index] = BAIDU_ENTITY[label_]
                batch_char_ids.append(char_ids)
                batch_word_ids.append(word_ids)
                batch_ent_labels.append(ent_labels)
                batch_rel_labels.append(rel_labels)
        else:
            batch_char_ids.append(char_ids)
            batch_word_ids.append(word_ids)
    batch_char_ids = sequence_padding(batch_char_ids, is_float=False)
    batch_word_ids = sequence_padding(batch_word_ids, is_float=False)
    if not self.is_train:
        return p_ids, batch_char_ids, batch_word_ids
    else:
        batch_ent_labels = sequence_padding(batch_ent_labels, is_float=False)
        batch_rel_labels = select_padding(batch_char_ids,
                                          batch_rel_labels,
                                          is_float=True,
                                          class_num=len(BAIDU_RELATION))
        return batch_char_ids, batch_word_ids, batch_ent_labels, batch_rel_labels
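
# `select_padding` is not shown in this section. Given that each entry of
# `batch_rel_labels` is a list of (subject_end, object_end, predicate_id)
# triples, here is a minimal sketch under a multi-head-selection assumption:
# one (seq_len, class_num, seq_len) indicator per example, with a 1 at
# [subject_end, predicate, object_end] for every gold triple.
import numpy as np
import torch

def select_padding(batch_char_ids, batch_rel_labels, is_float=True, class_num=None):
    seq_len = batch_char_ids.shape[1]  # batch_char_ids is already padded above
    selection = np.zeros(
        (len(batch_rel_labels), seq_len, class_num, seq_len), dtype=np.float32)
    for b, triples in enumerate(batch_rel_labels):
        for s_end, o_end, p in triples:
            selection[b, s_end, p, o_end] = 1
    return torch.tensor(selection,
                        dtype=torch.float32 if is_float else torch.long)
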