Example #1
        def collate(examples):
            p_ids, examples = zip(*examples)
            p_ids = torch.tensor(p_ids, dtype=torch.long)
            batch_token_ids, batch_segment_ids = [], []
            batch_token_type_ids, batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [], []
            for example in examples:
                # todo maxlen
                token_ids, segment_ids = self.tokenizer.encode(
                    example.context, max_length=self.max_len)
                example.bert_tokens = self.tokenizer.tokenize(example.context)
                example.token_ids = token_ids
                if self.is_train:
                    spoes = {}
                    for s, p, o in example.gold_answer:
                        s = self.tokenizer.encode(s)[0][1:-1]
                        p = BAIDU_RELATION[p]
                        o = self.tokenizer.encode(o)[0][1:-1]
                        s_idx = search(s, token_ids)
                        o_idx = search(o, token_ids)
                        if s_idx != -1 and o_idx != -1:
                            s = (s_idx, s_idx + len(s) - 1)
                            o = (o_idx, o_idx + len(o) - 1, p)
                            if s not in spoes:
                                spoes[s] = []
                            spoes[s].append(o)

                    if spoes:
                        # subject labels (start/end pointers)
                        token_type_ids = np.zeros(len(token_ids), dtype=np.int64)
                        subject_labels = np.zeros((len(token_ids), 2),
                                                  dtype=np.float32)
                        for s in spoes:
                            subject_labels[s[0], 0] = 1
                            subject_labels[s[1], 1] = 1
                        # randomly pick one subject
                        start, end = np.array(list(spoes.keys())).T
                        start = np.random.choice(start)
                        end = np.random.choice(end[end >= start])
                        token_type_ids[start:end + 1] = 1
                        subject_ids = (start, end)
                        # object labels for the sampled subject
                        object_labels = np.zeros(
                            (len(token_ids), len(BAIDU_RELATION), 2),
                            dtype=np.float32)
                        for o in spoes.get(subject_ids, []):
                            object_labels[o[0], o[2], 0] = 1
                            object_labels[o[1], o[2], 1] = 1
                        batch_token_ids.append(token_ids)
                        batch_token_type_ids.append(token_type_ids)

                        batch_segment_ids.append(segment_ids)
                        batch_subject_labels.append(subject_labels)
                        batch_subject_ids.append(subject_ids)
                        batch_object_labels.append(object_labels)
                else:
                    batch_token_ids.append(token_ids)
                    batch_segment_ids.append(segment_ids)

            batch_token_ids = sequence_padding(batch_token_ids, is_float=False)
            batch_segment_ids = sequence_padding(batch_segment_ids,
                                                 is_float=False)
            if not self.is_train:
                return p_ids, batch_token_ids, batch_segment_ids
            else:
                batch_token_type_ids = sequence_padding(batch_token_type_ids,
                                                        is_float=False)
                batch_subject_ids = torch.tensor(batch_subject_ids)
                batch_subject_labels = sequence_padding(batch_subject_labels,
                                                        padding=np.zeros(2),
                                                        is_float=True)
                batch_object_labels = sequence_padding(
                    batch_object_labels,
                    padding=np.zeros((len(BAIDU_RELATION), 2)),
                    is_float=True)
                return batch_token_ids, batch_segment_ids, batch_token_type_ids, batch_subject_ids, batch_subject_labels, batch_object_labels
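
All of these collate/read snippets rely on a small `search` helper that the listing does not include. Its contract is clear from the call sites: return the start index at which a token subsequence occurs inside a longer sequence, or -1 when it is absent. A minimal sketch consistent with that contract (the repository's actual implementation may differ):

def search(pattern, sequence):
    """Return the first index at which `pattern` occurs as a contiguous
    subsequence of `sequence`, or -1 if it never occurs."""
    n = len(pattern)
    for i in range(len(sequence) - n + 1):
        if sequence[i:i + n] == pattern:
            return i
    return -1
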
Example #2
    def _read(self, filename, data_type):

        examples = []
        with codecs.open(filename, 'r') as f:
            gold_num = 0
            p_id = 0
            for line in tqdm(f):
                p_id += 1
                data_json = json.loads(line.strip())

                text_raw = data_json['text'].lower()

                tokens = []
                for token in text_raw:
                    tokens.append(token)
                    if len(tokens) >= self.max_seq_length - 2:
                        break

                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                sub_po_dict, sub_ent_list, spo_list = dict(), list(), list()
                spoes = {}
                for spo in data_json['spo_list']:

                    for spo_object in spo['object'].keys():
                        # assign relation label
                        if spo['predicate'] in self.spo_conf:
                            # simple relation
                            predicate_label = self.spo_conf[spo['predicate']]
                            subject_sub_tokens = list(spo['subject'])
                            object_sub_tokens = list(spo['object']['@value'])
                            # todo: add the handling logic for the spo_v2 version
                            sub_ent_list.append(spo['subject'].lower())
                            spo_list.append(
                                (spo['subject'].lower(), spo['predicate'],
                                 spo['object']['@value'].lower()))
                        else:
                            # complex relation
                            predicate_label = self.spo_conf[spo['predicate'] +
                                                            '_' + spo_object]
                            subject_sub_tokens = list(spo['subject'])
                            object_sub_tokens = list(spo['object'][spo_object])

                        subject_start, object_start = search_spo_index(
                            tokens, subject_sub_tokens, object_sub_tokens)
                        if subject_start == -1:
                            subject_start = search(subject_sub_tokens, tokens)
                        if object_start == -1:
                            object_start = search(object_sub_tokens, tokens)

                        if subject_start != -1 and object_start != -1:
                            s = (subject_start,
                                 subject_start + len(subject_sub_tokens) - 1)
                            o = (object_start,
                                 object_start + len(object_sub_tokens) - 1,
                                 predicate_label)
                            if s not in spoes:
                                spoes[s] = []
                            spoes[s].append(o)

                examples.append(
                    Example(p_id=p_id,
                            context=text_raw,
                            bert_tokens=tokens,
                            sub_entity_list=list(set(sub_ent_list)),
                            gold_answer=spo_list,
                            spoes=spoes))
                gold_num += len(set(spo_list))
        print('total gold num is {}'.format(gold_num))

        logging.info("{} total size is {}".format(data_type, len(examples)))

        return examples
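
`search_spo_index` is likewise external to this listing. Judging by the fallback calls to `search` right after it, it tries to locate the subject and object spans jointly and signals failure with -1. A hypothetical sketch that prefers a non-overlapping pair of matches:

def search_spo_index(tokens, subject_tokens, object_tokens):
    """Hypothetical sketch: find start indices for the subject and object
    token spans inside `tokens`, preferring a non-overlapping pair; return
    (-1, -1) when no such pair exists, so callers fall back to independent
    `search` lookups."""
    def starts(pattern):
        n = len(pattern)
        return [i for i in range(len(tokens) - n + 1)
                if tokens[i:i + n] == pattern]

    for s in starts(subject_tokens):
        for o in starts(object_tokens):
            # accept the first pair whose spans do not overlap
            if o >= s + len(subject_tokens) or s >= o + len(object_tokens):
                return s, o
    return -1, -1
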
Example #3
        def collate(examples):
            p_ids, examples = zip(*examples)
            p_ids = torch.tensor(p_ids, dtype=torch.long)
            batch_char_ids, batch_word_ids = [], []
            batch_token_type_ids, batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [], []
            for example in examples:
                # todo maxlen
                char_ids = [
                    self.char2idx.get(char, 1) for char in example.context
                ]
                word_ids = [
                    self.word2idx.get(word, 0) for word in example.text_word
                    for _ in word
                ]
                if len(char_ids) != len(word_ids):
                    print(example.context)
                    print(char_ids)
                    print(len(char_ids))
                    print(example.text_word)
                    print(word_ids)
                    print(len(word_ids))
                assert len(char_ids) == len(word_ids)
                char_ids = char_ids[:self.max_len]
                word_ids = word_ids[:self.max_len]
                # example.context = example.context[:self.max_len]

                if self.is_train:
                    spoes = {}
                    for s, p, o in example.gold_answer:
                        s = [self.char2idx.get(s_, 1) for s_ in s]
                        p = BAIDU_RELATION[p]
                        o = [self.char2idx.get(o_, 1) for o_ in o]
                        s_idx = search(s, char_ids)
                        o_idx = search(o, char_ids)
                        if s_idx != -1 and o_idx != -1:
                            s = (s_idx, s_idx + len(s) - 1)
                            o = (o_idx, o_idx + len(o) - 1, p)
                            if s not in spoes:
                                spoes[s] = []
                            spoes[s].append(o)

                    if spoes:
                        # subject labels (start/end pointers)
                        token_type_ids = np.zeros(len(char_ids), dtype=np.int64)
                        subject_labels = np.zeros((len(char_ids), 2),
                                                  dtype=np.float32)
                        for s in spoes:
                            subject_labels[s[0], 0] = 1
                            subject_labels[s[1], 1] = 1
                        # randomly pick one subject
                        start, end = np.array(list(spoes.keys())).T
                        start = np.random.choice(start)
                        end = np.random.choice(end[end >= start])
                        token_type_ids[start:end + 1] = 1
                        subject_ids = (start, end)
                        # object labels for the sampled subject
                        object_labels = np.zeros(
                            (len(char_ids), len(BAIDU_RELATION), 2),
                            dtype=np.float32)
                        for o in spoes.get(subject_ids, []):
                            object_labels[o[0], o[2], 0] = 1
                            object_labels[o[1], o[2], 1] = 1
                        batch_char_ids.append(char_ids)
                        batch_word_ids.append(word_ids)
                        batch_token_type_ids.append(token_type_ids)
                        batch_subject_labels.append(subject_labels)
                        batch_subject_ids.append(subject_ids)
                        batch_object_labels.append(object_labels)
                else:
                    batch_char_ids.append(char_ids)
                    batch_word_ids.append(word_ids)

            batch_char_ids = sequence_padding(batch_char_ids, is_float=False)
            batch_word_ids = sequence_padding(batch_word_ids, is_float=False)
            if not self.is_train:
                return p_ids, batch_char_ids, batch_word_ids
            else:
                batch_token_type_ids = sequence_padding(batch_token_type_ids,
                                                        is_float=False)
                batch_subject_ids = torch.tensor(batch_subject_ids)
                batch_subject_labels = sequence_padding(batch_subject_labels,
                                                        padding=np.zeros(2),
                                                        is_float=True)
                batch_object_labels = sequence_padding(
                    batch_object_labels,
                    padding=np.zeros((len(BAIDU_RELATION), 2)),
                    is_float=True)
                return batch_char_ids, batch_word_ids, batch_token_type_ids, batch_subject_ids, batch_subject_labels, batch_object_labels
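
`sequence_padding` is used by every collate function above with scalar, vector (np.zeros(2)), and matrix (np.zeros((R, 2))) pad values, so it must right-pad each sequence along its first axis and stack the batch into one tensor. A sketch under that assumption (the repository's version may differ in details):

import numpy as np
import torch

def sequence_padding(seqs, padding=0, is_float=False):
    """Hypothetical sketch: right-pad variable-length sequences to the batch
    maximum along axis 0 and stack them. `padding` may be a scalar or a
    per-timestep array such as np.zeros(2)."""
    max_len = max(len(s) for s in seqs)
    pad_value = np.asarray(padding)
    out = []
    for s in seqs:
        s = np.asarray(s)
        if len(s) < max_len:
            pad = np.broadcast_to(pad_value, (max_len - len(s),) + s.shape[1:])
            s = np.concatenate([s, pad], axis=0)
        out.append(s)
    return torch.tensor(np.stack(out),
                        dtype=torch.float32 if is_float else torch.long)
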
Example #4
    def _read(self, filename, data_type):
        '''
        Build the training set.

        Complex relations: when a sample contains a complex SPO group such as
        [{subject: '张智尧', predicate: '饰演', object: {'@value': '楚留香', 'inWork': '楚留香传奇'}},
         {subject: '楚留香传奇', predicate: '主演', object: {'@value': '张智尧'}}]
        we rewrite it as
        {'楚留香传奇': ('张智尧', '主演'), '楚留香': ('楚留香传奇', '饰演_inWork'),
         '张智尧': ('楚留香', '饰演_@value')}
        and insert the resulting examples.
        '''
        examples = []
        with open(filename, 'r') as fr:
            p_id = 0
            for line in tqdm(fr.readlines()):
                p_id += 1
                src_data = json.loads(line)
                text_raw = src_data['text']
                text_raw = text_raw.replace('®', '')
                text_raw = text_raw.replace('◆', '')

                tokens, tok_to_orig_start_index, tok_to_orig_end_index = covert_to_tokens(
                    text_raw,
                    self.tokenizer,
                    self.max_seq_length,
                    return_orig_index=True)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                sub_po_dict, sub_ent_list, spo_list = dict(), list(), list()
                if 'spo_list' not in src_data:
                    examples.append(
                        Example(
                            p_id=p_id,
                            raw_text=src_data['text'],
                            context=text_raw,
                            tok_to_orig_start_index=tok_to_orig_start_index,
                            tok_to_orig_end_index=tok_to_orig_end_index,
                            bert_tokens=tokens,
                            sub_entity_list=None,
                            gold_answer=None,
                            spoes=None))
                else:
                    spoes = {}
                    for spo in src_data['spo_list']:
                        spo_dict = dict()
                        for spo_object in spo['object'].keys():
                            if spo['predicate'] in self.spo_conf:
                                label = spo['predicate']
                            else:
                                label = spo['predicate'] + '_' + spo_object
                            spo_dict[self.spo_conf[label]] = spo['object'][
                                spo_object]

                        for spo_object in spo['object'].keys():
                            # assign relation label
                            if spo['predicate'] in self.spo_conf:
                                # simple relation
                                predicate_label = self.spo_conf[
                                    spo['predicate']]

                                subject_sub_tokens = covert_to_tokens(
                                    spo['subject'], self.tokenizer,
                                    self.max_seq_length)
                                object_sub_tokens = covert_to_tokens(
                                    spo['object']['@value'], self.tokenizer,
                                    self.max_seq_length)
                                sub_ent_list.append(spo['subject'])
                            else:
                                # complex sample
                                complex_relation_label = [6, 8, 24, 30, 44]
                                complex_relation_affi_label = [
                                    7, 9, 25, 26, 27, 31, 45
                                ]
                                predicate_label = self.spo_conf[
                                    spo['predicate'] + '_' + spo_object]

                                if predicate_label in complex_relation_affi_label:
                                    subject_sub_tokens = covert_to_tokens(
                                        spo['object']['@value'],
                                        self.tokenizer, self.max_seq_length)
                                    sub_ent_list.append(
                                        spo['object']['@value'])
                                else:
                                    subject_sub_tokens = covert_to_tokens(
                                        spo['subject'], self.tokenizer,
                                        self.max_seq_length)
                                    sub_ent_list.append(spo['subject'])
                                object_sub_tokens = covert_to_tokens(
                                    spo['object'][spo_object], self.tokenizer,
                                    self.max_seq_length)

                            subject_start, object_start = search_spo_index(
                                tokens, subject_sub_tokens, object_sub_tokens)
                            if subject_start == -1:
                                subject_start = search(subject_sub_tokens,
                                                       tokens)
                            if object_start == -1:
                                object_start = search(object_sub_tokens,
                                                      tokens)

                            if subject_start != -1 and object_start != -1:
                                s = (subject_start, subject_start +
                                     len(subject_sub_tokens) - 1)
                                o = (object_start,
                                     object_start + len(object_sub_tokens) - 1,
                                     predicate_label)
                                if s not in spoes:
                                    spoes[s] = []
                                spoes[s].append(o)

                    examples.append(
                        Example(
                            p_id=p_id,
                            raw_text=src_data['text'],
                            context=text_raw,
                            tok_to_orig_start_index=tok_to_orig_start_index,
                            tok_to_orig_end_index=tok_to_orig_end_index,
                            bert_tokens=tokens,
                            sub_entity_list=sub_ent_list,
                            gold_answer=src_data['spo_list'],
                            spoes=spoes))
        # print('total gold num is {}'.format(gold_num))

        logging.info("{} total size is {}".format(data_type, len(examples)))

        return examples
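
For reference, the `spoes` mapping built above keys each subject span by its (start, end) token indices and stores (object_start, object_end, predicate_id) triples. A toy instance matching the docstring's rewrite (all indices and predicate ids below are invented, not the schema's real values):

# illustrative only -- indices and predicate ids are made up
spoes = {
    (5, 9): [(1, 3, 30)],    # 楚留香传奇 -> (张智尧, 主演)
    (11, 13): [(5, 9, 7)],   # 楚留香 -> (楚留香传奇, 饰演_inWork)
    (1, 3): [(11, 13, 6)],   # 张智尧 -> (楚留香, 饰演_@value)
}
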
Example #5
    def _read(self, filename, data_type):
        complex_relation_label = [6, 8, 24, 30, 44]
        complex_relation_affi_label = [7, 9, 25, 26, 27, 31, 45]
        examples = []
        with open(filename, 'r') as fr:
            p_id = 0
            for line in tqdm(fr.readlines()):
                p_id += 1
                src_data = json.loads(line)
                text_raw = src_data['text']
                text_raw = text_raw.replace('®', '')
                text_raw = text_raw.replace('◆', '')

                tokens, tok_to_orig_start_index, tok_to_orig_end_index = covert_to_tokens(
                    text_raw,
                    self.tokenizer,
                    self.max_seq_length,
                    return_orig_index=True)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                sub_po_dict, sub_ent_list, spo_list = dict(), list(), list()
                spoes = {}
                for spo in src_data['spo_list']:

                    spo_dict = dict()
                    for spo_object in spo['object'].keys():
                        if spo['predicate'] in self.spo_conf:
                            label = spo['predicate']
                        else:
                            label = spo['predicate'] + '_' + spo_object
                        spo_dict[
                            self.spo_conf[label]] = spo['object'][spo_object]

                    for spo_object in spo['object'].keys():
                        # assign relation label
                        if spo['predicate'] in self.spo_conf:
                            # simple relation
                            predicate_label = self.spo_conf[spo['predicate']]

                            subject_sub_tokens = covert_to_tokens(
                                spo['subject'], self.tokenizer,
                                self.max_seq_length)
                            object_sub_tokens = covert_to_tokens(
                                spo['object']['@value'], self.tokenizer,
                                self.max_seq_length)
                            sub_ent_list.append(spo['subject'])
                        else:
                            # complex relation

                            predicate_label = self.spo_conf[spo['predicate'] +
                                                            '_' + spo_object]

                            if predicate_label in complex_relation_affi_label:
                                subject_sub_tokens = covert_to_tokens(
                                    spo['object']['@value'], self.tokenizer,
                                    self.max_seq_length)
                                sub_ent_list.append(spo['object']['@value'])
                            else:
                                subject_sub_tokens = covert_to_tokens(
                                    spo['subject'], self.tokenizer,
                                    self.max_seq_length)
                                sub_ent_list.append(spo['subject'])
                            object_sub_tokens = covert_to_tokens(
                                spo['object'][spo_object], self.tokenizer,
                                self.max_seq_length)

                        subject_start, object_start = search_spo_index(
                            tokens, subject_sub_tokens, object_sub_tokens)
                        if subject_start == -1:
                            subject_start = search(subject_sub_tokens, tokens)
                        if object_start == -1:
                            object_start = search(object_sub_tokens, tokens)

                        if subject_start != -1 and object_start != -1:
                            s = (subject_start,
                                 subject_start + len(subject_sub_tokens) - 1)
                            o = (object_start,
                                 object_start + len(object_sub_tokens) - 1,
                                 predicate_label)
                            if s not in spoes:
                                spoes[s] = []
                            spoes[s].append(o)
                examples.append(
                    Example(p_id=p_id,
                            context=text_raw,
                            tok_to_orig_start_index=tok_to_orig_start_index,
                            tok_to_orig_end_index=tok_to_orig_end_index,
                            bert_tokens=tokens,
                            sub_entity_list=sub_ent_list,
                            gold_answer=src_data['spo_list'],
                            spoes=spoes))
                if data_type == 'train':
                    for s, o in spoes.items():
                        # flag marks subjects that take part in at least one
                        # affiliated (complex) relation; reset it per subject
                        flag = False
                        for (o1, o2, p) in o:
                            if p in complex_relation_affi_label:
                                flag = True
                                break
                        if not flag:
                            continue
                        tmp_spoes = {s: spoes[s]}

                        examples.append(
                            Example(
                                p_id=p_id,
                                context=text_raw,
                                tok_to_orig_start_index=tok_to_orig_start_index,
                                tok_to_orig_end_index=tok_to_orig_end_index,
                                bert_tokens=tokens,
                                sub_entity_list=sub_ent_list,
                                gold_answer=src_data['spo_list'],
                                spoes=tmp_spoes))
                # else:
                #     examples.append(
                #         Example(
                #             p_id=p_id,
                #             context=text_raw,
                #             tok_to_orig_start_index=tok_to_orig_start_index,
                #             tok_to_orig_end_index=tok_to_orig_end_index,
                #             bert_tokens=tokens,
                #             sub_entity_list=sub_ent_list,
                #             gold_answer=src_data['spo_list'],
                #             spoes=None
                #         ))

        # print('total gold num is {}'.format(gold_num))

        logging.info("{} total size is {}".format(data_type, len(examples)))

        return examples
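
Examples #4 and #5 call a `covert_to_tokens` helper (the repository spells the identifier with the `covert` typo) that is not shown here. Example #7 further below inlines essentially the same splitting logic, so a sketch consistent with both would be (it relies on the same `chineseandpunctuationextractor` global that Example #7 uses):

def covert_to_tokens(text, tokenizer, max_seq_length, return_orig_index=False):
    """Hypothetical sketch matching the inline logic in Example #7: keep each
    Chinese character or punctuation mark as its own unit, buffer runs of
    other characters into whole words, then sub-tokenize and truncate so that
    [CLS] and [SEP] can still be added afterwards."""
    sub_text, buff = [], ""
    for char in text:
        if chineseandpunctuationextractor.is_chinese_or_punct(char):
            if buff:
                sub_text.append(buff)
                buff = ""
            sub_text.append(char)
        else:
            buff += char
    if buff:
        sub_text.append(buff)

    tokens, start_index, end_index, text_tmp = [], [], [], ''
    for token in sub_text:
        sub_tokens = tokenizer.tokenize(token) if token != ' ' else []
        text_tmp += token
        for sub_token in sub_tokens:
            # record the character span of each sub-token in the original text
            start_index.append(len(text_tmp) - len(token))
            end_index.append(len(text_tmp) - 1)
            tokens.append(sub_token)
            if len(tokens) >= max_seq_length - 2:
                return (tokens, start_index, end_index) if return_orig_index else tokens
    return (tokens, start_index, end_index) if return_orig_index else tokens
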
Example #6
        def collate(examples):
            p_ids, examples = zip(*examples)
            p_ids = torch.tensor(p_ids, dtype=torch.long)
            batch_char_ids, batch_word_ids = [], []
            batch_token_type_ids, batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [], []
            for example in examples:
                # todo maxlen
                char_ids = [
                    self.char2idx.get(char, 1) for char in example.context
                ]
                word_ids = [
                    self.word2idx.get(word, 0) for word in example.text_word
                    for _ in word
                ]
                if len(char_ids) != len(word_ids):
                    print(example.context)
                    print(char_ids)
                    print(len(char_ids))
                    print(example.text_word)
                    print(word_ids)
                    print(len(word_ids))
                assert len(char_ids) == len(word_ids)
                char_ids = char_ids[:self.max_len]
                word_ids = word_ids[:self.max_len]
                # example.context = example.context[:self.max_len]

                if self.is_train:
                    spoes = {}
                    for s, p, o in example.gold_answer:
                        s = [self.char2idx.get(s_, 1) for s_ in s]
                        # p = BAIDU_RELATION[p]
                        o = [self.char2idx.get(o_, 1) for o_ in o]
                        s_idx = search(s, char_ids)
                        o_idx = search(o, char_ids)
                        if s_idx != -1 and o_idx != -1:
                            s = (s_idx, s_idx + len(s) - 1)
                            o = (o_idx, o_idx + len(o) - 1, p)
                            if s not in spoes:
                                spoes[s] = []
                            spoes[s].append(o)

                    if spoes:
                        # subject labels (BIO tags)
                        token_type_ids = np.zeros(len(char_ids), dtype=np.int64)
                        subject_labels = np.zeros(len(char_ids), dtype=np.int64)
                        for s in spoes:
                            subject_labels[s[0]] = BAIDU_ENTITY['B']
                            for index in range(s[0] + 1, s[1] + 1):
                                subject_labels[index] = BAIDU_ENTITY['I']
                        # randomly pick one subject
                        subject_ids = random.choice(list(spoes.keys()))

                        token_type_ids[subject_ids[0]:subject_ids[1] + 1] = 1
                        # object labels for the sampled subject
                        object_labels = np.zeros(len(char_ids), dtype=np.int64)
                        for o in spoes.get(subject_ids, []):
                            object_labels[o[0]] = BAIDU_BIES['B' + '-' + o[2]]
                            for index in range(o[0] + 1, o[1] + 1):
                                object_labels[index] = BAIDU_BIES['I' + '-' +
                                                                  o[2]]
                        batch_char_ids.append(char_ids)
                        batch_word_ids.append(word_ids)
                        batch_token_type_ids.append(token_type_ids)
                        batch_subject_labels.append(subject_labels)
                        batch_subject_ids.append(subject_ids)
                        batch_object_labels.append(object_labels)
                else:
                    batch_char_ids.append(char_ids)
                    batch_word_ids.append(word_ids)

            batch_char_ids = sequence_padding(batch_char_ids, is_float=False)
            batch_word_ids = sequence_padding(batch_word_ids, is_float=False)
            if not self.is_train:
                return p_ids, batch_char_ids, batch_word_ids
            else:
                batch_token_type_ids = sequence_padding(batch_token_type_ids,
                                                        is_float=False)
                batch_subject_ids = torch.tensor(batch_subject_ids)
                batch_subject_labels = sequence_padding(batch_subject_labels,
                                                        is_float=False)
                batch_object_labels = sequence_padding(batch_object_labels,
                                                       is_float=False)
                return batch_char_ids, batch_word_ids, batch_token_type_ids, batch_subject_ids, batch_subject_labels, batch_object_labels
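
Unlike Examples #1 and #3, which use (seq_len, 2) start/end pointer matrices, this variant tags the subject with BIO ids from BAIDU_ENTITY and the objects with per-predicate ids from BAIDU_BIES. A toy run of the subject-label construction above (the span and the O/B/I -> 0/1/2 mapping are assumptions, not the repository's real values):

import numpy as np

# toy demonstration of the subject tagging above; indices are invented and
# BAIDU_ENTITY is assumed to map 'O'/'B'/'I' to 0/1/2
spoes = {(2, 4): [(7, 9, '主演')]}
subject_labels = np.zeros(12, dtype=np.int64)
for s in spoes:
    subject_labels[s[0]] = 1                # BAIDU_ENTITY['B']
    subject_labels[s[0] + 1:s[1] + 1] = 2   # BAIDU_ENTITY['I']
print(subject_labels)                       # [0 0 1 2 2 0 0 0 0 0 0 0]
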
Example #7
    def _read(self, filename, data_type):

        examples = []
        with codecs.open(filename, 'r') as f:
            gold_num = 0
            p_id = 0
            for line in tqdm(f):
                p_id += 1
                data_json = json.loads(line.strip())

                text_raw = data_json['text'].lower()
                sub_text = []
                buff = ""
                for char in text_raw:
                    if chineseandpunctuationextractor.is_chinese_or_punct(char):
                        if buff != "":
                            sub_text.append(buff)
                            buff = ""
                        sub_text.append(char)
                    else:
                        buff += char
                if buff != "":
                    sub_text.append(buff)
                # todo note: [CLS] and [SEP] should be removed at inference time
                tok_to_orig_start_index = []
                tok_to_orig_end_index = []
                tokens = []
                text_tmp = ''
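                # sub-tokenize each unit, tracking every sub-token's character
                # span in the original text; the for/else/break pattern below
                # exits both loops once max_seq_length - 2 sub-tokens are kept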
                for (i, token) in enumerate(sub_text):
                    sub_tokens = self.tokenizer.tokenize(token) if token != ' ' else []
                    text_tmp += token
                    for sub_token in sub_tokens:
                        tok_to_orig_start_index.append(len(text_tmp) - len(token))
                        tok_to_orig_end_index.append(len(text_tmp) - 1)
                        tokens.append(sub_token)
                        if len(tokens) >= self.max_seq_length - 2:
                            break
                    else:
                        continue
                    break

                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                sub_po_dict, sub_ent_list, spo_list = dict(), list(), list()
                spoes = {}
                for spo in data_json['spo_list']:

                    for spo_object in spo['object'].keys():
                        # assign relation label
                        if spo['predicate'] in self.spo_conf:
                            # simple relation
                            predicate_label = self.spo_conf[spo['predicate']]
                            subject_sub_tokens = self.tokenizer.tokenize(spo['subject'])
                            object_sub_tokens = self.tokenizer.tokenize(spo['object']['@value'])
                            # todo: add the handling logic for the spo_v2 version
                            sub_ent_list.append(spo['subject'].lower())
                            spo_list.append((spo['subject'].lower(), spo['predicate'],
                                             spo['object']['@value'].lower()))
                        else:
                            # complex relation
                            predicate_label = self.spo_conf[spo['predicate'] + '_' +
                                                            spo_object]
                            subject_sub_tokens = self.tokenizer.tokenize(spo['subject'])
                            object_sub_tokens = self.tokenizer.tokenize(spo['object'][spo_object])

                        subject_start, object_start = search_spo_index(tokens, subject_sub_tokens, object_sub_tokens)
                        if subject_start == -1:
                            subject_start = search(subject_sub_tokens, tokens)
                        if object_start == -1:
                            object_start = search(object_sub_tokens, tokens)

                        if subject_start != -1 and object_start != -1:
                            s = (subject_start, subject_start + len(subject_sub_tokens) - 1)
                            o = (object_start, object_start + len(object_sub_tokens) - 1, predicate_label)
                            if s not in spoes:
                                spoes[s] = []
                            spoes[s].append(o)

                examples.append(
                    Example(p_id=p_id,
                            context=text_raw,
                            tok_to_orig_start_index=tok_to_orig_start_index,
                            tok_to_orig_end_index=tok_to_orig_end_index,
                            bert_tokens=tokens,
                            sub_entity_list=list(set(sub_ent_list)),
                            gold_answer=spo_list,
                            spoes=spoes))
                gold_num += len(set(spo_list))
        print('total gold num is {}'.format(gold_num))

        logging.info("{} total size is {}".format(data_type, len(examples)))

        return examples
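
The `chineseandpunctuationextractor.is_chinese_or_punct` call above decides which characters get split into single-character units before sub-tokenization. Its implementation is not part of this listing; a plausible stand-in:

import unicodedata

def is_chinese_or_punct(char):
    """Hypothetical stand-in: True for CJK ideographs and for punctuation
    characters, so each of them becomes its own tokenization unit."""
    cp = ord(char)
    if 0x4E00 <= cp <= 0x9FFF:   # CJK Unified Ideographs block
        return True
    return unicodedata.category(char).startswith('P')
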
Example #8
        def collate(examples):
            p_ids, examples = zip(*examples)
            p_ids = torch.tensor(p_ids, dtype=torch.long)
            batch_char_ids, batch_word_ids = [], []
            batch_ent_labels, batch_rel_labels = [], []
            for example in examples:
                # print("example: ", example)
                # todo maxlen
                char_ids = [
                    self.char2idx.get(char, 1) for char in example.context
                ]
                # The char sequence and the word sequence of a sentence differ
                # in length; to align them, every char inside a word is mapped
                # to that word's idx, which guarantees char_ids and word_ids
                # end up the same length.
                # Inside the model, every char of a word is thus paired with
                # that word's embedding.
                word_ids = [
                    self.word2idx.get(word, 0) for word in example.text_word
                    for _ in word
                ]
                # word_ids = [self.word2idx.get(word, 0) for word in example.text_word]
                if len(char_ids) != len(word_ids):
                    print(example.context)
                    print(char_ids)
                    print(len(char_ids))
                    print(example.text_word)
                    print(word_ids)
                    print(len(word_ids))
                assert len(char_ids) == len(word_ids)
                char_ids = char_ids[:self.max_len]
                word_ids = word_ids[:self.max_len]
                example.raw_context = example.context[:self.max_len]

                if self.is_train:
                    rel_labels = []
                    bio = ['O'] * len(char_ids)
                    for s, p, o in example.gold_answer:
                        s = [self.char2idx.get(s_, 1) for s_ in s]
                        p = BAIDU_RELATION[p]
                        o = [self.char2idx.get(o_, 1) for o_ in o]
                        s_idx = search(s, char_ids)
                        o_idx = search(o, char_ids)
                        if s_idx != -1 and o_idx != -1:
                            bio[s_idx] = 'B'
                            bio[s_idx + 1:s_idx + len(s)] = 'I' * (len(s) - 1)
                            bio[o_idx] = 'B'
                            bio[o_idx + 1:o_idx + len(o)] = 'I' * (len(o) - 1)
                            s = (s_idx, s_idx + len(s) - 1)
                            o = (o_idx, o_idx + len(o) - 1, p)
                            rel_labels.append((s[1], o[1], o[2]))

                    if rel_labels:
                        ent_labels = np.zeros(len(char_ids), dtype=np.int64)
                        for index, label_ in enumerate(bio):
                            ent_labels[index] = BAIDU_ENTITY[label_]
                        batch_char_ids.append(char_ids)
                        batch_word_ids.append(word_ids)
                        batch_ent_labels.append(ent_labels)
                        batch_rel_labels.append(rel_labels)
                else:
                    batch_char_ids.append(char_ids)
                    batch_word_ids.append(word_ids)

            batch_char_ids = sequence_padding(batch_char_ids, is_float=False)
            batch_word_ids = sequence_padding(batch_word_ids, is_float=False)
            if not self.is_train:
                # print("p_ids: ", p_ids)
                # print("batch_char_ids: ", batch_char_ids)
                # print("batch_word_ids: ", batch_word_ids)
                return p_ids, batch_char_ids, batch_word_ids
            else:
                batch_ent_labels = sequence_padding(batch_ent_labels,
                                                    is_float=False)
                batch_rel_labels = select_padding(
                    batch_char_ids,
                    batch_rel_labels,
                    is_float=True,
                    class_num=len(BAIDU_RELATION))
                # print("batch_char_ids: shape=", batch_char_ids.shape, "\n", batch_char_ids)
                # print("batch_word_ids: shape=", batch_word_ids.shape, "\n", batch_word_ids)
                # print("batch_ent_labels: shape=",batch_ent_labels.shape, '\n', batch_ent_labels)
                # print("batch_rel_labels: shape=",batch_rel_labels.shape, "\n", batch_rel_labels)
                return batch_char_ids, batch_word_ids, batch_ent_labels, batch_rel_labels
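
`select_padding` is also external to this listing. Since `batch_rel_labels` holds (subject_end, object_end, predicate_id) triples and a `class_num` is passed in, it presumably builds the dense selection tensor of a multi-head-selection model; a sketch under that assumption:

import torch

def select_padding(batch_token_ids, batch_rel_labels, is_float=True,
                   class_num=49):
    """Hypothetical sketch: one (seq_len, seq_len, class_num) grid per sample,
    with a 1 at (subject_end, object_end, predicate_id) for every gold triple;
    `batch_token_ids` is the already-padded tensor, so its second dimension
    gives seq_len."""
    seq_len = batch_token_ids.shape[1]
    dtype = torch.float32 if is_float else torch.long
    labels = torch.zeros(len(batch_rel_labels), seq_len, seq_len, class_num,
                         dtype=dtype)
    for i, triples in enumerate(batch_rel_labels):
        for s_end, o_end, p in triples:
            labels[i, s_end, o_end, p] = 1
    return labels
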