Example #1
def extract_items(text_in):
    _X1 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in text_in]
    _X1_MASK = [1] * len(_X1)
    _X1 = torch.tensor([_X1], dtype=torch.long, device=device)  # [1,s1]
    _X1_MASK = torch.tensor([_X1_MASK], dtype=torch.long, device=device)
    _X1_SEG = torch.zeros(*_X1.size(), dtype=torch.long, device=device)

    with torch.no_grad():
        _k1, _k2, _ = subject_model(device, _X1, _X1_SEG, _X1_MASK)  # _k1:[1,s]
        _k1 = _k1[0, :].detach().cpu().numpy()
        _k2 = _k2[0, :].detach().cpu().numpy()
        _k1, _k2 = np.where(_k1 > 0.3)[0], np.where(_k2 > 0.5)[0]

    _subjects = []
    if len(_k1) and len(_k2):
        for i in _k1:
            j = _k2[_k2 >= i]
            if len(j) > 0:
                j = j[0]
                _subject = text_in[i:j + 1]
                if _subject in kb2id:
                    # keep the candidate unless training stats say it is
                    # never labeled as an entity
                    if _subject not in freq or freq[_subject]['per'] > 0:
                        _subjects.append((_subject, str(i), str(j + 1)))

    # fallback: add dictionary-matched subjects the model missed
    for _s in match2(text_in):
        if _s[0] in freq:
            if freq[_s[0]]['per'] > 0.8:
                _subjects.append(_s)
    return list(set(_subjects))
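
Note on shared helpers: these snippets rely on module-level names defined elsewhere in the repo (bert_vocab, device, subject_model, kb2id, id2kb, freq, group, data_dir) and on an undisclosed helper match2. From its call sites, match2(text_in) most likely does dictionary matching of KB aliases against the text and returns (mention, start, end) triples with string offsets, the same tuple shape that extract_items and the freq/group builders below unpack. A minimal sketch under that assumption (the brute-force regex scan and the one-character filter are guesses, not the repo's implementation):

import re

def match2(text_in, vocab=None):
    """Hypothetical reconstruction: find every KB alias occurring in text_in
    and return (mention, start, end) triples with string offsets."""
    vocab = kb2id if vocab is None else vocab  # kb2id: alias -> [sid1, sid2, ...]
    hits = []
    for alias in vocab:
        if len(alias) < 2:  # assumption: skip 1-char aliases to limit noise
            continue
        for m in re.finditer(re.escape(alias), text_in):
            hits.append((alias, str(m.start()), str(m.end())))
    return hits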
Example #2
def extract_items(text_in):
    _X1 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in text_in]
    _X1_MASK = [1] * len(_X1)
    _X1 = torch.tensor([_X1], dtype=torch.long, device=device)  # [1,s1]
    _X1_MASK = torch.tensor([_X1_MASK], dtype=torch.long, device=device)
    _X1_SEG = torch.zeros(*_X1.size(), dtype=torch.long, device=device)

    with torch.no_grad():
        _k1, _k2, _ = subject_model(device, _X1, _X1_SEG,
                                    _X1_MASK)  # _k1:[1,s]
        _k1 = _k1[0, :].detach().cpu().numpy()
        _k2 = _k2[0, :].detach().cpu().numpy()
        _k1, _k2 = np.where(_k1 > 0.3)[0], np.where(_k2 > 0.5)[0]

    _subjects = []
    if len(_k1) and len(_k2):
        for i in _k1:
            j = _k2[_k2 >= i]
            if len(j) > 0:
                j = j[0]
                _subject = text_in[i:j + 1]
                if _subject in kb2id:
                    # keep the candidate unless training stats say it is
                    # never labeled as an entity
                    if _subject not in freq or freq[_subject]['per'] > 0:
                        _subjects.append((_subject, str(i), str(j + 1)))

    # fallback: add dictionary-matched subjects the model missed
    for _s in match2(text_in):
        if _s[0] in freq:
            if freq[_s[0]]['per'] > 0.8:
                _subjects.append(_s)

    _subjects = list(set(_subjects))
    _subjects_new = _subjects.copy()
    for _s, _s_s, _s_e in _subjects:
        for _i, _i_s, _i_e in _subjects:
            if _s_s == _i_s and _s_e != _i_e and _s in group:
                # same start, different end: keep whichever variant the
                # training statistics favor by a 1.5x margin
                if (group[_s]['group_labeled_per'] > 1.5 * group[_s]['s_same_per']
                        and (_i, _i_s, _i_e) in _subjects_new):
                    _subjects_new.remove((_i, _i_s, _i_e))
                if (group[_s]['s_same_per'] > 1.5 * group[_s]['group_labeled_per']
                        and (_s, _s_s, _s_e) in _subjects_new):
                    _subjects_new.remove((_s, _s_s, _s_e))

            if _s_s != _i_s and _s_e == _i_e and _s in group:
                # same end, different start: the analogous rule on e_same_per
                if (group[_s]['group_labeled_per'] > 1.5 * group[_s]['e_same_per']
                        and (_i, _i_s, _i_e) in _subjects_new):
                    _subjects_new.remove((_i, _i_s, _i_e))
                if (group[_s]['e_same_per'] > 1.5 * group[_s]['group_labeled_per']
                        and (_s, _s_s, _s_e) in _subjects_new):
                    _subjects_new.remove((_s, _s_s, _s_e))

    return list(set(_subjects_new))
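
The dedup pass above relies on per-mention statistics in the group dict built in Example #6. A toy run of the same-start branch with made-up numbers, to show which candidate gets dropped:

# Hypothetical statistics: the long form is labeled far more often than its
# shorter same-start variant, so the short candidate is removed.
group = {'刘德华': {'group_labeled_per': 0.9, 's_same_per': 0.2, 'e_same_per': 0.1}}
subjects = [('刘德华', '0', '3'), ('刘德', '0', '2')]  # same start, different ends

kept = subjects.copy()
for s, s_s, s_e in subjects:
    for i, i_s, i_e in subjects:
        if s_s == i_s and s_e != i_e and s in group:
            g = group[s]
            # 0.9 > 1.5 * 0.2, so the co-occurring short variant goes
            if g['group_labeled_per'] > 1.5 * g['s_same_per'] and (i, i_s, i_e) in kept:
                kept.remove((i, i_s, i_e))

print(kept)  # [('刘德华', '0', '3')]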
Example #3
def extract_items(text_in):
    _x1_tokens = jieba.lcut(text_in)
    _x1 = ''.join(_x1_tokens)
    assert len(_x1) == len(text_in)

    _X1 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in _x1]
    _X1_MASK = [1] * len(_X1)
    _X1 = torch.tensor([_X1], dtype=torch.long, device=device)  # [1,s1]
    _X1_MASK = torch.tensor([_X1_MASK], dtype=torch.long, device=device)
    _X1_SEG = torch.zeros(*_X1.size(), dtype=torch.long, device=device)
    _X1_WV = torch.tensor(seq2vec([_x1_tokens]),
                          dtype=torch.float32,
                          device=device)

    with torch.no_grad():
        _k1, _k2, _x1_hs, _x1_h = subject_model('x1', device, _X1_WV, _X1,
                                                _X1_SEG, _X1_MASK)  # _k1:[1,s]
        _k1 = _k1[0, :].detach().cpu().numpy()
        _k2 = _k2[0, :].detach().cpu().numpy()
        _k1, _k2 = np.where(_k1 > 0.3)[0], np.where(_k2 > 0.5)[0]

    _subjects = []
    if len(_k1) and len(_k2):
        for i in _k1:
            j = _k2[_k2 >= i]
            if len(j) > 0:
                j = j[0]
                _subject = text_in[i:j + 1]
                if _subject in kb2id:
                    _subjects.append((_subject, str(i), str(j + 1)))

    # fallback: add dictionary-matched subjects the model missed
    for _s in match2(text_in):
        if _s[0] in freq:
            if freq[_s[0]]['per'] > 0.8:
                _subjects.append(_s)

    return list(set(_subjects))
Example #4
def extract_items(text_in):
    _x1_tokens = jieba.lcut(text_in)
    _x1 = ''.join(_x1_tokens)
    assert len(_x1) == len(text_in)

    _X1 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in _x1]
    _X1_MASK = [1] * len(_X1)
    _X1 = torch.tensor([_X1], dtype=torch.long, device=device)  # [1,s1]
    _X1_MASK = torch.tensor([_X1_MASK], dtype=torch.long, device=device)
    _X1_SEG = torch.zeros(*_X1.size(), dtype=torch.long, device=device)
    _X1_WV = torch.tensor(seq2vec([_x1_tokens]),
                          dtype=torch.float32,
                          device=device)

    with torch.no_grad():
        _k1, _k2, _x1_hs, _x1_h = subject_model('x1', device, _X1_WV, _X1,
                                                _X1_SEG, _X1_MASK)  # _k1:[1,s]
        _k1 = _k1[0, :].detach().cpu().numpy()
        _k2 = _k2[0, :].detach().cpu().numpy()
        _k1, _k2 = np.where(_k1 > 0.3)[0], np.where(_k2 > 0.5)[0]

    _subjects = []
    if len(_k1) and len(_k2):
        for i in _k1:
            j = _k2[_k2 >= i]
            if len(j) > 0:
                j = j[0]
                _subject = text_in[i:j + 1]
                _subjects.append((_subject, str(i), str(j + 1)))

    # fallback: add dictionary-matched subjects the model missed
    for _s in match2(text_in):
        if _s[0] in freq:
            if freq[_s[0]]['per'] > 0.8:
                _subjects.append(_s)
    _subjects = list(set(_subjects))

    if _subjects:
        R = []
        _X2, _X2_MASK, _Y, _X2_wv = [], [], [], []
        _S, _IDXS = [], {}
        for _subj in _subjects:  # renamed from _X1, which shadowed the input-ids tensor
            if _subj[0] in ['的']:  # skip the bare particle '的'
                continue
            _y = np.zeros(len(text_in))
            _y[int(_subj[1]):int(_subj[2])] = 1
            _IDXS[_subj] = kb2id.get(_subj[0], [])
            # keep only the first 16 candidate KB links per subject
            for idx, i in enumerate(_IDXS[_subj]):
                if idx > 15:
                    break
                _x2 = id2kb[i]['subject_desc']
                _x2_tokens = jieba.lcut(_x2)
                _x2 = ''.join(_x2_tokens)
                _x2 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in _x2]
                _x2_mask = [1] * len(_x2)

                _X2.append(_x2)
                _X2_MASK.append(_x2_mask)
                _Y.append(_y)
                _S.append(_subj)
                _X2_wv.append(_x2_tokens)
        if _X2:
            _O = []
            _X2 = torch.tensor(seq_padding(_X2), dtype=torch.long)  # [b,s2]
            _X2_MASK = torch.tensor(seq_padding(_X2_MASK), dtype=torch.long)
            _X2_SEG = torch.zeros(*_X2.size(), dtype=torch.long)
            _Y = torch.tensor(seq_padding(_Y), dtype=torch.float32)
            _X1_HS = _x1_hs.expand(_X2.size(0), -1, -1)  # [b,s1,h]
            _X1_H = _x1_h.expand(_X2.size(0), -1)  # [b,s1]
            _X1_MASK = _X1_MASK.expand(_X2.size(0), -1)  # [b,s1]
            _X1_wv = _X1_WV.expand(_X2.size(0), -1, -1)  # [b,s1,200]
            _X2_wv = torch.tensor(seq2vec(_X2_wv), dtype=torch.float32)

            eval_dataloader = DataLoader(TensorDataset(_X2, _X2_SEG, _X2_MASK,
                                                       _X1_HS, _X1_H, _X1_MASK,
                                                       _Y, _X1_wv, _X2_wv),
                                         batch_size=64)

            for batch_idx, batch in enumerate(eval_dataloader):
                batch = tuple(t.to(device) for t in batch)
                _X2, _X2_SEG, _X2_MASK, _X1_HS, _X1_H, _X1_MASK, _Y, _X1_wv, _X2_wv = batch
                with torch.no_grad():
                    _x2, _x2_h = subject_model('x2', None, None, None, None,
                                               None, _X2, _X2_SEG, _X2_MASK)
                    _o, _, _ = object_model(_X1_HS, _X1_H, _X1_MASK, _Y, _x2,
                                            _x2_h, _X2_MASK, _X1_wv,
                                            _X2_wv)  # _o:[b,1]
                    _o = _o.detach().cpu().numpy()
                    _O.extend(_o)

            for k, v in groupby(zip(_S, _O), key=lambda x: x[0]):
                v = np.array([j[1] for j in v])
                kbid = _IDXS[k][np.argmax(v)]
                R.append((k[0], k[1], kbid))
        return list(set(R))
    else:
        return []
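
Examples #3 and #4 additionally assume seq2vec, and Example #4 seq_padding; neither is shown on this page. From the call sites (seq2vec([_x1_tokens]) feeds a [1, s1, 200] float tensor; seq_padding squares off ragged batches before torch.tensor), plausible sketches follow, with the word-vector model w2v as an assumed global (e.g. a 200-d pretrained lookup):

import numpy as np

def seq_padding(batch, pad=0):
    """Hypothetical: right-pad every sequence in the batch to the max length."""
    max_len = max(len(seq) for seq in batch)
    return np.array([list(seq) + [pad] * (max_len - len(seq)) for seq in batch])

def seq2vec(token_batch, dim=200):
    """Hypothetical: one dim-d word vector per token (zeros for OOV and
    padding), giving a [b, s, dim] array."""
    max_len = max(len(toks) for toks in token_batch)
    out = np.zeros((len(token_batch), max_len, dim), dtype=np.float32)
    for b, toks in enumerate(token_batch):
        for s, tok in enumerate(toks):
            if tok in w2v:  # w2v: assumed pretrained word-vector lookup
                out[b, s] = w2v[tok]
    return out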
Example #5
def freq():
    id2kb = {}
    for l in tqdm((Path(data_dir) / 'kb_data').open()):
        _ = json.loads(l)
        subject_id = _['subject_id']
        subject_alias = list(set([_['subject']] + _.get('alias', [])))
        subject_alias = [sa.lower() for sa in subject_alias]
        subject_desc = ''
        for i in _['data']:
            if '摘要' in i['predicate']:  # '摘要' means "abstract"/"summary"
                subject_desc = i['object']
                break
            else:
                subject_desc += f'{i["predicate"]}:{i["object"]}\n'

        subject_desc = subject_desc[:300].lower()
        if subject_desc:
            id2kb[subject_id] = {
                'subject_alias': subject_alias,
                'subject_desc': subject_desc
            }

    kb2id = defaultdict(list)  # subject: [sid1, sid2,...]
    for i, j in tqdm(id2kb.items()):
        for k in j['subject_alias']:
            kb2id[k].append(i)

    train_data = (Path(data_dir) / 'train.json').open()
    freq_dic = defaultdict(dict)
    cnt = 0
    for i, l in tqdm(enumerate(train_data)):
        # if i > 20000:
        #     break
        l = json.loads(l)
        t = l['text']
        exp_words = [(k, sidx) for k, sidx, _ in match2(t)]
        labeled_words = [(m['mention'], m['offset'])
                         for m in l['mention_data']]
        if not set(exp_words).issuperset(set(labeled_words)):
            cnt += 1
        for w, start_idx in exp_words:
            freq_dic[w]['exp'] = freq_dic[w].get('exp', 0) + 1
            if (w, start_idx) in labeled_words:
                freq_dic[w]['labeled'] = freq_dic[w].get('labeled', 0) + 1

    # if match_rules(word)  21825
    # if word != ''  16347
    print(f'cnt: {cnt}')

    for w in freq_dic:
        if 'labeled' not in freq_dic[w]:
            freq_dic[w]['labeled'] = 0
        freq_dic[w]['per'] = freq_dic[w]['labeled'] / freq_dic[w]['exp']

    with (Path(data_dir) / 'el_freq_dic_1.json').open('w') as p:
        json.dump(freq_dic, p, ensure_ascii=False)
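
Examples #1 through #4 read this file back in as the freq dict; a minimal loading sketch (only the file name comes from the code above, the rest is an assumption about the surrounding repo):

import json
from pathlib import Path

with (Path(data_dir) / 'el_freq_dic_1.json').open() as f:
    freq = json.load(f)  # mention -> {'exp': int, 'labeled': int, 'per': float}

# extract_items trusts a dictionary-matched mention only when it was labeled
# in more than 80% of its training occurrences:
confident = [w for w, d in freq.items() if d['per'] > 0.8]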
Example #6
def group():
    id2kb = {}
    for l in tqdm((Path(data_dir) / 'kb_data').open(), desc='kb_data'):
        _ = json.loads(l)
        subject_id = _['subject_id']
        subject_alias = list(set([_['subject']] + _.get('alias', [])))
        subject_alias = [sa.lower() for sa in subject_alias]
        subject_desc = ''
        for i in _['data']:
            if '摘要' in i['predicate']:  # '摘要' means "abstract"/"summary"
                subject_desc = i['object']
                break
            else:
                subject_desc += f'{i["predicate"]}:{i["object"]}\n'

        subject_desc = subject_desc[:300].lower()
        if subject_desc:
            id2kb[subject_id] = {
                'subject_alias': subject_alias,
                'subject_desc': subject_desc
            }

    kb2id = defaultdict(list)  # subject: [sid1, sid2,...]
    for i, j in tqdm(id2kb.items()):
        for k in j['subject_alias']:
            kb2id[k].append(i)

    train_data = (Path(data_dir) / 'train.json').open()
    freq_dic = defaultdict(dict)

    cnt = 0
    tmp_dic = {}

    for i, l in tqdm(enumerate(train_data), desc='train_data 1'):
        # if i > 20000:
        #     break
        l = json.loads(l)
        t = l['text']
        exp_words = [(k, sidx, int(sidx) + len(k)) for k, sidx, _ in match2(t)]
        labeled_words = [(m['mention'], m['offset'],
                          int(m['offset']) + len(m['mention']))
                         for m in l['mention_data']]
        if not set(exp_words).issuperset(set(labeled_words)):
            for lw, lw_s, lw_e in labeled_words:
                for ew, ew_s, ew_e in exp_words:
                    if lw_s == ew_s and lw_e != ew_e and ew.startswith(lw):
                        if ew not in tmp_dic:
                            tmp_dic[ew] = defaultdict(list)
                        tmp_dic[ew]['s_same'].append(lw)

                    if lw_e == ew_e and lw_s != ew_s and ew.endswith(lw):
                        if ew not in tmp_dic:
                            tmp_dic[ew] = defaultdict(list)
                        tmp_dic[ew]['e_same'].append(lw)

            cnt += 1

    for i, l in tqdm(enumerate((Path(data_dir) / 'train.json').open()),
                     desc='train_data 2'):
        l = json.loads(l)
        ews = [k for k, _, _ in match2(l['text'])]
        lws = [m['mention'] for m in l['mention_data']]
        for w in ews:
            if w in tmp_dic:
                tmp_dic[w]['group_exp'].append(1)
        for w in lws:
            if w in tmp_dic:
                tmp_dic[w]['group_labeled'].append(1)
    tmp_sum_dic = defaultdict(dict)
    for w in tqdm(tmp_dic, desc='tmp_sum_dic'):
        d = tmp_sum_dic[w]
        d['group_exp_cnt'] = len(tmp_dic[w]['group_exp'])
        d['group_labeled_cnt'] = len(tmp_dic[w]['group_labeled'])
        d['s_same_cnt'] = len(tmp_dic[w]['s_same'])
        d['e_same_cnt'] = len(tmp_dic[w]['e_same'])
        d['group_labeled_per'] = d['group_labeled_cnt'] / d['group_exp_cnt']
        d['s_same_per'] = d['s_same_cnt'] / d['group_exp_cnt']
        d['e_same_per'] = d['e_same_cnt'] / d['group_exp_cnt']

    with (Path(data_dir) / 'el_group_word.json').open('w') as tmp_p:
        json.dump(tmp_sum_dic, tmp_p, ensure_ascii=False)
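
Example #2 consumes these statistics as the group dict; the loading side is presumably symmetric (file name from the code above, variable name from Example #2):

import json
from pathlib import Path

with (Path(data_dir) / 'el_group_word.json').open() as f:
    group = json.load(f)
# group[w] carries 'group_labeled_per', 's_same_per' and 'e_same_per', which
# Example #2 compares with a 1.5x margin to decide which of two overlapping
# candidate subjects to drop.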