Example #1
def build_dataset_gpt2(train_file, type="all"):
    """
    Convert the Baidu training set into a GPT-2 text corpus.
    train_file: path to the source file
    type: "all" or "mini"
    """
    tjson = Tjson(file_path=train_file)
    with open('data/gpt2kg.txt', 'a', encoding='utf-8') as f:
        for item in tqdm(tjson.load()):
            text = item['text']
            # Serialize every SPO triple as " [KG] subject,predicate,object [/KG] ".
            kg = " [KGS] "
            for n in item['spo_list']:
                kg = kg + ' [KG] ' + n['subject'] + "," + n['predicate'] + "," + n['object'] + " [/KG] "
            # One training line per sentence: the text, its triples, then the [KGE] end mark.
            data = text + kg + " [KGE] "
            f.write(data + '\n\n')
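Every snippet on this page leans on a small Tjson helper that is never shown; its call sites (a load() that yields dicts and a save(data=...) that persists them) suggest a JSON-Lines reader/writer. A minimal sketch under that assumption; the real class may differ:

import json

class Tjson:
    """Minimal JSON-Lines reader/writer (reconstruction; the real class may differ)."""
    def __init__(self, file_path):
        self.file_path = file_path

    def load(self):
        # Yield one dict per non-empty line.
        with open(self.file_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

    def save(self, data):
        # Write one JSON object per line.
        with open(self.file_path, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')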
Example #2
def saoke():
    """Preview the first few records of the SAOKE dataset."""
    saoke_data = Tjson(file_path="data/SAOKE_DATA.json")
    i = 0
    for line in saoke_data.load():
        print("###" * 20)
        print(line)
        print(line['natural'])
        for logic in line['logic']:
            print(logic)
            print(logic['predicate'])
            print(logic['qualifier'])
            for obj in logic['object']:
                print(obj)
            print(logic['place'])
            print(logic['time'])
            print(logic['subject'])
        i = i + 1
        if i > 10:
            break  # only inspect the first few records
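From the field accesses above, a SAOKE record appears to have the following shape. This is inferred, not a documented schema; all values are placeholders:

# Inferred SAOKE record shape (field names taken from the accesses above).
sample = {
    "natural": "the source sentence",
    "logic": [{
        "subject": "...",
        "predicate": "...",
        "object": ["..."],   # a list, hence the inner loop above
        "qualifier": "...",
        "place": "...",
        "time": "...",
    }],
}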
Example #3
def build_ner(input_file, path='./', tags=None, type='all'):
    """Convert raw tagged data into NER train/dev files and collect the tag set."""
    d = _read_data(input_file)
    tjson_save = Tjson(file_path=path + "train.json")
    dev_json_save = Tjson(file_path=path + "dev.json")
    data = []
    if tags is None:
        tags = {"<pad>": 1, "O": 1, "<start>": 1, "<eos>": 1}

    for item in tqdm(d):
        # item[0] is the label sequence, item[1] the character sequence.
        for label in item[0]:
            tags[label] = 1

        # Keep only aligned samples that contain no "M-描述" ("M-description") tag.
        if len(list(item[0])) == len(list(item[1])) and "M-描述" not in item[0]:
            lb = []
            for l in item[0]:
                # "关系" = relation, "实体" = entity, "属性" = attribute.
                if l.endswith("关系") or l.endswith("实体") or l.endswith("O"):
                    lb.append(l)
                elif l.endswith("属性"):
                    # Fold attribute tags into relation tags.
                    lb.append(l.replace("属性", '关系'))
                else:
                    lb.append("O")

            one = {"text": item[1], "label": lb}
            data.append(one)
    if type == "mini":
        data = data[:200]

    # 85/15 train/dev split.
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
    return tags
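A hypothetical usage sketch, restoring the tag.txt export that the original had commented out (the input path is assumed):

# Hypothetical usage: build the train/dev files and persist the tag set.
tags = build_ner("data/raw_ner.txt", path="data/", type="mini")
with open("data/tag.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(tags.keys()))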
Example #4
def load_data():
    """Preview SAOKE records together with the spans located by find_srt."""
    file = "data/SAOKE_DATA.json"
    for line in Tjson(file_path=file).load():
        print("##" * 30)
        print(line['natural'])
        for it in line['logic']:
            print(it)
            # Locate each triple element inside the natural sentence.
            print(find_srt(line['natural'], it['subject']))
            print(find_srt(line['natural'], it['predicate']))
            print(find_srt(line['natural'], it['object'][0]))
            print(it['subject'], it['predicate'], it['object'])
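find_srt is not defined in these snippets. A minimal sketch, assuming it returns the span of a substring; the real helper may behave differently:

def find_srt(text, srt):
    # Reconstruction (assumed behavior): return the (start, end) span of
    # `srt` inside `text`, or (-1, -1) when it is absent.
    start = text.find(srt)
    if start < 0:
        return -1, -1
    return start, start + len(srt)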
Example #5
def ner_rebulid():
    """
    Convert the original BIO-tagged data (ORG/PER/LOC) into span tags:
    B-/M-/E- for multi-character spans, S- for single characters.
    """
    new_train = Tjson(file_path="data/train.json")
    new_dev = Tjson(file_path="data/dev.json")
    files = ["data/o/train.json", "data/o/dev.json"]
    data = []

    def flush(a, new_label):
        # Rewrite the accumulated span indices as B-/M-/E- (or S- for a
        # single character); the original repeated this block in every branch.
        if len(a['num']) >= 2:
            for key, i_n in enumerate(a['num']):
                if key == 0:
                    new_label[i_n] = "B-" + a['type']
                elif key == len(a['num']) - 1:
                    new_label[i_n] = "E-" + a['type']
                else:
                    new_label[i_n] = "M-" + a['type']
        elif len(a['num']) == 1:
            new_label[a['num'][0]] = "S-" + a['type']

    for file in files:
        for line in Tjson(file_path=file).load():
            new_label = {}
            a = {'type': "实体", 'num': []}  # "实体" = entity
            for i, label in enumerate(line['label']):
                new_label[i] = label
                if label in ("B-ORG", "B-PER"):
                    # ORG and PER both map to the generic entity type.
                    flush(a, new_label)
                    a = {'type': "实体", 'num': [i]}
                elif label in ("I-ORG", "I-PER", "I-LOC"):
                    a['num'].append(i)
                elif label == "B-LOC":
                    flush(a, new_label)
                    a = {'type': "地点", 'num': [i]}  # "地点" = place
                else:
                    # "O" and anything else closes the current span.
                    flush(a, new_label)
            labels = []
            tags = {}
            for l in new_label:
                labels.append(new_label[l])
                tags[new_label[l]] = 0
            # Keep only sentences with at least one non-O tag.
            if len(tags) > 1:
                one = {"text": line["text"], "label": labels}
                data.append(one)
    f = int(len(data) * 0.85)
    new_train.save(data[:f])
    new_dev.save(data[f:])
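A worked example of the conversion, with a hypothetical input sequence:

# Hypothetical before/after for one sentence:
# input BIO : ['B-ORG', 'I-ORG', 'I-ORG', 'O', 'B-LOC', 'O']
# output    : ['B-实体', 'M-实体', 'E-实体', 'O', 'S-地点', 'O']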
Example #6
def build_dataset_ner(train_file, type="all"):
    """
    Convert the Baidu training set into a tagging dataset for entity
    tagging and relation-word extraction.
    train_file: path to the source file
    type: "all" or "mini"

    Construction idea: merge multiple descriptions into one training sample.
    Use NER to extract the entities from the sentence.

    text: ner + sentence
    label: ['K'] * len(ner) + regular tags
    """
    tjson = Tjson(file_path=train_file)
    tjson_save = Tjson(file_path="data/ner_train.json")
    dev_json_save = Tjson(file_path="data/ner_dev.json")
    data = []

    for item in tqdm(tjson.load()):
        text = item['text']
        label = ["O"] * len(text)
        # Group predicates by subject.
        ner = {}
        for n in item['spo_list']:
            ner.setdefault(n['subject'], []).append(n['predicate'])
        for nr in ner:
            s = 0
            for n in ner[nr]:
                # "关系" = relation.
                label, s1 = mark_word_label(text, label, n, "关系")
                if s1 >= 0:
                    s = s + 1
            if s > 0:
                one = {
                    'text': list(nr + '#' + text),
                    'label': ['K'] * len(nr) + ['X'] + label
                }
                data.append(one)

    if type == "mini":
        data = data[:200]
    print("total samples", len(data))
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
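mark_word_label is used here and in Example #9 but never shown. From its call sites (it returns an updated label list plus a found-position flag, and the tag sets elsewhere use B-/M-/E-/S- prefixes) a minimal sketch might look like this; the real helper may differ:

def mark_word_label(text, label, word, word_type):
    # Reconstruction (assumed behavior): mark the first occurrence of `word`
    # in `text` with B-/M-/E- (or S-) + word_type tags. Returns the updated
    # label list and the match position, or -1 when the word is absent.
    start = text.find(word)
    if start < 0:
        return label, -1
    end = start + len(word)
    if len(word) == 1:
        label[start] = "S-" + word_type
    else:
        label[start] = "B-" + word_type
        for n in range(start + 1, end - 1):
            label[n] = "M-" + word_type
        label[end - 1] = "E-" + word_type
    return label, start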
Example #7
def build_dataset_kg_check(train_file, type="all"):
    """
    Convert the Baidu training set into data for judging whether an
    extracted knowledge triple is plausible.
    """
    tjson = Tjson(file_path=train_file)
    tjson_save = Tjson(file_path="data/kg_check/train.json")
    dev_json_save = Tjson(file_path="data/kg_check/dev.json")
    data = []
    for item in tqdm(tjson.load()):
        for n in item['spo_list']:
            # Positive sample: the real triple prepended to the sentence.
            kg_one = [n['subject'], n['predicate'], n['object']]
            kg = ' [KG] ' + ",".join(kg_one) + " [/KG] " + item['text']
            one = {'sentence': kg, 'label': 1}
            data.append(one)
            # Negative sample: the same triple with its characters shuffled.
            kg_one_list = list(",".join(kg_one))
            shuffle(kg_one_list)
            if kg_one_list != list(",".join(kg_one)):
                kg = ' [KG] ' + "".join(kg_one_list) + " [/KG] " + item['text']
                one = {'sentence': kg, 'label': 0}
                data.append(one)
    if type == "mini":
        data = data[:200]
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
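shuffle here is presumably random.shuffle, so the module needs the import below. The generated pairs then have this shape (placeholder values only):

from random import shuffle  # required by build_dataset_kg_check

# Shape of one positive/negative pair (illustrative values):
positive = {'sentence': ' [KG] subject,predicate,object [/KG] source sentence', 'label': 1}
negative = {'sentence': ' [KG] cejb,tosubprdeitaec,jbot [/KG] source sentence', 'label': 0}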
Example #8
def build_dataset(train_file, type="all"):
    """
    Convert the Baidu training set into a predicate-tagging dataset.
    train_file: path to the source file
    type: "all" or "mini"
    """
    tjson = Tjson(file_path=train_file)
    tjson_save = Tjson(file_path="data/train.json")
    dev_json_save = Tjson(file_path="data/dev.json")
    data = []
    for item in tqdm(tjson.load()):
        text = item['text']
        # Group subject/object pairs by predicate (only the predicate
        # keys are used below).
        predicate = {}
        for n in item['spo_list']:
            predicate[n['predicate']] = []
        for n in item['spo_list']:
            one = {
                "subject": n['subject'],
                "object": n['object'],
            }
            predicate[n['predicate']].append(one)
        # Mark every predicate mention in the text with 'M_P'.
        label = ["O"] * len(text)
        for p in predicate:
            start_p = text.find(p)
            end_p = start_p + len(p)
            if start_p >= 0:
                for n in range(start_p, end_p):
                    label[n] = 'M_P'
        if len(list(text)) == len(list(label)):
            one = {"text": list(text), "label": label}
            data.append(one)
    if type == "mini":
        data = data[:200]

    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
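To illustrate the tagging above with a hypothetical record:

# Hypothetical record: text = "《呐喊》的作者是鲁迅", one spo with predicate "作者".
# The predicate's characters are tagged M_P, everything else stays O:
# text : 《  呐  喊  》  的  作   者   是  鲁  迅
# label: O   O   O   O   O  M_P  M_P  O   O   O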
Example #9
def build_dataset_kg(train_file, type="all"):
    """
    Convert the Baidu training set into a tagging dataset.
    train_file: path to the source file
    type: "all" or "mini"

    Construction idea: merge multiple descriptions into one training sample.
    Use NER to extract the entities from the sentence.

    text: ner + sentence
    label: ['K'] * len(ner) + regular tags
    """
    tjson = Tjson(file_path=train_file)
    all_save = Tjson(file_path="data/train_all.json")
    tjson_save = Tjson(file_path="data/train.json")
    dev_json_save = Tjson(file_path="data/dev.json")
    data = []
    for item in tqdm(tjson.load()):
        text = item['text']
        # kgs[subject][predicate] = {"objects": [...], "label": [...]}
        kgs = {}
        for n in item['spo_list']:
            if kgs.get(n['subject']) is None:
                kgs[n['subject']] = {}
            if kgs[n['subject']].get(n['predicate']) is None:
                # First object for this subject/predicate pair:
                # start a fresh label row. "描述" = description.
                label = ["O"] * len(text)
                label, s = mark_word_label(text, label, n['object'], "描述")
                kgs[n['subject']][n['predicate']] = {
                    "objects": [n['object']],
                    'label': label
                }
            else:
                # Further objects: mark them on the existing label row.
                label = kgs[n['subject']][n['predicate']]['label']
                label, s = mark_word_label(text, label, n['object'], "描述")
                kgs[n['subject']][n['predicate']]['objects'].append(
                    n['object'])

        # One training sample per subject/predicate pair: "subject#predicate#text".
        for ner in kgs.keys():
            for p in kgs[ner]:
                one = {
                    "text":
                    list(ner + '#' + p + '#' + text),
                    'label':
                    len(ner) * ['K'] + ['X'] + len(p) * ['P'] + ['X'] +
                    kgs[ner][p]['label']
                }
                if len(one['text']) == len(one['label']):
                    data.append(one)
    if type == "mini":
        data = data[:200]
    all_save.save(data)
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
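The samples built above take the form subject#predicate#text, with one character-level label per position (placeholder values):

# Shape of one generated sample (illustrative):
# text : list("鲁迅" + '#' + "作者" + '#' + sentence)
# label: ['K', 'K', 'X', 'P', 'P', 'X'] + per-character tags over the sentence,
#        with the objects marked as B-/M-/E-/S-描述 ("描述" = description).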
Example #10
def test():
    """
    Run prediction on the dev set.
    """
    config = Config()
    print('Current settings:\n', config)
    if config.use_cuda:
        torch.cuda.set_device(config.gpu)
    print('loading corpus')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.label_file)
    tagset_size = len(label_dic)

    # Build and load the model once, outside the data loop
    # (the original rebuilt it for every dev item).
    model = BERT_LSTM_CRF(config.bert_path,
                          tagset_size,
                          config.bert_embedding,
                          config.rnn_hidden,
                          config.rnn_layer,
                          dropout_ratio=config.dropout_ratio,
                          dropout1=config.dropout1,
                          use_cuda=config.use_cuda)
    if config.load_model:
        assert config.load_path is not None
        model = load_model(model, name=config.load_path)
    if config.use_cuda:
        model.cuda()

    dev_json_save = Tjson(file_path="data/dev.json")
    for item in dev_json_save.load():
        print("#########" * 5)
        content = " ".join(item['text'])
        print(content)
        print(item['label'])
        input_data = build_input(content=[content],
                                 max_length=config.max_length,
                                 vocab=vocab)

        input_ids = torch.LongTensor([temp.input_id for temp in input_data])
        input_masks = torch.LongTensor(
            [temp.input_mask for temp in input_data])

        input_dataset = TensorDataset(input_ids, input_masks)
        # Do not shuffle at prediction time.
        input_loader = DataLoader(input_dataset,
                                  shuffle=False,
                                  batch_size=config.batch_size)

        for batch in input_loader:
            inputs, masks = batch
            inputs, masks = Variable(inputs), Variable(masks)
            if config.use_cuda:
                inputs, masks = inputs.cuda(), masks.cuda()
            feats = model(inputs)
            path_score, best_path = model.crf(feats, masks.bool())
            print("feats", path_score, best_path)
            # Move tensors back to CPU before converting to numpy; index each
            # decoded path against its own input row (the original always
            # read row 0).
            input_rows = inputs.cpu().numpy().tolist()
            for row, path in enumerate(best_path.cpu().numpy()):
                words = []
                for j, tag_id in enumerate(path.tolist()):
                    word_id = input_rows[row][j]
                    words.append((list(vocab)[word_id], list(label_dic)[tag_id]))
                print('words', words)
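For reference, each printed words entry pairs a vocabulary token with its predicted tag, so the console output looks roughly like this (values depend on the trained model):

# Illustrative output line:
# words [('威', 'B-实体'), ('尔', 'M-实体'), ('士', 'E-实体'), ('柯', 'O'), ...]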