Example #1
    def __init__(self, path, args):
        """Inits tokenized sentence and positive pair for MTB.
        
        Args:
            path: path to your dataset.
            args: args from command line.
        
        Returns:
            No returns
        
        Raises:
            If the dataset in `path` is not the same format as described in 
            file 'prepare_data.py', there may raise:
                - `key nor found`
                - `integer can't be indexed`
                and so on.
        """
        self.path = path
        self.args = args
        data = json.load(open(os.path.join(path, "mtbdata.json")))
        entityMarker = EntityMarker()

        # Important configuration: total number of sentences.
        tot_sentence = len(data)

        # Converts tokens to ids and randomly replaces some entities with `BLANK`.
        self.tokens = np.zeros((tot_sentence, args.max_length), dtype=int)
        self.mask = np.zeros((tot_sentence, args.max_length), dtype=int)
        self.h_pos = np.zeros((tot_sentence), dtype=int)
        self.t_pos = np.zeros((tot_sentence), dtype=int)
        for i, sentence in enumerate(data):
            h_flag = random.random() > args.alpha
            t_flag = random.random() > args.alpha
            h_p = sentence["h"]["pos"][0]
            t_p = sentence["t"]["pos"][0]
            ids, ph, pt = entityMarker.tokenize(sentence["tokens"],
                                                [h_p[0], h_p[-1] + 1],
                                                [t_p[0], t_p[-1] + 1], None,
                                                None, h_flag, t_flag)
            length = min(len(ids), args.max_length)
            self.tokens[i][0:length] = ids[0:length]
            self.mask[i][0:length] = 1
            self.h_pos[i] = min(args.max_length - 1, ph)
            self.t_pos[i] = min(args.max_length - 1, pt)
        print(
            "The number of sentences in which the tokenizer can't find the head/tail entity is %d"
            % entityMarker.err)

        entpair2scope = json.load(
            open(os.path.join(path, "entpair2scope.json")))
        entpair2negpair = json.load(
            open(os.path.join(path, "entpair2negpair.json")))
        self.pos_pair = []

        # Generates all positive pairs.
        for key in entpair2scope.keys():
            self.pos_pair.extend(self.__pos_pair__(entpair2scope[key]))
        print("Positive pairs' number is %d" % len(self.pos_pair))
        # Samples negative pairs dynamically.
        self.__sample__()
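Example #1 relies on helper methods `__pos_pair__` and `__sample__` that are not shown here. As a rough illustration of the positive-pair step, the sketch below assumes each value in `entpair2scope` is a half-open `[start, end)` range of sentence indices sharing the same entity pair and simply enumerates every index pair inside that range; the name `pos_pairs_from_scope` is illustrative and the real `__pos_pair__` may differ.

from itertools import combinations

def pos_pairs_from_scope(scope):
    """Sketch: all sentence-index pairs inside one entity-pair scope.

    Assumes `scope` is a half-open [start, end) range of sentence indices
    that share the same (head, tail) entity pair.
    """
    start, end = scope
    return [list(pair) for pair in combinations(range(start, end), 2)]

# A scope covering sentences 4..6 yields [[4, 5], [4, 6], [5, 6]].
print(pos_pairs_from_scope([4, 7]))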
Example #2
    def __init__(self, path, args):
        """Inits tokenized sentence and positive pair for CP.
        
        Args:
            path: path to your dataset.
            args: args from command line.
        
        Returns:
            No returns
        
        Raises:
            If the dataset in `path` is not the same format as described in 
            file 'prepare_data.py', there may raise:
                - `key nor found`
                - `integer can't be indexed`
                and so on.
        """
        self.path = path
        self.args = args
        data = json.load(open(os.path.join(path, "cpdata_dedup.json")))
        rel2scope = json.load(open(os.path.join(path, "rel2scope.json")))
        entityMarker = EntityMarker(args)

        self.tokens = np.zeros((len(data), args.max_length), dtype=int)
        self.mask = np.zeros((len(data), args.max_length), dtype=int)
        self.label = np.zeros((len(data)), dtype=int)
        self.h_pos = np.zeros((len(data)), dtype=int)
        self.t_pos = np.zeros((len(data)), dtype=int)

        # Distantly supervised label for each sentence.
        # Sentences with the same label in a batch
        # form a positive pair; otherwise a negative pair.
        for i, rel in tqdm(enumerate(rel2scope.keys()), "relation types"):
            scope = rel2scope[rel]
            for j in range(scope[0], scope[1]):
                self.label[j] = i

        for i, sentence in tqdm(enumerate(data), "sentences"):
            h_flag = random.random() > args.alpha
            t_flag = random.random() > args.alpha
            h_p = sentence["h"]["pos"][0]
            t_p = sentence["t"]["pos"][0]
            ids, ph, pt = entityMarker.tokenize(sentence["tokens"],
                                                [h_p[0], h_p[-1] + 1],
                                                [t_p[0], t_p[-1] + 1], None,
                                                None, h_flag, t_flag)
            length = min(len(ids), args.max_length)
            self.tokens[i][:length] = ids[:length]
            self.mask[i][:length] = 1
            self.h_pos[i] = min(args.max_length - 1, ph)
            self.t_pos[i] = min(args.max_length - 1, pt)
        print(
            "The number of sentences in which the tokenizer can't find the head/tail entity is %d"
            % entityMarker.err)
        # Samples positive pairs dynamically.
        self.__sample__()
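The comments in Example #2 state that, within a batch, sentences sharing the same distantly supervised label act as positive pairs and all other combinations as negative pairs. The sketch below shows one way such a pairwise mask could be derived from the label ids; `pairwise_positive_mask` is an illustrative helper, not part of the original code.

import numpy as np

def pairwise_positive_mask(batch_labels):
    """Sketch: mask[i, j] is True when sentences i and j share a label.

    `batch_labels` is a 1-D array of distantly supervised relation ids for
    one batch; the diagonal (a sentence paired with itself) is excluded.
    """
    batch_labels = np.asarray(batch_labels)
    mask = batch_labels[:, None] == batch_labels[None, :]
    np.fill_diagonal(mask, False)
    return mask

# Labels [0, 0, 1]: only the first two sentences form a positive pair.
print(pairwise_positive_mask([0, 0, 1]))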
Example #3
    def __init__(self, path, mode, args):
        data = []
        with open(os.path.join(path, mode)) as f:
            all_lines = f.readlines()
            for line in all_lines:
                ins = json.loads(line)
                data.append(ins)

        entityMarker = EntityMarker(args)
        tot_instance = len(data)

        # load rel2id and type2id
        if os.path.exists(os.path.join(path, "rel2id.json")):
            rel2id = json.load(open(os.path.join(path, "rel2id.json")))
        else:
            raise Exception("Error: There is no `rel2id.json` in " + path +
                            ".")
        if os.path.exists(os.path.join(path, "type2id.json")):
            type2id = json.load(open(os.path.join(path, "type2id.json")))
        else:
            print(
                "Warning: There is no `type2id.json` in " + path +
                ". If you want to train the model with the `OT` or `CT` settings, please first run `utils.py` to generate `type2id.json`."
            )

        print("开始预处理文件 " + mode)
        # pre process data
        self.input_ids = np.zeros((tot_instance, args.max_length), dtype=int)
        self.mask = np.zeros((tot_instance, args.max_length), dtype=int)
        self.h_pos = np.zeros((tot_instance), dtype=int)
        self.t_pos = np.zeros((tot_instance), dtype=int)
        self.label = np.zeros((tot_instance), dtype=int)
        print_num = 5
        for i, ins in enumerate(data):
            self.label[i] = rel2id[ins["relation"]]
            # tokenize: context + entity mentions
            if args.mode == "CM":
                ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                    data[i]['h']['pos'],
                                                    data[i]['t']['pos'])
            elif args.mode == "OC":
                ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                    data[i]['h']['pos'],
                                                    data[i]['t']['pos'], None,
                                                    None, True, True)
            elif args.mode == "CT":
                h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] +
                                         10)
                t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] +
                                         10)
                ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                    data[i]['h']['pos'],
                                                    data[i]['t']['pos'],
                                                    h_type, t_type)
            elif args.mode == "OM":
                head = entityMarker.tokenizer.tokenize(ins['h']['name'])
                tail = entityMarker.tokenizer.tokenize(ins['t']['name'])
                h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
                ids, ph, pt = entityMarker.tokenize_OMOT(head, tail, h_first)
            elif args.mode == "OT":
                h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] +
                                         10)
                t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] +
                                         10)
                h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
                ids, ph, pt = entityMarker.tokenize_OMOT([
                    h_type,
                ], [
                    t_type,
                ], h_first)
            else:
                raise Exception(
                    "No such mode! Please make sure that `mode` takes a value in {CM, OC, CT, OM, OT}."
                )

            length = min(len(ids), args.max_length)
            self.input_ids[i][0:length] = ids[0:length]
            self.mask[i][0:length] = 1
            self.h_pos[i] = min(ph, args.max_length - 1)
            self.t_pos[i] = min(pt, args.max_length - 1)
            # Print a few sample examples.
            if i < print_num:
                print(f"Printing sample #{i + 1}")
                print(f'Sample tokens: {data[i]["token"]}')
                print(f"Sample input_ids: {self.input_ids[i]}")
                print(
                    'Input sample pattern: ... "[unused0] " + entity1 + " [unused1]" ... "[unused2] " + entity2 + " [unused3]" ...'
                )
                print(
                    f"Sample input_ids converted back to tokens: {entityMarker.tokenizer.convert_ids_to_tokens(self.input_ids[i])}"
                )
                print(f"Sample mask: {self.mask[i]}")
                print(f"Head entity of the sample: {data[i]['h']['name']}")
                print(
                    f"Position of the head entity in the model input; this [unused0] position is used to predict the relation: {self.h_pos[i]}"
                )
                print(f"Tail entity of the sample: {data[i]['t']['name']}")
                print(
                    f"Position of the tail entity in the model input; this [unused2] position is used to predict the relation: {self.t_pos[i]}"
                )
                print(f"Sample label id: {self.label[i]}")
                print(f"Sample label: {data[i]['relation']}")
                print()
        print("tokenizer无法找到头/尾实体的句子数量为 %d" % entityMarker.err)
Example #4
    def __init__(self, path, mode, args):
        data = []
        with open(os.path.join(path, mode)) as f:
            all_lines = f.readlines()
            for line in all_lines:
                ins = json.loads(line)
                data.append(ins)

        entityMarker = EntityMarker(args)
        tot_instance = len(data)

        # load rel2id and type2id
        if os.path.exists(os.path.join(path, "rel2id.json")):
            rel2id = json.load(open(os.path.join(path, "rel2id.json")))
        else:
            raise Exception("Error: There is no `rel2id.json` in " + path +
                            ".")
        if os.path.exists(os.path.join(path, "type2id.json")):
            type2id = json.load(open(os.path.join(path, "type2id.json")))
        else:
            print(
                "Warning: There is no `type2id.json` in " + path +
                ". If you want to train the model with the `OT` or `CT` settings, please first run `utils.py` to generate `type2id.json`."
            )

        print("pre process " + mode)
        # pre process data
        self.input_ids = np.zeros((tot_instance, args.max_length), dtype=int)
        self.mask = np.zeros((tot_instance, args.max_length), dtype=int)
        self.h_pos = np.zeros((tot_instance), dtype=int)
        self.t_pos = np.zeros((tot_instance), dtype=int)
        self.label = np.zeros((tot_instance), dtype=int)

        for i, ins in enumerate(data):
            self.label[i] = rel2id[ins["relation"]]
            # tokenize
            if args.mode == "CM":
                ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                    data[i]['h']['pos'],
                                                    data[i]['t']['pos'])
            elif args.mode == "OC":
                ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                    data[i]['h']['pos'],
                                                    data[i]['t']['pos'], None,
                                                    None, True, True)
            elif args.mode == "CT":
                h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] +
                                         10)
                t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] +
                                         10)
                ids, ph, pt = entityMarker.tokenize(data[i]["token"],
                                                    data[i]['h']['pos'],
                                                    data[i]['t']['pos'],
                                                    h_type, t_type)
            elif args.mode == "OM":
                head = entityMarker.tokenizer.tokenize(ins['h']['name'])
                tail = entityMarker.tokenizer.tokenize(ins['t']['name'])
                h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
                ids, ph, pt = entityMarker.tokenize_OMOT(head, tail, h_first)
            elif args.mode == "OT":
                h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] +
                                         10)
                t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] +
                                         10)
                h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
                ids, ph, pt = entityMarker.tokenize_OMOT([
                    h_type,
                ], [
                    t_type,
                ], h_first)
            else:
                raise Exception(
                    "No such mode! Please make sure that `mode` takes a value in {CM, OC, CT, OM, OT}."
                )

            length = min(len(ids), args.max_length)
            self.input_ids[i][0:length] = ids[0:length]
            self.mask[i][0:length] = 1
            self.h_pos[i] = min(ph, args.max_length - 1)
            self.t_pos[i] = min(pt, args.max_length - 1)
        print(
            "The number of sentences in which the tokenizer can't find the head/tail entity is %d"
            % entityMarker.err)
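Examples #3 and #4 expect `rel2id.json` and, for the `CT`/`OT` settings, `type2id.json` under `path`. The exact keys depend on the dataset, so the entries below are hypothetical, but they match how the code indexes these maps (`rel2id[ins["relation"]]` and `type2id['subj_' + type]` / `type2id['obj_' + type]`, with an offset of 10 when building the `[unused%d]` marker).

import json

# Hypothetical rel2id.json contents: relation name -> label id.
rel2id_example = {"no_relation": 0, "org:founded_by": 1, "per:employee_of": 2}

# Hypothetical type2id.json contents: prefixed entity type -> id.
type2id_example = {"subj_PERSON": 0, "obj_ORGANIZATION": 1}

print(json.dumps(rel2id_example))
print("[unused%d]" % (type2id_example["subj_PERSON"] + 10))  # -> [unused10]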
Example #5
    def __init__(self, path, args):
        """
        初始化MTB的tokenized sentence和positive pair。
        Args:
            path: path to your dataset.
            args: args from command line.
        
        Returns:
            No returns
        
        Raises:
            If the dataset in `path` is not the same format as described in 
            file 'prepare_data.py', there may raise:
                - `key nor found`
                - `integer can't be indexed`
                and so on.
        """
        self.path = path
        self.args = args
        data = json.load(open(os.path.join(path, "mtbdata.json")))
        # Converts raw text into BERT input IDs and finds entity positions.
        entityMarker = EntityMarker()

        # Important configuration: total number of sentences.
        tot_sentence = len(data)

        # Converts tokens to IDs and randomly replaces some entities with `BLANK`.
        # Initialize tokens, mask, h_pos, t_pos.
        self.tokens = np.zeros((tot_sentence, args.max_length), dtype=int)
        self.mask = np.zeros((tot_sentence, args.max_length), dtype=int)
        self.h_pos = np.zeros((tot_sentence), dtype=int)
        self.t_pos = np.zeros((tot_sentence), dtype=int)
        # Iterate over the data.
        for i, sentence in enumerate(data):
            # Flags deciding whether the head/tail entity is replaced with BLANK; the probability is controlled by args.alpha.
            h_flag = random.random() > args.alpha
            t_flag = random.random() > args.alpha
            # Positions of entity 1 and entity 2, e.g. h_p: [8, 9, 10, 11]
            h_p = sentence["h"]["pos"][0]
            t_p = sentence["t"]["pos"][0]
            # Converts raw text into BERT input IDs and finds entity positions. `ids` are the token IDs of the
            # tokenized sentence, `ph` is the head entity position (the start of the head entity marker),
            # i.e. the position of [unused0], and `pt` is the tail entity position (the start of the tail
            # entity marker), i.e. the position of [unused2].
            ids, ph, pt = entityMarker.tokenize(sentence["tokens"],
                                                [h_p[0], h_p[-1] + 1],
                                                [t_p[0], t_p[-1] + 1], None,
                                                None, h_flag, t_flag)
            # Effective length, used to fill the tokens and compute the mask.
            length = min(len(ids), args.max_length)
            # Replace the zero-initialized tokens with the actual token IDs; the remainder effectively acts as padding.
            self.tokens[i][0:length] = ids[0:length]
            # Only positions that have a token ID are set to 1; the rest stay 0.
            self.mask[i][0:length] = 1
            # Position of the first (head) entity.
            self.h_pos[i] = min(args.max_length - 1, ph)
            self.t_pos[i] = min(args.max_length - 1, pt)
        print("tokenizer找不到头/尾实体的句子数%d" % entityMarker.err)

        entpair2scope = json.load(
            open(os.path.join(path, "entpair2scope.json")))
        entpair2negpair = json.load(
            open(os.path.join(path, "entpair2negpair.json")))
        self.pos_pair = []

        # Generates all positive pairs, e.g. self.pos_pair: [[0, 1], [2, 3], [4, 5]]
        for key in entpair2scope.keys():
            self.pos_pair.extend(self.__pos_pair__(entpair2scope[key]))
        print("Positive pairs 数量是 %d" % len(self.pos_pair))
        # Samples negative pairs dynamically.
        self.__sample__()
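The `h_flag`/`t_flag` comments in Example #5 tie entity blanking to `args.alpha`. Assuming, as those comments suggest, that a True flag means the entity is blanked, `random.random() > alpha` is True with probability roughly `1 - alpha`, so `alpha = 0.3` would blank each entity about 70% of the time. The sketch below simply checks that empirically; `estimate_blank_rate` is an illustrative helper, not part of the original code.

import random

def estimate_blank_rate(alpha, trials=100000, seed=0):
    """Sketch: empirical probability that `random.random() > alpha`."""
    rng = random.Random(seed)
    hits = sum(rng.random() > alpha for _ in range(trials))
    return hits / trials

print(estimate_blank_rate(0.3))  # roughly 0.7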