Example #1
    def __init__(self, label_list=None, path=None, padding=None, unknown=None, bert_model='bert-base-cased',
                 max_length=256, trigger_label_list=None, argument_label_list=None):
        super().__init__(label_list, path, padding=padding, unknown=unknown, bert_model=bert_model,
                         max_length=max_length)
        self.trigger_vocabulary = Vocabulary(padding=padding)
        self.trigger_vocabulary.add_word_lst(trigger_label_list)
        self.trigger_vocabulary.build_vocab()

        self.argument_vocabulary = Vocabulary(padding=padding, unknown=unknown)
        self.argument_vocabulary.add_word_lst(argument_label_list)
        self.argument_vocabulary.build_vocab()
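All six examples follow the same build pattern: construct a Vocabulary, add the label list, then freeze it. Below is a minimal standalone sketch of that round trip, assuming a fastNLP-compatible Vocabulary; the import path and label set are assumptions, while the add_word_lst / build_vocab / word2idx / idx2word calls are the ones used throughout these examples.

from fastNLP import Vocabulary  # assumed import; the examples use a compatible class

labels = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']  # invented label set

vocab = Vocabulary(padding='<pad>', unknown='<unk>')
vocab.add_word_lst(labels)   # register every label
vocab.build_vocab()          # freeze the word <-> index mapping

idx = vocab.word2idx['B-PER']   # label -> index
print(vocab.idx2word[idx])      # index -> label, round-trips to 'B-PER'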
Example #2
File: ufet.py  Project: jinzhuoran/CogIE
 def load_vocabulary(self, path):
     file = os.path.join(path, 'ufet_types.txt')
     self.vocabulary = Vocabulary()
     self.vocabulary._word2idx = load_vocab_dict(file)
     self.vocabulary._idx2word = {
         v: k
         for k, v in self.vocabulary._word2idx.items()
     }
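load_vocab_dict is not part of the snippet above. The helper below is a plausible stand-in, written under the assumption that ufet_types.txt stores one type label per line; it is a sketch, not the project's implementation.

def load_vocab_dict(file):
    # Hypothetical reimplementation: read one label per line and assign
    # indices in file order (assumed format of ufet_types.txt).
    word2idx = {}
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            label = line.strip()
            if label:
                word2idx[label] = len(word2idx)
    return word2idx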
Example #3
    def __init__(self,
                 node_types_label_list=None,
                 node_attrs_label_list=None,
                 p2p_edges_label_list=None,
                 p2r_edges_label_list=None,
                 path=None,
                 bert_model='bert-base-cased',
                 max_span_width=15,
                 max_length=128):
        self.path = path
        self.bert_model = bert_model
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
        self.max_span_width = max_span_width
        self._ontology = FrameOntology(self.path)

        if node_types_label_list:
            self.node_types_vocabulary = Vocabulary(padding="O", unknown=None)
            self.node_types_vocabulary.add_word_lst(node_types_label_list)
            self.node_types_vocabulary.build_vocab()
            self.node_types_vocabulary.save(os.path.join(path, 'node_types_vocabulary.txt'))
        else:
            self.node_types_vocabulary = Vocabulary.load(os.path.join(path, 'node_types_vocabulary.txt'))

        if node_attrs_label_list:
            self.node_attrs_vocabulary = Vocabulary(padding="O", unknown=None)
            self.node_attrs_vocabulary.add_word_lst(node_attrs_label_list)
            self.node_attrs_vocabulary.build_vocab()
            self.node_attrs_vocabulary.save(os.path.join(path, 'node_attrs_vocabulary.txt'))
        else:
            self.node_attrs_vocabulary = Vocabulary.load(os.path.join(path, 'node_attrs_vocabulary.txt'))

        if p2p_edges_label_list:
            self.p2p_edges_vocabulary = Vocabulary(padding=None, unknown=None)
            self.p2p_edges_vocabulary.add_word_lst(p2p_edges_label_list)
            self.p2p_edges_vocabulary.build_vocab()
            self.p2p_edges_vocabulary.save(os.path.join(path, 'p2p_edges_vocabulary.txt'))
        else:
            self.p2p_edges_vocabulary = Vocabulary.load(os.path.join(path, 'p2p_edges_vocabulary.txt'))

        if p2r_edges_label_list:
            self.p2r_edges_vocabulary = Vocabulary(padding=None, unknown=None)
            self.p2r_edges_vocabulary.add_word_lst(p2r_edges_label_list)
            self.p2r_edges_vocabulary.build_vocab()
            self.p2r_edges_vocabulary.save(os.path.join(path, 'p2r_edges_vocabulary.txt'))
        else:
            self.p2r_edges_vocabulary = Vocabulary.load(os.path.join(path, 'p2r_edges_vocabulary.txt'))
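The four branches above repeat the same build-and-save-or-load logic. A hedged refactoring sketch using only the Vocabulary calls already shown in this example; the helper name and the import path are assumptions.

import os

from fastNLP import Vocabulary  # assumed import; stand-in for the class used above

def build_or_load_vocabulary(label_list, path, filename, padding="O", unknown=None):
    # Hypothetical helper: build and persist a Vocabulary when a label list
    # is given, otherwise load the previously saved file.
    vocab_file = os.path.join(path, filename)
    if label_list:
        vocabulary = Vocabulary(padding=padding, unknown=unknown)
        vocabulary.add_word_lst(label_list)
        vocabulary.build_vocab()
        vocabulary.save(vocab_file)
    else:
        vocabulary = Vocabulary.load(vocab_file)
    return vocabulary

# e.g. inside __init__:
# self.node_types_vocabulary = build_or_load_vocabulary(
#     node_types_label_list, path, 'node_types_vocabulary.txt')
# self.p2p_edges_vocabulary = build_or_load_vocabulary(
#     p2p_edges_label_list, path, 'p2p_edges_vocabulary.txt', padding=None)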
Example #4
    def __init__(self,
                 label_list=None,
                 path=None,
                 padding='<pad>',
                 unknown='<unk>',
                 bert_model='bert-base-cased',
                 max_length=256):
        self.path = path
        self.max_length = max_length
        self.bert_model = bert_model
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)

        if label_list:
            self.vocabulary = Vocabulary(padding=padding, unknown=unknown)
            self.vocabulary.add_word_lst(label_list)
            self.vocabulary.build_vocab()
            self.save_vocabulary(self.path)
        else:
            self.load_vocabulary(self.path)
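save_vocabulary and load_vocabulary are called but not shown in this snippet. A plausible pair, meant to live on the same class, assuming the Vocabulary.save / Vocabulary.load methods used in Example #3 and an assumed file name vocabulary.txt.

import os

from fastNLP import Vocabulary  # assumed import; stand-in for the class used above

def save_vocabulary(self, path):
    # Assumed file name; persist the label vocabulary next to the data.
    self.vocabulary.save(os.path.join(path, 'vocabulary.txt'))

def load_vocabulary(self, path):
    # Mirror of save_vocabulary: restore the label vocabulary from disk.
    self.vocabulary = Vocabulary.load(os.path.join(path, 'vocabulary.txt'))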
Example #5
    def __init__(self,
                 schema_path=None,
                 trigger_path=None,
                 argument_path=None,
                 bert_model='bert-base-cased',
                 max_length=128):
        self.schema_path = schema_path
        self.trigger_path = trigger_path
        self.argument_path = argument_path
        self.bert_model = bert_model
        self.max_length = max_length

        self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)
        with open(self.schema_path, 'r', encoding='utf-8') as f:
            self.schema_str = json.load(f)

        self.trigger_type_list = list()
        self.argument_type_list = list()
        trigger_type_set = set()
        argument_type_set = set()
        for trigger_type, argument_type_list in self.schema_str.items():
            trigger_type_set.add(trigger_type)
            for argument_type in argument_type_list:
                argument_type_set.add(argument_type)
        self.trigger_type_list = list(trigger_type_set)
        self.argument_type_list = list(argument_type_set)

        self.args_s_id = {}
        self.args_e_id = {}
        for i in range(len(self.argument_type_list)):
            s = self.argument_type_list[i] + '_s'
            self.args_s_id[s] = i
            e = self.argument_type_list[i] + '_e'
            self.args_e_id[e] = i

        if os.path.exists(self.trigger_path):
            self.trigger_vocabulary = Vocabulary.load(self.trigger_path)
        else:
            self.trigger_vocabulary = Vocabulary(padding=None, unknown=None)
            self.trigger_vocabulary.add_word_lst(self.trigger_type_list)
            self.trigger_vocabulary.build_vocab()
            self.trigger_vocabulary.save(self.trigger_path)
        if os.path.exists(self.argument_path):
            self.argument_vocabulary = Vocabulary.load(self.argument_path)
        else:
            self.argument_vocabulary = Vocabulary(padding=None, unknown=None)
            self.argument_vocabulary.add_word_lst(self.argument_type_list)
            self.argument_vocabulary.build_vocab()
            self.argument_vocabulary.save(self.argument_path)

        self.schema_id = {}
        for trigger_type, argument_type_list in self.schema_str.items():
            self.schema_id[self.trigger_vocabulary.word2idx[trigger_type]] = [
                self.argument_vocabulary.word2idx[a]
                for a in argument_type_list
            ]
        self.trigger_type_num = len(self.trigger_vocabulary)
        self.argument_type_num = len(self.argument_vocabulary)
        self.trigger_max_span_len = {}
        self.argument_max_span_len = {}
        for name in self.trigger_vocabulary.word2idx:
            self.trigger_max_span_len[name] = 1
        for name in self.argument_vocabulary.word2idx:
            self.argument_max_span_len[name] = 1
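The schema_id loop above re-keys the event schema from label strings to vocabulary indices. Below is a toy illustration with an invented two-event schema; the Vocabulary import path is an assumption, and the calls are the ones used in Example #5.

from fastNLP import Vocabulary  # assumed import; stand-in for the class used above

schema_str = {  # invented schema for illustration only
    'Transaction': ['buyer', 'seller', 'money'],
    'Meeting': ['participant', 'place'],
}

trigger_vocabulary = Vocabulary(padding=None, unknown=None)
trigger_vocabulary.add_word_lst(list(schema_str.keys()))
trigger_vocabulary.build_vocab()

argument_vocabulary = Vocabulary(padding=None, unknown=None)
argument_vocabulary.add_word_lst(
    sorted({a for args in schema_str.values() for a in args}))
argument_vocabulary.build_vocab()

# Same re-keying as Example #5: trigger-type index -> legal argument-role indices.
schema_id = {
    trigger_vocabulary.word2idx[t]: [argument_vocabulary.word2idx[a] for a in args]
    for t, args in schema_str.items()
}
print(schema_id)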
Example #6
    def __init__(self,
                 schema_path=None,
                 trigger_path=None,
                 argument_path=None,
                 bert_model='bert-base-chinese',
                 max_length=128):
        self.schema_path = schema_path
        self.trigger_path = trigger_path
        self.argument_path = argument_path
        self.bert_model = bert_model
        self.max_length = max_length

        self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)
        with open(self.schema_path, 'r', encoding='utf-8') as f:
            self.schema_str = json.load(f)

        self.trigger_type_list = list()
        self.argument_type_list = list()
        trigger_type_set = set()
        argument_type_set = set()
        for trigger_type, argument_type_list in self.schema_str.items():
            trigger_type_set.add(trigger_type)
            for argument_type in argument_type_list:
                argument_type_set.add(argument_type)
        self.trigger_type_list = list(trigger_type_set)
        self.argument_type_list = list(argument_type_set)

        self.args_s_id = {}
        self.args_e_id = {}
        for i in range(len(self.argument_type_list)):
            s = self.argument_type_list[i] + '_s'
            self.args_s_id[s] = i
            e = self.argument_type_list[i] + '_e'
            self.args_e_id[e] = i

        # if os.path.exists(self.trigger_path):
        #     self.trigger_vocabulary = Vocabulary.load(self.trigger_path)
        # else:
        self.trigger_vocabulary = Vocabulary(padding=None, unknown=None)
        self.trigger_vocabulary.add_word_lst(
            ['质押', '股份股权转让', '投资', '减持', '起诉', '收购', '判决', '签署合同', '担保', '中标'])
        self.trigger_vocabulary.build_vocab()
        self.trigger_vocabulary.save(self.trigger_path)
        # if os.path.exists(self.argument_path):
        #     self.argument_vocabulary = Vocabulary.load(self.argument_path)
        # else:
        self.argument_vocabulary = Vocabulary(padding=None, unknown=None)
        self.argument_vocabulary.add_word_lst([
            'collateral', 'obj-per', 'sub-per', 'sub-org', 'share-per',
            'title', 'way', 'money', 'obj-org', 'number', 'amount',
            'proportion', 'target-company', 'date', 'sub', 'share-org', 'obj',
            'institution'
        ])
        self.argument_vocabulary.build_vocab()
        self.argument_vocabulary.save(self.argument_path)

        self.schema_id = {}
        for trigger_type, argument_type_list in self.schema_str.items():
            self.schema_id[self.trigger_vocabulary.word2idx[trigger_type]] = [
                self.argument_vocabulary.word2idx[a]
                for a in argument_type_list
            ]
        self.trigger_type_num = len(self.trigger_vocabulary)
        self.argument_type_num = len(self.argument_vocabulary)
        self.trigger_max_span_len = {}
        self.argument_max_span_len = {}
        for name in self.trigger_vocabulary.word2idx:
            self.trigger_max_span_len[name] = 1
        for name in self.argument_vocabulary.word2idx:
            self.argument_max_span_len[name] = 1
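The args_s_id / args_e_id maps built in Examples #5 and #6 give each argument role a start tag id and an end tag id that share the same index. A toy illustration with invented role names, mirroring the loop in those examples:

argument_type_list = ['buyer', 'seller', 'money']  # invented roles

args_s_id = {role + '_s': i for i, role in enumerate(argument_type_list)}
args_e_id = {role + '_e': i for i, role in enumerate(argument_type_list)}

# args_s_id == {'buyer_s': 0, 'seller_s': 1, 'money_s': 2}
# args_e_id == {'buyer_e': 0, 'seller_e': 1, 'money_e': 2}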