def __init__(self, label_list=None, path=None, padding=None, unknown=None, bert_model='bert-base-cased', max_length=256, trigger_label_list=None, argument_label_list=None):
    """Initialize the processor and build trigger/argument label vocabularies.

    Delegates the base label vocabulary to the parent class, then builds two
    additional vocabularies from the supplied trigger and argument label lists.
    """
    super().__init__(label_list, path, padding=padding, unknown=unknown, bert_model=bert_model, max_length=max_length)

    # Trigger labels: vocabulary with padding but no unknown token.
    trigger_vocab = Vocabulary(padding=padding)
    trigger_vocab.add_word_lst(trigger_label_list)
    trigger_vocab.build_vocab()
    self.trigger_vocabulary = trigger_vocab

    # Argument labels: vocabulary with both padding and unknown tokens.
    argument_vocab = Vocabulary(padding=padding, unknown=unknown)
    argument_vocab.add_word_lst(argument_label_list)
    argument_vocab.build_vocab()
    self.argument_vocabulary = argument_vocab
def load_vocabulary(self, path):
    """Load the UFET type vocabulary from ``<path>/ufet_types.txt``.

    Populates ``self.vocabulary`` by injecting the word-to-index mapping
    directly into the Vocabulary's private fields and deriving the inverse
    index-to-word mapping from it.
    """
    vocab_file = os.path.join(path, 'ufet_types.txt')
    vocab = Vocabulary()
    word2idx = load_vocab_dict(vocab_file)
    vocab._word2idx = word2idx
    # Invert the mapping so index -> word lookups work as well.
    vocab._idx2word = {idx: word for word, idx in word2idx.items()}
    self.vocabulary = vocab
def __init__(self, node_types_label_list=None, node_attrs_label_list=None, p2p_edges_label_list=None, p2r_edges_label_list=None, path=None, bert_model='bert-base-cased', max_span_width=15, max_length=128):
    """Initialize the processor, building or loading the four label vocabularies.

    For each label family (node types, node attributes, p2p edges, p2r edges):
    if a label list is given, a fresh Vocabulary is built from it and saved
    under ``path``; otherwise the previously saved vocabulary is loaded.

    Args:
        node_types_label_list: labels for node types (padding "O", no unknown).
        node_attrs_label_list: labels for node attributes (padding "O", no unknown).
        p2p_edges_label_list: labels for predicate-to-predicate edges (no padding/unknown).
        p2r_edges_label_list: labels for predicate-to-role edges (no padding/unknown).
        path: directory used both for the frame ontology and the vocabulary files.
        bert_model: pretrained BERT model name for the tokenizer.
        max_span_width: maximum candidate span width, in tokens.
        max_length: maximum sequence length.
    """
    self.path = path
    self.bert_model = bert_model
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(bert_model)
    self.max_span_width = max_span_width
    self._ontology = FrameOntology(self.path)

    # The four vocabularies differ only in label list, file name and special
    # tokens, so they share one build-or-load helper.
    self.node_types_vocabulary = self._build_or_load_vocab(
        node_types_label_list, 'node_types_vocabulary.txt', padding="O", unknown=None)
    self.node_attrs_vocabulary = self._build_or_load_vocab(
        node_attrs_label_list, 'node_attrs_vocabulary.txt', padding="O", unknown=None)
    self.p2p_edges_vocabulary = self._build_or_load_vocab(
        p2p_edges_label_list, 'p2p_edges_vocabulary.txt', padding=None, unknown=None)
    self.p2r_edges_vocabulary = self._build_or_load_vocab(
        p2r_edges_label_list, 'p2r_edges_vocabulary.txt', padding=None, unknown=None)

def _build_or_load_vocab(self, label_list, filename, padding, unknown):
    """Build a Vocabulary from ``label_list`` and save it, or load it from disk.

    Returns the resulting Vocabulary. The file lives at ``self.path/filename``.
    """
    vocab_path = os.path.join(self.path, filename)
    if label_list:
        vocabulary = Vocabulary(padding=padding, unknown=unknown)
        vocabulary.add_word_lst(label_list)
        vocabulary.build_vocab()
        vocabulary.save(vocab_path)
    else:
        vocabulary = Vocabulary.load(vocab_path)
    return vocabulary
def __init__(self, label_list=None, path=None, padding='<pad>', unknown='<unk>', bert_model='bert-base-cased', max_length=256):
    """Initialize the processor and its label vocabulary.

    If ``label_list`` is provided, a vocabulary is built from it and persisted
    to ``path``; otherwise a previously saved vocabulary is loaded from ``path``.
    """
    self.path = path
    self.max_length = max_length
    self.bert_model = bert_model
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)

    if not label_list:
        # No labels supplied: restore the vocabulary saved on a previous run.
        self.load_vocabulary(self.path)
        return

    vocab = Vocabulary(padding=padding, unknown=unknown)
    vocab.add_word_lst(label_list)
    vocab.build_vocab()
    self.vocabulary = vocab
    self.save_vocabulary(self.path)
def __init__(self, schema_path=None, trigger_path=None, argument_path=None, bert_model='bert-base-cased', max_length=128):
    """Initialize event-extraction vocabularies and schema mappings.

    Reads a JSON schema mapping each trigger type to its allowed argument
    types, builds (or loads cached) trigger/argument vocabularies, and
    derives index-level lookup tables used during decoding.

    Args:
        schema_path: JSON file mapping trigger type -> list of argument types.
        trigger_path: file used to cache the trigger vocabulary.
        argument_path: file used to cache the argument vocabulary.
        bert_model: pretrained BERT model name for the tokenizer.
        max_length: maximum sequence length.
    """
    self.schema_path = schema_path
    self.trigger_path = trigger_path
    self.argument_path = argument_path
    self.bert_model = bert_model
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)

    with open(self.schema_path, 'r', encoding='utf-8') as f:
        self.schema_str = json.load(f)

    # Collect the distinct trigger and argument type names from the schema.
    trigger_type_set = set()
    argument_type_set = set()
    for trigger_type, argument_type_list in self.schema_str.items():
        trigger_type_set.add(trigger_type)
        argument_type_set.update(argument_type_list)
    self.trigger_type_list = list(trigger_type_set)
    self.argument_type_list = list(argument_type_set)

    # Map '<type>_s' / '<type>_e' marker names to the argument-type index
    # (presumably span start/end markers — confirm against callers).
    self.args_s_id = {}
    self.args_e_id = {}
    for i, argument_type in enumerate(self.argument_type_list):
        self.args_s_id[argument_type + '_s'] = i
        self.args_e_id[argument_type + '_e'] = i

    # Load cached vocabularies when present; otherwise build and persist them.
    if os.path.exists(self.trigger_path):
        self.trigger_vocabulary = Vocabulary.load(self.trigger_path)
    else:
        self.trigger_vocabulary = Vocabulary(padding=None, unknown=None)
        self.trigger_vocabulary.add_word_lst(self.trigger_type_list)
        self.trigger_vocabulary.build_vocab()
        self.trigger_vocabulary.save(self.trigger_path)
    if os.path.exists(self.argument_path):
        self.argument_vocabulary = Vocabulary.load(self.argument_path)
    else:
        self.argument_vocabulary = Vocabulary(padding=None, unknown=None)
        self.argument_vocabulary.add_word_lst(self.argument_type_list)
        self.argument_vocabulary.build_vocab()
        self.argument_vocabulary.save(self.argument_path)

    # schema_id: trigger index -> list of allowed argument indices.
    self.schema_id = {
        self.trigger_vocabulary.word2idx[trigger_type]: [
            self.argument_vocabulary.word2idx[a] for a in argument_type_list
        ]
        for trigger_type, argument_type_list in self.schema_str.items()
    }

    self.trigger_type_num = len(self.trigger_vocabulary)
    self.argument_type_num = len(self.argument_vocabulary)

    # Every trigger/argument span is currently capped at length 1.
    self.trigger_max_span_len = {name: 1 for name in self.trigger_vocabulary.word2idx}
    self.argument_max_span_len = {name: 1 for name in self.argument_vocabulary.word2idx}
def __init__(self, schema_path=None, trigger_path=None, argument_path=None, bert_model='bert-base-chinese', max_length=128):
    """Initialize Chinese event-extraction vocabularies and schema mappings.

    Reads a JSON schema mapping each trigger type to its allowed argument
    types, builds trigger/argument vocabularies from fixed label lists, and
    derives index-level lookup tables used during decoding.

    Args:
        schema_path: JSON file mapping trigger type -> list of argument types.
        trigger_path: file the trigger vocabulary is saved to.
        argument_path: file the argument vocabulary is saved to.
        bert_model: pretrained BERT model name for the tokenizer.
        max_length: maximum sequence length.
    """
    self.schema_path = schema_path
    self.trigger_path = trigger_path
    self.argument_path = argument_path
    self.bert_model = bert_model
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)

    with open(self.schema_path, 'r', encoding='utf-8') as f:
        self.schema_str = json.load(f)

    # Collect the distinct trigger and argument type names from the schema.
    trigger_type_set = set()
    argument_type_set = set()
    for trigger_type, argument_type_list in self.schema_str.items():
        trigger_type_set.add(trigger_type)
        argument_type_set.update(argument_type_list)
    self.trigger_type_list = list(trigger_type_set)
    self.argument_type_list = list(argument_type_set)

    # Map '<type>_s' / '<type>_e' marker names to the argument-type index
    # (presumably span start/end markers — confirm against callers).
    self.args_s_id = {}
    self.args_e_id = {}
    for i, argument_type in enumerate(self.argument_type_list):
        self.args_s_id[argument_type + '_s'] = i
        self.args_e_id[argument_type + '_e'] = i

    # NOTE(review): unlike the cased-English variant, these vocabularies are
    # always rebuilt from hard-coded label lists and saved, overwriting any
    # existing files at trigger_path/argument_path — confirm this is intended.
    self.trigger_vocabulary = Vocabulary(padding=None, unknown=None)
    self.trigger_vocabulary.add_word_lst(
        ['质押', '股份股权转让', '投资', '减持', '起诉', '收购', '判决', '签署合同', '担保', '中标'])
    self.trigger_vocabulary.build_vocab()
    self.trigger_vocabulary.save(self.trigger_path)

    self.argument_vocabulary = Vocabulary(padding=None, unknown=None)
    self.argument_vocabulary.add_word_lst([
        'collateral', 'obj-per', 'sub-per', 'sub-org', 'share-per', 'title',
        'way', 'money', 'obj-org', 'number', 'amount', 'proportion',
        'target-company', 'date', 'sub', 'share-org', 'obj', 'institution'
    ])
    self.argument_vocabulary.build_vocab()
    self.argument_vocabulary.save(self.argument_path)

    # schema_id: trigger index -> list of allowed argument indices.
    self.schema_id = {
        self.trigger_vocabulary.word2idx[trigger_type]: [
            self.argument_vocabulary.word2idx[a] for a in argument_type_list
        ]
        for trigger_type, argument_type_list in self.schema_str.items()
    }

    self.trigger_type_num = len(self.trigger_vocabulary)
    self.argument_type_num = len(self.argument_vocabulary)

    # Every trigger/argument span is currently capped at length 1.
    self.trigger_max_span_len = {name: 1 for name in self.trigger_vocabulary.word2idx}
    self.argument_max_span_len = {name: 1 for name in self.argument_vocabulary.word2idx}