import torch

from data_processing.BERTinizer import SentenceBERTinizer
from model.BERTGraphRel import BERTGraphRel  # assumed module path; BERTGraphRel is defined elsewhere in this repo


class BERTGraphRelExtractor(object):
    def __init__(self, info_dict, trained_model_path=None):
        self.info_dict = info_dict
        self.bertinizer = SentenceBERTinizer()
        self.model = BERTGraphRel(num_ne=info_dict["entity_vsize"],
                                  num_rel=info_dict["rel_vsize"],
                                  embedding_size=self.bertinizer.embedding_size)
        if trained_model_path is not None:
            self.load_trained_model(trained_model_path)

    def load_trained_model(self, path):
        # Load the checkpoint onto CPU regardless of the device it was saved from.
        self.model.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        print("Model", path, "loaded.")

    def get_entities(self, entity_pred_tensor):
        # Pick the most probable BIO entity tag for each token.
        d_id_to_type = self.info_dict["mod_entities_id_to_token_dict"]
        entity_index = entity_pred_tensor.squeeze().argmax(dim=1)
        entity_index_np = entity_index.detach().numpy()
        entity_type = [d_id_to_type[str(i)] for i in entity_index_np]
        return entity_type

    def get_relations(self, rel_pred_tensor):
        # Pick the most probable relation for each token pair; index 0 means
        # "no relation", so nonzero() keeps only predicted relation pairs.
        rel_list = []
        d_id_to_type = self.info_dict["mod_relations_id_to_token_dict"]
        rel_matrix_index = rel_pred_tensor.squeeze(dim=2).argmax(dim=2)
        rel_matrix_index_np = rel_matrix_index.detach().numpy()
        e1_index_array, e2_index_array = rel_matrix_index_np.nonzero()
        if len(e1_index_array) == 0:
            return rel_list
        for i in range(len(e1_index_array)):
            e1_index = e1_index_array[i]
            e2_index = e2_index_array[i]
            rel_index = rel_matrix_index_np[e1_index, e2_index]
            rel_list.append([e1_index, e2_index, d_id_to_type[str(rel_index)]])
        return rel_list

    def analyze_sentence(self, sentence):
        # Tokenize, embed with BERT, average WordPiece embeddings back onto the
        # base tokens, then run the model and decode its phase-2 predictions.
        (tokens_wp, tokens_ids_wp_tensor,
         segments_ids_wp_tensors, tokens_base) = self.bertinizer.tokenize(sentence)
        bert_embeddings = self.bertinizer.get_embeddings(tokens_ids_wp_tensor,
                                                         segments_ids_wp_tensors)
        bert_avg_embeddings = self.bertinizer.average_wp_embeddings(bert_embeddings,
                                                                    tokens_wp)
        ne_p1, rel_p1, ne_p2, rel_p2 = self.model(bert_avg_embeddings.unsqueeze(dim=1))
        entity_type = self.get_entities(ne_p2)
        rel_list = self.get_relations(rel_p2)
        return tokens_base, entity_type, rel_list
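# Usage sketch (added for illustration; not part of the original module). It
# assumes a descriptive JSON produced by InfoCollector containing the keys the
# extractor reads ("entity_vsize", "rel_vsize",
# "mod_entities_id_to_token_dict", "mod_relations_id_to_token_dict"). Both
# file paths below are hypothetical placeholders.
if __name__ == "__main__":
    import json

    with open("./info.json") as f:  # hypothetical path
        demo_info_dict = json.load(f)
    extractor = BERTGraphRelExtractor(demo_info_dict,
                                      trained_model_path="./model.ckpt")  # hypothetical path
    tokens, entity_tags, relations = extractor.analyze_sentence(
        "Paris is the capital of France.")
    # relations holds [token_index_1, token_index_2, relation_type] triples.
    for e1, e2, rel in relations:
        print(tokens[e1], "--[", rel, "]-->", tokens[e2])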
os.makedirs(dumb_dataset_dir)
train_dumb_dataset_path = dumb_dataset_dir + "dumb_train.json"
test_dumb_dataset_path = dumb_dataset_dir + "dumb_test.json"

if __name__ == "__main__":
    ddsc = DumbDataSetConstructor(config_path=dumb_dataset_config_path)

    print("+ Preparing and saving train dataset.")
    ddsc.generate_dataset(n=6000)
    data, ne_list, rel_list = ddsc.get_dataset()
    ddsc.write_json_dataset(train_dumb_dataset_path)

    print("+ Preparing and saving descriptive json.")
    sentbertnizer = SentenceBERTinizer()
    er_aligner = tgtEntRelConstructor(tokenizer=sentbertnizer,
                                      ne_tags=ne_list,
                                      rel_tags=rel_list)
    num_ne = er_aligner.NE_vsize
    num_rel = er_aligner.REL_vsize

    info_collector = InfoCollector()
    info_collector.remember_info(entities=ne_list,
                                 relations=rel_list,
                                 entity_vsize=num_ne,
                                 rel_vsize=num_rel,
                                 mod_entities=er_aligner.NE_biotags,
                                 mod_relations=er_aligner.REL_mod_tags,
from data_processing.BERTinizer import SentenceBERTinizer

# wiki_json_train = "./data/preproc_WikiKBP_json/train.json"
# pubmed_json_train = "./data/preproc_PubMed_json/train.json"
nyt_json_train = "./../data/preproc_NYT_json/train.json"
nyt_json_test = "./../data/preproc_NYT_json/test.json"

data_nyt_train, NE_LIST, REL_LIST = get_dataset(nyt_json_train, _bert_wp_tokenizer)
data_nyt_test, _, _ = get_dataset(nyt_json_test, _bert_wp_tokenizer)

# Take one observation and align its entity/relation annotations with the
# base (non-WordPiece) tokenization.
obs = data_nyt_train[5]
sentence = obs["sentText"]
entityMentions = obs["entityMentions"]
relationMentions = obs["relationMentions"]

sentbertnizer = SentenceBERTinizer()
er_aligner = tgtEntRelConstructor(tokenizer=sentbertnizer,
                                  ne_tags=NE_LIST,
                                  rel_tags=REL_LIST)

tokens_base = sentbertnizer.base_tokenize(sentence, clean_marking=False)
ne_tensor, rel_tensor = er_aligner.get_ne_rel_tensors(tokens_base,
                                                      entityMentions,
                                                      relationMentions)

print("+ Original sentence: \t", sentence)
print("+ Tokenized sentence (without WordPiece): \t", tokens_base)
print("+ Sentence of entities indices: \t", ne_tensor)
print()
print("+ Original NE: \t", entityMentions)
print("+ BIO NE prepared: \t", er_aligner.NE_biotags)
print()
print("+ Size of NE tensor: \t", ne_tensor.size())
print("+ Size of relation tensor: \t", rel_tensor.size())
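# Illustrative follow-up (not in the original script): ne_tensor holds one
# BIO-tag index per base token. Assuming er_aligner.NE_biotags maps index ->
# tag string (as the prints above suggest), the indices can be decoded back
# into entity spans with a generic, self-contained helper like this:

def bio_to_spans(tokens, bio_tags):
    """Group B-/I- tagged tokens into (entity_type, text) spans."""
    spans, current_type, current_tokens = [], None, []
    for token, tag in zip(tokens, bio_tags):
        if tag.startswith("B-"):
            # A B- tag closes any open span and starts a new one.
            if current_type is not None:
                spans.append((current_type, " ".join(current_tokens)))
            current_type, current_tokens = tag[2:], [token]
        elif tag.startswith("I-") and current_type == tag[2:]:
            # Continue the current span only if the entity type matches.
            current_tokens.append(token)
        else:
            # "O" or an inconsistent I- tag closes the open span.
            if current_type is not None:
                spans.append((current_type, " ".join(current_tokens)))
            current_type, current_tokens = None, []
    if current_type is not None:
        spans.append((current_type, " ".join(current_tokens)))
    return spans


# Hypothetical usage with the variables above (assumes NE_biotags is
# indexable by the integer ids stored in ne_tensor):
#   bio_tags = [er_aligner.NE_biotags[int(i)] for i in ne_tensor.view(-1)]
#   print(bio_to_spans(tokens_base, bio_tags))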