def evaluate(self, epoch):
    '''Compute micro-averaged recall, precision and F1. Only used for the dev dataset.'''
    filename = self.result_dir + '/' + str(epoch) + '.json'
    print('evaluate ' + filename)
    re_file = eval_file(filename)
    gt_set = eval_file(self.gt_file_name)
    re_set = re_file["submit_result"]

    match_cnt = 0   # predictions that exactly match a gold mention
    M = 0           # total gold mentions
    M_ = 0          # total predicted mentions
    for re, gt in zip(re_set, gt_set):
        re_results = re["mention_result"]
        gt_results = gt["lab_result"]
        M += len(gt_results)
        M_ += len(re_results)
        flag = True
        for re_item in re_results:
            for gt_item in gt_results:
                if re_item["mention"] == gt_item["mention"] \
                        and re_item["offset"] == gt_item["offset"] \
                        and re_item["kb_id"] == gt_item["kb_id"]:
                    flag = False
                    match_cnt += 1
                    break  # count each prediction at most once
        # if flag and epoch == 4:
        #     print(re)
        #     print(gt)
        #     print()

    R = match_cnt / M if M else 0.0
    P = match_cnt / M_ if M_ else 0.0
    F1 = 2 * P * R / (P + R) if P + R else 0.0  # guard against division by zero
    return R, P, F1
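# A minimal worked example of how the micro-averaged metrics above behave
# (hypothetical counts, not taken from any real run): with 80 exact matches
# out of 100 gold mentions and 90 predicted mentions, R = 0.80, P ≈ 0.889,
# and F1 = 2 * P * R / (P + R) ≈ 0.842.
def _f1_example():
    match_cnt, M, M_ = 80, 100, 90   # hypothetical counts
    R = match_cnt / M                # recall over gold mentions
    P = match_cnt / M_               # precision over predicted mentions
    F1 = 2 * P * R / (P + R)
    return R, P, F1                  # -> (0.8, 0.888..., 0.842...)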
def __init__(self, file_dir, name_to_com, id_to_com,
             max_length=160, threshold=0.5, result_dir=None):
    super().__init__()
    self.gt_file_name = file_dir
    self.max_length = max_length
    self.name_to_com = name_to_com
    self.id_to_com = id_to_com
    self.threshold = threshold
    self.result_dir = result_dir

    # Tokenize and tensorize the raw annotations once, up front.
    json_data = eval_file(file_dir)
    self.raw_data, self.data = train_data_prepare(
        json_data, name_to_com, TOKENIZER, max_length)
    self._len = self.data['ids'].shape[0]

    self.ids = torch.tensor(self.data['ids'], dtype=torch.long)
    self.mask_mat = torch.tensor(self.data['mask_mat'], dtype=torch.long)
    self.ent_mask = torch.tensor(self.data['ent_mask'], dtype=torch.long)
    self.kb_ids = torch.tensor(self.data['kb_ids'], dtype=torch.long)
    self.labels = torch.tensor(self.data['labels'], dtype=torch.uint8)
    self.offsets = torch.tensor(self.data['offsets'], dtype=torch.long)
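# A torch Dataset built this way also needs __len__ and __getitem__ so that
# DataLoader can index it. The methods below are a hedged sketch based only
# on the fields defined in __init__ above — the repo's actual methods may
# return a different tuple ordering.
def __len__(self):
    return self._len

def __getitem__(self, idx):
    return (self.ids[idx], self.mask_mat[idx], self.ent_mask[idx],
            self.kb_ids[idx], self.labels[idx], self.offsets[idx])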
from utils.optim import get_optim, adjust_lr

os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
torch.backends.cudnn.benchmark = True

# Make runs reproducible when an explicit seed is given.
if args.seed != -1:
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

# One timestamped directory per run, with a subdirectory for checkpoints.
curr_time = datetime.datetime.now()
result_dir = '../results/' + curr_time.strftime("%m-%d-%H-%M")
os.mkdir(result_dir)
os.mkdir(result_dir + '/ckpts')

print('Load datasets!')
name_to_com = eval_file('../datasets/name_2_company.json')
id_to_com = eval_file('../datasets/id_2_company.json')
train_data = AnnoData('../datasets/train.json', name_to_com, id_to_com)
train_loader = DataLoader(train_data, batch_size=args.batch_size,
                          shuffle=True, num_workers=4)
val_data = AnnoData('../datasets/dev.json', name_to_com, id_to_com,
                    threshold=args.threshold, result_dir=result_dir)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False, num_workers=4)
print('datasets successfully loaded!\n')

print('config model and optim...')
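# eval_file is called throughout this section but not defined in it. Judging
# from its call sites (every call returns a parsed JSON object), a minimal
# sketch could look like the following — an assumption for illustration, not
# the repo's actual implementation:
import json

def eval_file(path):
    """Read a UTF-8 JSON file and return the parsed object."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)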