# Per-fold cross-validation training for the laptop domain.
tokenizer = BertTokenizer.from_pretrained(model_config['path'], do_lower_case=True)
cv_loaders = get_data_loaders_cv(rv_path='../data/TRAIN/Train_laptop_reviews.csv',
                                 lb_path='../data/TRAIN/Train_laptop_labels.csv',
                                 tokenizer=tokenizer,
                                 batch_size=args.bs,
                                 type='laptop',
                                 folds=FOLDS)
BEST_THRESHS = [0.1] * FOLDS
BEST_F1 = [0] * FOLDS
for cv_idx, (train_loader, val_loader) in enumerate(cv_loaders):
    # Every fold starts from the same domain-pretrained checkpoint.
    model = OpinioNet.from_pretrained(model_config['path'],
                                      version=model_config['version'],
                                      focal=model_config['focal'])
    model.load_state_dict(torch.load('../models/pretrained_' + model_config['name']))
    model.cuda()
    optimizer = Adam(model.parameters(), lr=model_config['lr'])
    # Warm the learning rate up over the first 10 epochs' worth of steps.
    scheduler = GradualWarmupScheduler(optimizer, total_epoch=10 * len(train_loader))
    best_val_f1 = 0
    best_val_loss = float('inf')
    for e in range(EP):
        print('Epoch [%d/%d] train:' % (e, EP))
        train_loss, train_f1, train_pr, train_rc = train_epoch(
            model, train_loader, optimizer, scheduler, type='laptop')
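# GradualWarmupScheduler is stepped per batch above, so total_epoch is measured
# in optimizer steps. A minimal sketch of the assumed behaviour (linear LR ramp
# to the base LR, then hold); the actual class, e.g. from the
# pytorch-gradual-warmup-lr package, may differ in post-warmup behaviour:
from torch.optim.lr_scheduler import _LRScheduler

class LinearWarmupSketch(_LRScheduler):  # hypothetical stand-in, not the repo's class
    def __init__(self, optimizer, total_epoch, last_epoch=-1):
        self.total_epoch = total_epoch
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Scale each base LR by the fraction of warmup completed, capped at 1.
        scale = min(1.0, (self.last_epoch + 1) / self.total_epoch)
        return [base_lr * scale for base_lr in self.base_lrs]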
# Round-1 single-domain training entry point.
if __name__ == '__main__':
    EP = 100
    SAVING_DIR = '../models/'
    tokenizer = BertTokenizer.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch',
        do_lower_case=True)
    train_loader, val_loader = get_data_loaders(
        rv_path='../data/TRAIN/Train_reviews.csv',
        lb_path='../data/TRAIN/Train_labels.csv',
        tokenizer=tokenizer,
        batch_size=12,
        val_split=0.15)
    model = OpinioNet.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch')
    model.cuda()
    optimizer = Adam(model.parameters(), lr=5e-6)
    scheduler = GradualWarmupScheduler(optimizer, total_epoch=2)
    best_val_f1 = 0
    best_val_loss = float('inf')
    for e in range(EP):
        print('Epoch [%d/%d] train:' % (e, EP))
        train_loss, train_f1, train_pr, train_rc = train_epoch(
            model, train_loader, optimizer, scheduler)
        print("loss %.5f, f1 %.5f, pr %.5f, rc %.5f"
              % (train_loss, train_f1, train_pr, train_rc))
        print('Epoch [%d/%d] eval:' % (e, EP))
        val_loss, val_f1, val_pr, val_rc = eval_epoch(model, val_loader)
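        # What presumably follows the eval call (a sketch, assuming the usual
        # keep-the-best-checkpoint pattern; the file name 'best_opinionet' is
        # hypothetical, not the repo's actual naming):
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), SAVING_DIR + 'best_opinionet')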
    return total_loss, total_f1, total_pr, total_rc


# Round-2 joint training: makeup + laptop supervision plus an unlabeled
# corpus for the auxiliary masked-LM loss.
if __name__ == '__main__':
    EP = 100
    SAVING_DIR = '../models/'
    tokenizer = BertTokenizer.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12',
        do_lower_case=True)
    # tokenizer = BertTokenizer.from_pretrained(
    #     '/home/zydq/.tf/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12',
    #     do_lower_case=True)
    makeup_train_loader, makeup_val_loader, laptop_train_loader, laptop_val_loader, corpus_loader = \
        get_data_loaders_round2(tokenizer, batch_size=12)
    model = OpinioNet.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12')
    # model = OpinioNet.from_pretrained(
    #     '/home/zydq/.tf/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12', from_tf=True)
    model.cuda()
    optimizer = Adam(model.parameters(), lr=6e-6)
    # Warm up over two epochs of whichever loader has more steps.
    scheduler = GradualWarmupScheduler(
        optimizer, total_epoch=2 * max(len(makeup_train_loader), len(corpus_loader)))
    best_val_f1 = 0
    best_val_loss = float('inf')
    for e in range(EP):
        print('Epoch [%d/%d] train:' % (e, EP))
        train_loss, train_lm_loss, train_f1, train_pr, train_rc = train_epoch(
            model, makeup_train_loader, laptop_train_loader, corpus_loader,
            optimizer, scheduler)
        print("loss %.5f, lm loss %.5f, f1 %.5f, pr %.5f, rc %.5f"
              % (train_loss, train_lm_loss, train_f1, train_pr, train_rc))
        print('Epoch [%d/%d] makeup eval:' % (e, EP))
        val_loss, val_f1, val_pr, val_rc = eval_epoch(model, makeup_val_loader, type='makeup')
        print("makeup_val: loss %.5f, f1 %.5f, pr %.5f, rc %.5f"
              % (val_loss, val_f1, val_pr, val_rc))
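# How train_epoch is assumed to combine a labeled loader with the unlabeled
# corpus loader (a sketch, not the repo's actual code): each step sums the
# opinion-extraction loss with a masked-LM loss. model.loss and model.lm_loss
# are hypothetical names used for illustration only.
def joint_step_sketch(model, optimizer, labeled_batch, corpus_batch, lm_weight=1.0):
    task_loss = model.loss(labeled_batch)   # supervised opinion-extraction objective
    lm_loss = model.lm_loss(corpus_batch)   # auxiliary masked-LM objective on raw text
    loss = task_loss + lm_weight * lm_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return task_loss.item(), lm_loss.item()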
]
tokenizer = BertTokenizer.from_pretrained(
    '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch',
    do_lower_case=True)
test_dataset = ReviewDataset('../data/TEST/Test_reviews.csv', None, tokenizer)
test_loader = DataLoader(test_dataset, 12, collate_fn=test_dataset.batchify,
                         shuffle=False, num_workers=5)

# Average the predictions of every checkpoint in MODELS, then apply
# confidence-threshold filtering before writing the submission file.
ret = None
for name in MODELS:
    model_path = osp.join(SAVING_DIR, name)
    model = OpinioNet.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch')
    model.load_state_dict(torch.load(model_path))
    model.cuda()
    ret = accum_result(ret, eval_epoch(model, test_loader))
    del model
ret = average_result(ret, len(MODELS))
ret = OpinioNet.nms_filter(ret, THRESH)

raw = [s[0][0] for s in test_dataset.samples]
result = gen_submit(ret, raw)

import time
result.to_csv('../submit/ensemble-' + str(round(time.time())) + '.csv',
              header=False, index=False)
print(len(result['id'].unique()), result.shape[0])
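# accum_result / average_result are assumed to implement plain score averaging
# across checkpoints: per-sample accumulation of each opinion's confidence,
# divided by the model count. A minimal sketch under that assumption (the
# repo's actual data layout may differ):
def accum_result_sketch(acc, new):
    # new: one dict per sample mapping an opinion tuple to its confidence
    if acc is None:
        return [dict(sample) for sample in new]
    for sample_acc, sample_new in zip(acc, new):
        for opinion, score in sample_new.items():
            sample_acc[opinion] = sample_acc.get(opinion, 0.0) + score
    return acc

def average_result_sketch(acc, num_models):
    return [{opinion: score / num_models for opinion, score in sample.items()}
            for sample in acc]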
tokenizer = BertTokenizer.from_pretrained(model_config['path'], do_lower_case=True)
test_dataset = ReviewDataset(args.rv, args.lb, tokenizer, 'laptop')
test_loader = DataLoader(test_dataset, args.bs, collate_fn=test_dataset.batchify,
                         shuffle=False, num_workers=5)
# Cache raw texts (and gold labels, when provided) once across models.
if not raw:
    raw = [s[0][0] for s in test_dataset.samples]
if not lb and args.lb:
    lb = [s[0][1] for s in test_dataset.samples]

model = OpinioNet.from_pretrained(model_config['path'],
                                  version=model_config['version'],
                                  focal=model_config['focal'])
print(weight_name)
model.load_state_dict(torch.load('../models/' + weight_name))
model.cuda()
ret = accum_result(ret, eval_epoch(model, test_loader, thresh))
del model

ret = average_result(ret, num_model)
ret = OpinioNet.nms_filter(ret, 0.28)

if args.lb:
    def f1_score(P, G, S):
        # P: predicted opinions, G: gold opinions, S: correct (shared) ones.
        pr = S / P
        rc = S / G
        f1 = 2 * pr * rc / (pr + rc)
        return f1
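    # How the three counts would be obtained (a sketch; `pred` is assumed to be
    # the per-review opinion tuples recovered from `ret` after nms_filter, and
    # `lb` the gold tuples cached above, both hashable):
    P = sum(len(p) for p in pred)                            # total predicted opinions
    G = sum(len(g) for g in lb)                              # total gold opinions
    S = sum(len(set(p) & set(g)) for p, g in zip(pred, lb))  # correct predictions
    print(f1_score(P, G, S))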
            type='laptop', folds=FOLDS))
    for model_name, model_config in PRETRAINED_MODELS.items()
])

# Per-fold ensemble evaluation: average every available model's predictions
# on each fold's validation split.
PRED = []
for cv_idx in range(FOLDS):
    cv_model_num = 0
    cvret = None
    for model_name, model_config in PRETRAINED_MODELS.items():
        tokenizer = tokenizers[model_name]
        _, val_loader = cv_loaders[model_name][cv_idx]
        try:
            model = OpinioNet.from_pretrained(model_config['path'],
                                              version=model_config['version'],
                                              focal=model_config['focal'])
            weight_name = model_config['name'] + '_cv' + str(cv_idx)
            weight = torch.load('../models/' + weight_name)
        except FileNotFoundError:
            # Skip models whose fold checkpoint was never trained.
            continue
        print(weight_name)
        model.load_state_dict(weight)
        model.cuda()
        try:
            thresh = thresh_dict[weight_name]['thresh']
        except (KeyError, TypeError):
            # No tuned threshold recorded for this checkpoint; fall back.
            thresh = 0.5
        cvret = accum_result(cvret, eval_epoch(model, val_loader, thresh))
        cv_model_num += 1
        del model
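# OpinioNet.nms_filter is assumed to keep, per sample, only opinions whose
# averaged confidence clears the threshold, dropping lower-scored exact
# duplicates. A minimal sketch under that assumption:
def nms_filter_sketch(results, thresh):
    filtered = []
    for sample in results:  # sample: iterable of (opinion, confidence) pairs
        kept, seen = [], set()
        for opinion, score in sorted(sample, key=lambda x: -x[1]):
            if score >= thresh and opinion not in seen:
                kept.append((opinion, score))
                seen.add(opinion)
        filtered.append(kept)
    return filtered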