def valid_model(model_name, ckpt_path):
    # Build the model and restore weights from the checkpoint (same pattern as save_feature_batch).
    model = create_model(model_name.split('-')[-2], num_classes=3, checkpoint_path=ckpt_path)
    model.cuda().eval()

    img_size = int(model_name.split('-')[-1])
    batch_size = 64

    loader = torch.utils.data.DataLoader(
        Dataset(root=os.path.join(BASE, "valid"), transform=get_test_transform(img_size)),
        batch_size=batch_size,
        num_workers=8,
        drop_last=False,
    )
    print('..... Finished loading model! ......')

    class_2_index = {0: 'normal', 1: 'phone', 2: 'smoke'}
    with open("./txts/v-info-new.json", 'r', encoding="utf-8") as f:
        shape_dict = json.load(f)
    # The feature dimension has to be tuned per model; I no longer remember which one was used here.

    labels = []
    total_pred_idx = []
    dets_info = {}
    with torch.no_grad():
        for batch_idx, (input, target) in enumerate(tqdm(loader)):
            output = model(input.cuda())
            prob = torch.max(torch.softmax(output, -1), -1)[0]
            idx = torch.max(torch.softmax(output, -1), -1)[1]
            pred_idx = idx.cpu().numpy()
            total_pred_idx.extend(pred_idx)
            for j in range(len(pred_idx)):
                filename = loader.dataset.filenames()[batch_idx * batch_size + j]
                name = filename.split('/')[-1].split('.')[0]
                dets_info[name] = [class_2_index[pred_idx[j]], float(prob[j]),
                                   shape_dict[name][1], shape_dict[name][2]]
            labels.extend(target.cpu().numpy())

    with open("%s/v.json" % feature_path, "w", encoding="utf-8") as f:
        json.dump(dets_info, f)
    prec = accuracy_score(labels, total_pred_idx)
    print("%.4f" % prec)

def train():
    # 1. Load the datasets.
    print("start loading data_set")
    train_dataset: Dataset = Dataset.load("../data/essay_data/train.pickle")
    dev_dataset: Dataset = Dataset.load("../data/essay_data/dev.pickle")
    test_dataset: Dataset = Dataset.load("../data/essay_data/test.pickle")
    print("end loading data_set")

    # 2. Compute features and train one model per essay set.
    essay_set_num = len(train_dataset.data)
    for set_id in range(1, essay_set_num + 1):
        train_data = train_dataset[str(set_id)]
        dev_data = dev_dataset[str(set_id)]
        test_data = test_dataset[str(set_id)]
        train_every_set(train_data, dev_data, test_data, set_id)

def save_feature_batch(model_name, ckpt_path, feature_path, label_path, data_type="valid"):
    model = create_model(model_name.split('-')[-2], num_classes=3, checkpoint_path=ckpt_path)
    model.cuda().eval()
    print('..... Finished loading model! ......')

    img_size = int(model_name.split('-')[-1])
    interpolation = "bicubic"
    batch_size = 128

    dataset = Dataset(os.path.join(BASE, data_type))
    loader = create_loader(
        dataset,
        input_size=img_size,
        batch_size=batch_size,
        use_prefetcher=False,
        interpolation=interpolation,
        num_workers=8)

    features = []
    labels = []
    with torch.no_grad():
        for batch_idx, (input, target) in enumerate(tqdm(loader)):
            # Global-average-pool the backbone feature map into one vector per image.
            out = model.forward_features(input.cuda())
            out2 = nn.AdaptiveAvgPool2d(1)(out)
            feature = out2.view(out.size(0), -1)
            features.append(feature.cpu().numpy())
            labels.extend(target.cpu().numpy())

    features = np.vstack(features)
    pickle.dump(features, open(feature_path, 'wb'))
    pickle.dump(labels, open(label_path, 'wb'))
    print('CNN features obtained and saved.')

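# A minimal sketch (not in the original repository) of how the pickled CNN
# features saved above could be used to fit the classifier that
# classifier_pred() and classifier_test() later load with joblib. The SVC
# kernel and settings are assumptions, not taken from the original code.
import pickle

import joblib
from sklearn.svm import SVC


def fit_feature_classifier(feature_path, label_path, model_path):
    with open(feature_path, 'rb') as f:
        features = pickle.load(f)
    with open(label_path, 'rb') as f:
        labels = pickle.load(f)
    # probability=True is required because classifier_pred() calls predict_proba().
    clf = SVC(kernel='rbf', probability=True)
    clf.fit(features, labels)
    joblib.dump(clf, model_path)
    return clf
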
def classifier_pred(classifier, shape_path, feature, id, model_name, data_type="valid"):
    class_2_index = {0: 'normal', 1: 'phone', 2: 'smoke'}
    dets_info = {}

    features = pickle.load(open(feature, 'rb'))
    ids = pickle.load(open(id, 'rb'))

    predict = classifier.predict(features)
    # predicted_test_scores = classifier.decision_function(features)
    # probs = softmax(predicted_test_scores)
    # prob_list = [prob[int(predict[i])] for i, prob in enumerate(probs)]
    probs = classifier.predict_proba(features)
    prob_list = [round(prob[int(predict[i])], 4) for i, prob in enumerate(probs)]

    prediction = predict.tolist()
    total_pred_idx = [int(pred) for pred in prediction]
    total_true_idx = [int(label) for label in ids]

    with open("./txts/%s.json" % shape_path, 'r', encoding="utf-8") as f:
        shape_dict = json.load(f)

    dataset = Dataset(os.path.join(BASE, data_type))
    filenames = dataset.filenames()
    for i, filename in enumerate(filenames):
        name = filename.split('/')[-1].split('.')[0]
        dets_info[name] = [class_2_index[int(prediction[i])], prob_list[i],
                           shape_dict[name][1], shape_dict[name][2]]

    with open("%s/%s.json" % (feature_path, shape_path.split('-')[0]), "w", encoding="utf-8") as f:
        json.dump(dets_info, f)

    accuracy = round(accuracy_score(total_true_idx, total_pred_idx), 4)
    test_map, ap_list = eval_map(detFolder="%s/v.json" % feature_path,
                                 gtFolder="txts/v-info-new.json",
                                 return_each_ap=True)
    print("Accuracy: %s, map: %.4f" % (accuracy, test_map))

    with open("weights/%s-valid.json" % model_name, 'w', encoding="utf-8") as f:
        prob_dict = {}
        prob_dict["prob"] = probs
        prob_dict["model_weight"] = test_map
        prob_dict["label_weight"] = ap_list
        json.dump(prob_dict, f, cls=MyEncoder)

    return accuracy, round(test_map, 4)

def get_train_feature(self, train_data_set: Dataset):
    set_id = 1
    data, score_list = train_data_set.get_data_list(set_id)

    wv_similarity = word_vector_similarity_train(data, score_list)
    print(wv_similarity)

    pos_bigram = pos_bigram_train(data)
    print(pos_bigram)

    # TODO: concatenate the individual feature blocks (see the sketch below).
    return

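# A hedged sketch (not part of the original code) of the concatenation step
# left as TODO above: stack the per-essay feature blocks column-wise into one
# design matrix. It assumes each block has one row (or one value) per essay;
# the helper name is illustrative.
import numpy as np


def concat_features(*feature_blocks):
    # Reshape every block to (n_essays, k), then join them side by side.
    blocks = [np.asarray(block).reshape(len(block), -1) for block in feature_blocks]
    return np.hstack(blocks)

# e.g. train_feature = concat_features(wv_similarity, pos_bigram)
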
def test():
    train_dataset: Dataset = Dataset.load("../../data/train.pickle")
    train_data = train_dataset.data['1']
    # acquire_score=False returns only sentences and tokens (see train() below).
    essay_data, token_data = Dataset.get_data_list(train_data, acquire_score=False)

    result1, result2 = word_length(essay_data)
    result3, result4, result5 = word_bigram_train(token_data)
    result6, result7, result8 = pos_bigram_train(token_data)

    for result in (result1, result2, result3, result4, result6, result7):
        print(result)
        print(result.shape)

def classifier_test(model_path, feature, data_type="test"):
    class_2_index = {0: 'normal', 1: 'calling', 2: 'smoking'}

    features = pickle.load(open(feature, 'rb'))
    classifier = joblib.load(model_path)

    predict = classifier.predict(features)
    probs = classifier.predict_proba(features)
    prob_list = [round(prob[int(predict[i])], 4) for i, prob in enumerate(probs)]
    prediction = predict.tolist()

    result_list = []
    clas_name = model_path.split('/')[-1].split('-')[0]
    dataset = Dataset(os.path.join(BASE, data_type))
    filenames = dataset.filenames()
    print(clas_name)

    with open('./infer/result-%s.json' % clas_name, 'w', encoding="utf-8") as out_file:
        for i in range(len(filenames)):
            filename = filenames[i].split('/')[-1].strip()
            name = class_2_index[int(prediction[i])]
            result_data = {"image_name": str(filename), "category": name, "score": prob_list[i]}
            result_list.append(result_data)
        json.dump(result_list, out_file, cls=MyEncoder, indent=4)

def validate(args):
    # might as well try to validate something
    args.prefetcher = not args.no_prefetcher

    # create model
    model = create_model(
        args.model,
        num_classes=args.num_classes,
        in_chans=3,
        global_pool=args.gp)
    if args.checkpoint:
        load_checkpoint(model, args.checkpoint)

    param_count = sum([m.numel() for m in model.parameters()])
    _logger.info('Model %s created, param count: %d' % (args.model, param_count))

    data_config = resolve_data_config(vars(args), model=model)
    # model, test_time_pool = apply_test_time_pool(model, data_config, args)

    if torch.cuda.is_available():
        model.cuda()
    criterion = nn.CrossEntropyLoss().cuda()

    dataset = Dataset(args.data)
    crop_pct = data_config['crop_pct']
    loader = create_loader(
        dataset,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        crop_pct=crop_pct)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    f1_m = AverageMeter()
    end = time.time()

    total_pred_idx = []
    total_truth_idx = []
    mistake_image = []
    mistake_image_dict = {'calling': [], 'normal': [], 'smoking': [], 'smoking_calling': []}
    # class_2_index = {0: 'normal', 1: 'phone', 2: 'smoke'}
    class_2_index = {0: 'calling', 1: 'normal', 2: 'smoking', 3: 'smoking_calling'}

    with open("./txts/%s.json" % json_name, 'r', encoding="utf-8") as f:
        shape_dict = json.load(f)
    dets_info = {}

    model.eval()
    with torch.no_grad():
        # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
        input = torch.randn((args.batch_size,) + data_config['input_size'])
        if torch.cuda.is_available():
            input = input.cuda()
        model(input)
        end = time.time()

        for batch_idx, (input, target) in enumerate(loader):
            if args.no_prefetcher and torch.cuda.is_available():
                target = target.cuda()
                input = input.cuda()

            # compute output
            output = model(input)

            # get prediction index and ground truth index
            prob = torch.max(F.softmax(output, -1), -1)[0]
            idx = torch.max(F.softmax(output, -1), -1)[1]
            target_idx = target.cpu().numpy()
            predict_idx = idx.cpu().numpy()

            for j in range(len(target_idx)):
                total_truth_idx.append(target_idx[j])
                total_pred_idx.append(predict_idx[j])
                class_dict = loader.dataset.class_to_idx
                target_class = list(class_dict.keys())[list(class_dict.values()).index(int(target_idx[j]))]
                pred_class = list(class_dict.keys())[list(class_dict.values()).index(int(predict_idx[j]))]
                filename = loader.dataset.filenames()[batch_idx * args.batch_size + j]
                name = filename.split('/')[-1].split('.')[0]
                dets_info[name] = [pred_class, float(prob[j]), shape_dict[name][1], shape_dict[name][2]]
                if target_idx[j] != predict_idx[j]:
                    mistake_image.append(
                        [loader.dataset.filenames()[batch_idx * args.batch_size + j],
                         target_class, pred_class, np.round(prob[j].cpu().numpy(), 4)])
                    mistake_image_dict[class_2_index[predict_idx[j]]].append(
                        loader.dataset.filenames()[batch_idx * args.batch_size + j])

            loss = criterion(output, target)

            # measure accuracy and record loss; with only 4 classes, "top5" here is actually top-3
            prec1, prec5 = accuracy(output.data, target, topk=(1, 3))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % args.log_freq == 0:
                _logger.info(
                    'Test: [{0:>4d}/{1}] '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
                    'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
                    'Acc@1: {top1.val:>7.2f} ({top1.avg:>7.2f}) '
                    'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format(
                        batch_idx, len(loader), batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                        loss=losses, top1=top1, top5=top5))

    with open("%s/%s.json" % (output_path, json_name.split('-')[0]), "w", encoding="utf-8") as f:
        json.dump(dets_info, f)

    top1a, top5a = top1.avg, top5.avg
    results = OrderedDict(
        top1=round(top1a, 4), top1_err=round(100 - top1a, 4),
        top5=round(top5a, 4), top5_err=round(100 - top5a, 4),
        param_count=round(param_count / 1e6, 2),
        img_size=data_config['input_size'][-1],
        crop_pct=crop_pct,
        interpolation=data_config['interpolation'],
        mistake_image_dict=mistake_image_dict,
        pred_idx=total_pred_idx,
        truth_idx=total_truth_idx)

    _logger.info(' * Acc@1 {:.2f} ({:.2f}) Acc@5 {:.2f} ({:.2f})'.format(
        results['top1'], results['top1_err'], results['top5'], results['top5_err']))

    test_map, each_ap = eval_map(detFolder="%s/%s.json" % (output_path, json_name.split('-')[0]),
                                 gtFolder="txts/%s.json" % json_name,
                                 return_each_ap=True)
    _logger.info('Valid mAP: {}, each ap: {}'.format(round(test_map, 4), each_ap))

    return results

import sys

sys.path.append("../..")

from src.feature.iku import spell_error, Mean_sentence_depth
from src.data import Dataset

test_dataset = Dataset.load("../../data/test.pickle")
# print(test_dataset.data)
for data in test_dataset.data['1']:
    Mean_sentence_depth(data)

def main():
    setup_default_logging()
    args, args_text = _parse_args()
    args.prefetcher = not args.no_prefetcher
    torch.manual_seed(args.seed)

    model = create_model(args.model,
                         pretrained=True,
                         num_classes=args.num_classes,
                         drop_rate=args.drop,
                         drop_path_rate=args.drop_path,
                         drop_block_rate=args.drop_block,
                         checkpoint_path=args.initial_checkpoint)
    if args.local_rank == 0:
        _logger.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(vars(args), model=model, verbose=args.local_rank == 0)

    if args.num_gpu > 1:
        model = nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)
    loss_scaler = None

    # optionally resume from a checkpoint
    resume_epoch = None
    if args.resume:
        resume_epoch = resume_checkpoint(
            model, args.resume,
            optimizer=None if args.no_resume_opt else optimizer,
            loss_scaler=None if args.no_resume_opt else loss_scaler,
            log_info=args.local_rank == 0)

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        _logger.info('Scheduled epochs: {}'.format(num_epochs))

    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        _logger.error('Training folder does not exist at: {}'.format(train_dir))
        exit(1)
    dataset_train = Dataset(train_dir)

    collate_fn = None
    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_args = dict(mixup_alpha=args.mixup,
                          cutmix_alpha=args.cutmix,
                          cutmix_minmax=args.cutmix_minmax,
                          prob=args.mixup_prob,
                          switch_prob=args.mixup_switch_prob,
                          elementwise=args.mixup_elem,
                          label_smoothing=args.smoothing,
                          num_classes=args.num_classes)
        if args.prefetcher:
            collate_fn = FastCollateMixup(**mixup_args)
        else:
            mixup_fn = Mixup(**mixup_args)

    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        re_prob=args.reprob,
        re_mode=args.remode,
        re_count=args.recount,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        num_workers=args.workers,
        collate_fn=collate_fn,
    )

    eval_dir = os.path.join(args.data, 'valid')
    if not os.path.isdir(eval_dir):
        eval_dir = os.path.join(args.data, 'validation')
        if not os.path.isdir(eval_dir):
            _logger.error('Validation folder does not exist at: {}'.format(eval_dir))
            exit(1)
    dataset_eval = Dataset(eval_dir)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        num_workers=args.workers,
        crop_pct=data_config['crop_pct'],
    )

    # Plain CE is always defined because train_epoch() receives [train_loss_fn, train_loss_ce].
    train_loss_ce = nn.CrossEntropyLoss().cuda()
    if mixup_active:
        # smoothing is handled with mixup target transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
    validate_loss_fn = nn.CrossEntropyLoss().cuda()

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    plateau_num = 0
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"),
            args.model,
            str(data_config['input_size'][-1])
        ])
        output_dir = get_outdir(output_base, exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(model=model,
                                optimizer=optimizer,
                                args=args,
                                amp_scaler=loss_scaler,
                                checkpoint_dir=output_dir,
                                recovery_dir=output_dir,
                                decreasing=decreasing,
                                max_history=2)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    with open("./txts/%s.json" % json_name, 'r', encoding="utf-8") as f:
        shape_dict = json.load(f)

    try:
        for epoch in range(start_epoch, num_epochs):
            train_metrics = train_epoch(epoch, model, loader_train, optimizer,
                                        [train_loss_fn, train_loss_ce], args,
                                        lr_scheduler=lr_scheduler,
                                        output_dir=output_dir,
                                        mixup_fn=mixup_fn)

            eval_metrics, dets_info = validate(model, loader_eval, validate_loss_fn, args,
                                               shape_dict=shape_dict)
            with open("%s/v.json" % output_dir, "w", encoding="utf-8") as f:
                json.dump(dets_info, f)
            val_map = round(
                eval_map(detFolder="%s/v.json" % output_dir,
                         gtFolder="txts/%s.json" % json_name), 4)
            eval_metrics["map"] = val_map
            _logger.info('Valid mAP: {}'.format(val_map))

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch, train_metrics, eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                saver.save_prefix = "%.2f-%s" % (eval_metrics["top1"], val_map)
                best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric)
                if eval_metrics[eval_metric] >= best_metric:
                    plateau_num = 0
                else:
                    plateau_num += 1
                # Early stop: the metric has not improved for 30 consecutive epochs.
                if plateau_num == 30:
                    break
    except KeyboardInterrupt:
        pass

    if best_metric is not None:
        _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))

import sys

sys.path.append("../..")

from src.feature.iku import spell_error, Mean_sentence_depth_level, semantic_vector_similarity, essay_length
from src.data import Dataset
from gensim import corpora, models
from gensim.similarities import MatrixSimilarity

# test_dataset = Dataset.load("../../data/test.pickle")
train_dataset = Dataset.load("../../data/train.pickle")
Mean_sentence_depth_level(train_dataset.data['1'])

# train_dataset = Dataset()
# train_dataset.load_from_raw_file("../../data/train.tsv", ['essay_set', 'essay_id', 'essay', 'domain1_score'])
# Dataset.save(train_dataset, '../../data/train.pickle')

# dev_dataset = Dataset()
# dev_dataset.load_from_raw_file("../../data/dev.tsv", ['essay_set', 'essay_id', 'essay', 'domain1_score'])
# Dataset.save(dev_dataset, '../../data/dev.pickle')

# test_dataset = Dataset()
# test_dataset.load_from_raw_file("../../data/test.tsv", ['essay_set', 'essay_id', 'essay'])
# Dataset.save(test_dataset, '../../data/test.pickle')

# print(test_dataset.data)
# spell_error(train_dataset.data['3'])
# semantic_vector_similarity(train_dataset.data['3'], train_dataset.data['3'])
# essay_length(train_dataset.data['1'])
# for data in test_dataset.data['1']:
#     print(type)
#     # Mean_sentence_depth(data)

def main():
    setup_default_logging()
    args = parser.parse_args()
    # might as well try to do something useful...
    args.pretrained = args.pretrained or not args.checkpoint

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    args.checkpoint = glob.glob(args.checkpoint + '/*.pth')[0]

    # create model
    model = create_model(args.model,
                         num_classes=args.num_classes,
                         in_chans=3,
                         pretrained=args.pretrained)
    load_checkpoint(model, args.checkpoint)
    logging.info('Model %s created, param count: %d' %
                 (args.model, sum([m.numel() for m in model.parameters()])))

    args.img_size = int(args.checkpoint.split('/')[-2].split('-')[-1])
    config = resolve_data_config(vars(args), model=model)
    # model, test_time_pool = apply_test_time_pool(model, config, args)

    if torch.cuda.is_available():
        model = model.cuda()

    loader = create_loader(Dataset(args.data),
                           input_size=config['input_size'],
                           batch_size=args.batch_size,
                           use_prefetcher=False,
                           interpolation=config['interpolation'],
                           mean=config['mean'],
                           std=config['std'],
                           num_workers=args.workers,
                           crop_pct=config['crop_pct'])

    model.eval()

    batch_time = AverageMeter()
    end = time.time()
    topk_ids = []
    scores = []
    total_pred_idx = []
    total_truth_idx = []
    with torch.no_grad():
        for batch_idx, (input, target) in enumerate(loader):
            if torch.cuda.is_available():
                input = input.cuda()
            output = model(input)

            prob = torch.max(F.softmax(output, -1), -1)[0]
            idx = torch.max(F.softmax(output, -1), -1)[1]
            total_pred_idx.extend(idx.cpu().numpy())
            total_truth_idx.extend(target.cpu().numpy())
            scores.extend(prob.cpu().numpy())
            topk_ids.extend(idx.cpu().numpy())

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % args.log_freq == 0:
                logging.info(
                    'Predict: [{0}/{1}] Time {batch_time.val:.3f} ({batch_time.avg:.3f})'
                    .format(batch_idx, len(loader), batch_time=batch_time))

    # result_file_path = os.path.join('./results', model_name)
    # if not os.path.exists(result_file_path):
    #     os.makedirs(result_file_path)
    # res_cf = open('%s/results-all.csv' % result_file_path, mode='w')
    # for i in range(len(total_pred_idx)):
    #     res_cf.write('{0},'.format(str(total_pred_idx[i])))
    # res_cf.write('\n')
    # for i in range(len(total_truth_idx)):
    #     res_cf.write('{0},'.format(str(total_truth_idx[i])))

    # dst_root = './infer/%s' % args.checkpoint.split('/')[-2]
    # if not os.path.exists(dst_root):
    #     os.makedirs(dst_root)
    # else:
    #     shutil.rmtree(dst_root)

    result_list = []
    # class_2_index = {0: 'normal', 1: 'calling', 2: 'smoking'}
    class_2_index = {0: 'calling', 1: 'normal', 2: 'smoking', 3: 'smoking_calling'}
    with open(os.path.join(args.output_dir, 'result.json'), 'w', encoding="utf-8") as out_file:
        filenames = loader.dataset.filenames()
        for i in range(len(scores)):
            filename = filenames[i].split('/')[-1]
            name = class_2_index[topk_ids[i]]
            result_data = {"image_name": str(filename), "category": name, "score": scores[i]}
            result_list.append(result_data)
            # if scores[i] > 0.95:
            #     dst_path = os.path.join(dst_root, name)
            #     if not os.path.exists(dst_path):
            #         os.makedirs(dst_path)
            #     shutil.copy(filenames[i], os.path.join(dst_path, filename))
        json.dump(result_list, out_file, cls=MyEncoder, indent=4)

def data(args):
    bpath = os.path.join('./data', args.dataset)

    with open(os.path.join(bpath, 'entity2id.txt'), 'r') as f:
        e_ix = dict(map(lambda x: x.split()[::-1], f.read().split('\n')[1:-1]))

    # Group entity ids by the top-level segment of the entity name (its type prefix).
    tp_ix, tp_rix = dict(), dict()
    tp_rx = re.compile(r'^/(\w+)/.*$')
    for i, e in e_ix.items():
        tp = re.findall(tp_rx, e)[0]
        if tp not in tp_ix:
            tp_ix[tp] = list()
        tp_ix[tp].append(int(i))
        tp_rix[int(i)] = tp

    with open(os.path.join(bpath, 'entity2id.txt'), 'r') as f:
        e_ix_ln = int(f.readline().strip())
    with open(os.path.join(bpath, 'relation2id.txt'), 'r') as f:
        r_ix_ln = int(f.readline().strip())

    tr_ds = Dataset(args, os.path.join(bpath, 'train2id.txt'), e_ix_ln, tp_ix, tp_rix, 1)
    vd_ds = Dataset(args, os.path.join(bpath, 'valid2id.txt'), e_ix_ln, tp_ix, tp_rix, 2)
    ts_ds = Dataset(args, os.path.join(bpath, 'test2id.txt'), e_ix_ln, tp_ix, tp_rix, 3)

    if args.model.startswith('DE'):
        t_ix = FakeTimeIndex()
    else:
        al_t = np.concatenate([tr_ds, vd_ds, ts_ds], axis=1)[0, :, 3:].flatten()
        t_ix = {e: i for i, e in enumerate(np.unique(al_t))}
    t_ix_ln = len(t_ix)

    tr_ds.transform(t_ix, qs_bs={})
    vd_ds.transform(t_ix, qs_bs=tr_ds._qs)
    ts_ds.transform(t_ix, qs=False)

    tr_smp = DistributedSampler(tr_ds, num_replicas=_size(args), rank=_rank(args))
    vd_smp = DistributedSampler(vd_ds, num_replicas=_size(args), rank=_rank(args), shuffle=False)
    ts_smp = DistributedSampler(ts_ds, num_replicas=_size(args), rank=_rank(args), shuffle=False)

    tr_dl = DataLoader(tr_ds, batch_size=args.batch_size, sampler=tr_smp,
                       num_workers=args.workers, pin_memory=not args.tpu, drop_last=args.tpu)
    vd_dl = DataLoader(vd_ds, batch_size=args.test_batch_size, sampler=vd_smp,
                       num_workers=args.workers, pin_memory=not args.tpu, drop_last=args.tpu)
    ts_dl = DataLoader(ts_ds, batch_size=args.test_batch_size, sampler=ts_smp,
                       num_workers=args.workers, pin_memory=not args.tpu, drop_last=args.tpu)

    return tr_dl, vd_dl, ts_dl, e_ix_ln, r_ix_ln, t_ix_ln, tp_ix, tp_rix

def train(contain_test=False, use_save=False, model_name='SVR'):
    """Train the scoring models."""
    # 1. Load the datasets.
    print("start loading data_set")
    train_dataset: Dataset = Dataset.load(TRAIN_DADA_PATH)
    dev_dataset: Dataset = Dataset.load(DEV_DATA_PATH)
    test_dataset: Dataset = Dataset.load(TEST_DATA_PATH)
    print("end loading data_set")

    # 2. Compute the features.
    essay_set_num = len(train_dataset.data)
    print(essay_set_num)
    mean_qwk = 0
    all_test_sample = []
    qwk_score_list = []
    use_dev = ''
    for set_id in range(1, essay_set_num + 1):
        train_data = train_dataset.data[str(set_id)]
        dev_data = dev_dataset.data[str(set_id)]
        test_data = test_dataset.data[str(set_id)]

        train_feature_dict = train_dataset.load_feature(set_id, 'train')
        feature_class = Feature.get_instance(train_feature_dict)

        # Merge the dev split into the training data.
        new_train_data = copy.deepcopy(train_data)
        new_train_data.extend(dev_data)
        train_data = new_train_data

        train_sentences_list, train_tokens_list, train_scores = Dataset.get_data_list(
            train_data, acquire_score=True)
        print("start compute the feature for essay set %s, train_set_len = %s"
              % (set_id, len(train_sentences_list)))
        st = time.time()
        # TODO: fill in whichever features need to be recomputed.
        reset_list = []
        train_feature, train_feature_dict = feature_class.get_saved_feature_all(
            train_feature_dict, train_sentences_list, train_tokens_list,
            train_data, train_scores, 'train', feature_list)
        train_dataset.save_feature(set_id, train_feature_dict, 'train')
        et = time.time()
        print("end compute the feature for essay set, ", set_id, "time = ", et - st)

        # 3. Build and fit the model.
        use_dev = 'No'  # set manually
        clf = model(model_name, train_feature, train_scores, set_id)

        # 4. Evaluate on the dev set.
        dev_sentences_list, dev_tokens_list, dev_scores = Dataset.get_data_list(
            dev_data, acquire_score=True)
        dev_feature_dict = train_dataset.load_feature(set_id, 'dev')
        dev_feature, dev_feature_dict = feature_class.get_saved_feature_all(
            dev_feature_dict, dev_sentences_list, dev_tokens_list,
            dev_data, train_scores, 'dev', reset_list)
        train_dataset.save_feature(set_id, dev_feature_dict, 'dev')
        print('dev ends')

        predicted = clf.predict(dev_feature)
        qwk = kappa(dev_scores, predicted, weights='quadratic')
        print(set_id, qwk)
        qwk_score_list.append(qwk)
        mean_qwk += qwk

        # Predict integer scores for the test set.
        test_sentences_list, test_tokens_list = Dataset.get_data_list(
            test_data, acquire_score=False)
        test_feature_dict = train_dataset.load_feature(set_id, 'test')
        test_feature, test_feature_dict = feature_class.get_saved_feature_all(
            test_feature_dict, test_sentences_list, test_tokens_list,
            test_data, train_scores, 'test', reset_list)
        train_dataset.save_feature(set_id, test_feature_dict, 'test')

        test_predicted = clf.predict(test_feature)
        for idx, sample in enumerate(test_data):
            # sample['domain1_score'] = int(test_predicted[idx])
            sample['domain1_score'] = int(np.round(float(test_predicted[idx])))
        all_test_sample.extend(test_data)

    save_to_tsv(all_test_sample, '../MG1933004.tsv')
    mean_qwk = mean_qwk / essay_set_num
    print(mean_qwk)
    save_info_to_file(feature_list, use_dev, qwk_score_list, mean_qwk)
    # Persist the cached features; only the whole Dataset object can be saved.
    train_dataset.save(train_dataset, TRAIN_DADA_PATH)

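# Optional sanity check (not in the original code) for the quadratic weighted
# kappa (QWK) metric used above: scikit-learn's cohen_kappa_score with
# weights='quadratic' should match `kappa(..., weights='quadratic')` under the
# usual QWK definition. The rounding mirrors how test predictions are cast to
# integer scores in train().
import numpy as np
from sklearn.metrics import cohen_kappa_score


def qwk(y_true, y_pred):
    # Round regression outputs to the nearest integer score before scoring.
    y_pred = np.rint(np.asarray(y_pred, dtype=float)).astype(int)
    return cohen_kappa_score(np.asarray(y_true).astype(int), y_pred, weights='quadratic')


# Perfect agreement gives 1.0; disagreements are penalised by squared score distance.
print(qwk([2, 3, 4, 4], [2, 3, 4, 4]))  # 1.0
print(qwk([2, 3, 4, 4], [2, 3, 3, 4]))  # < 1.0
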
def test_20_newsgroups(self):
    ds = Dataset.load('20_newsgroups')
    assert len(ds.data) == 18846
    assert len(ds.target) == 18846