join("../ganData/facades_fixed", "test")) train_data_loader = DataLoader(dataset=train_set, num_workers=2, batch_size=BATCH_SIZE, shuffle=True) test_data_loader = DataLoader(dataset=test_set, num_workers=2, batch_size=BATCH_SIZE, shuffle=True) test_input, test_target = test_data_loader.__iter__().__next__() real_a = torch.FloatTensor(BATCH_SIZE, IMAGE_CHANNEL, IMAGE_SIZE, IMAGE_SIZE) real_b = torch.FloatTensor(BATCH_SIZE, OUTPUT_CHANNEL, IMAGE_SIZE, IMAGE_SIZE) if GPU_NUMS > 1: Net_G = Net_G.cuda() Net_D = Net_D.cuda() lossGAN = lossGAN.cuda() lossL1 = lossL1.cuda() lossMSE = lossMSE.cuda() real_a = Variable(real_a.cuda() if GPU_NUMS > 1 else real_a) real_b = Variable(real_b.cuda() if GPU_NUMS > 1 else real_b) bar = ProgressBar(EPOCHS, len(train_data_loader), "D loss:%.3f;G loss:%.3f") for epoch in range(EPOCHS): for iteration, batch in enumerate(train_data_loader, 1): real_a_cpu, real_b_cpu = batch[0], batch[1] real_a.data.resize_(real_a_cpu.size()).copy_(real_a_cpu) real_b.data.resize_(real_b_cpu.size()).copy_(real_b_cpu) fake_b = Net_G(real_a)
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    """Train the multi-task CNN-LSTM (tool presence + phase) with an extra
    KL consistency term through a learned 7x7 tool->phase linear map.

    Args:
        train_dataset / val_dataset: indexable datasets yielding
            (inputs, labels_1, labels_2) triples; labels_1 is the 7-way
            multi-label tool target, labels_2 the phase class index.
        train_num_each / val_num_each: per-video frame counts, consumed by
            get_useful_start_idx to find valid sequence start positions.

    Side effects: prints progress, saves the best model weights (.pth), a
    per-epoch metrics array (.npy) and the learned tool->phase matrix (.npy).
    Relies on module-level config globals (sequence_length, num_gpu,
    train_batch_size, workers, use_gpu, multi_optim, optimizer_choice,
    learning_rate, momentum, dampening, weight_decay, use_nesterov,
    sgd_adjust_lr, sgd_step, sgd_gamma, epochs, crop_type).
    """
    num_train = len(train_dataset)
    num_val = len(val_dataset)
    # Valid sequence start indices (a sequence must not cross video borders).
    train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)
    # Truncate so the number of sequences divides evenly across GPUs.
    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 800
    # num_val_we_use = 800
    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]
    # Expand each start index into sequence_length consecutive frame indices.
    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)
    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)
    num_train_all = len(train_idx)
    num_val_all = len(val_idx)
    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))
    # The flat index lists are passed as `sampler`, so iteration order is the
    # precomputed sequence order (frames of one sequence stay adjacent).
    train_loader = DataLoader(train_dataset,
                              batch_size=train_batch_size,
                              sampler=train_idx,
                              num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=val_idx,
                            num_workers=workers,
                            pin_memory=False)
    model = multi_lstm()
    model = DataParallel(model)
    # Warm-start from a previously trained cnn_lstm checkpoint.
    model.load_state_dict(
        torch.load(
            'cnn_lstm_epoch_25_length_4_opt_1_mulopt_1_flip_0_crop_1_batch_200_train1_9998_train2_9987_val1_9731_val2_8752.pth'
        ))
    # Learnable 7x7 tool->phase mapping, initialized from a precomputed matrix.
    kl_fc_t2p = nn.Linear(7, 7)
    all_tool_to_phase = np.load('kl_fc_t2p.npy')
    kl_fc_t2p.weight.data = torch.from_numpy(
        all_tool_to_phase.astype('float32'))
    for param in kl_fc_t2p.parameters():
        param.requires_grad = True
    if use_gpu:
        model = model.cuda()
        kl_fc_t2p = kl_fc_t2p.cuda()
    # size_average=False -> summed losses; averages are computed manually below.
    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)  # tool (multi-label)
    criterion_2 = nn.CrossEntropyLoss(size_average=False)   # phase (single-label)
    criterion_3 = nn.KLDivLoss(size_average=False)          # tool->phase consistency
    softmax_cuda = nn.Softmax().cuda()
    sigmoid_cuda = nn.Sigmoid().cuda()
    if multi_optim == 0:
        # Single learning rate for everything.
        if optimizer_choice == 0:
            # NOTE(review): passing a plain list of two parameter generators
            # ([model.parameters(), kl_fc_t2p.parameters()]) is not the
            # documented torch.optim input format (iterable of Tensors or of
            # dicts) — verify this branch actually runs.
            optimizer = optim.SGD([model.parameters(), kl_fc_t2p.parameters()],
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(
                [model.parameters(), kl_fc_t2p.parameters()],
                lr=learning_rate)
    elif multi_optim == 1:
        # Per-module learning rates: the shared CNN backbone (and the t2p map)
        # train at lr/10, the LSTM and classifier heads at full lr.
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': kl_fc_t2p.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': kl_fc_t2p.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                   lr=learning_rate / 10)
    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy_1 = 0.0
    best_val_accuracy_2 = 0.0  # judge by accu2
    correspond_train_acc_1 = 0.0
    correspond_train_acc_2 = 0.0
    # (translated) record the 2 train accuracies, 2 valid accuracies and the
    # 3 train / 3 valid losses per epoch.  NOTE(review): the original comment
    # says "12 values in total" but record_np only has 10 columns.
    record_np = np.zeros([epochs, 10])
    for epoch in range(epochs):
        # np.random.seed(epoch)
        # Reshuffle sequence starts each epoch, then rebuild the flat frame
        # index list and the loader over it.
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)
        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=train_idx,
                                  num_workers=workers,
                                  pin_memory=False)
        model.train()
        train_loss_1 = 0.0
        train_loss_2 = 0.0
        train_loss_3 = 0.0
        train_corrects_1 = 0
        train_corrects_2 = 0
        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels_1 = Variable(labels_1.cuda())
                labels_2 = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels_1 = Variable(labels_1)
                labels_2 = Variable(labels_2)
            optimizer.zero_grad()
            outputs_1, outputs_2 = model.forward(inputs)
            # Detach the activations fed to the KL branch so loss_3 only
            # updates kl_fc_t2p, not the backbone.
            sig_output_1 = sigmoid_cuda(outputs_1)
            soft_output_2 = softmax_cuda(outputs_2)
            sig_output_1 = Variable(sig_output_1.data, requires_grad=False)
            soft_output_2 = Variable(soft_output_2.data, requires_grad=False)
            kl_output_1 = kl_fc_t2p(sig_output_1)
            # Multi-label tool prediction: threshold sigmoid at 0.5.
            preds_1 = torch.cuda.ByteTensor(sig_output_1.data > 0.5)
            preds_1 = preds_1.long()
            train_corrects_1 += torch.sum(preds_1 == labels_1.data)
            labels_1 = Variable(labels_1.data.float())  # BCE needs float targets
            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)
            _, preds_2 = torch.max(outputs_2.data, 1)
            train_corrects_2 += torch.sum(preds_2 == labels_2.data)
            # NOTE(review): KLDivLoss expects log-probabilities as input;
            # kl_output_1 is a plain linear output and the result is wrapped in
            # abs() — presumably an intentional approximation, confirm.
            loss_3 = torch.abs(criterion_3(kl_output_1, soft_output_2))
            loss = loss_1 + loss_2 + loss_3
            loss.backward()
            optimizer.step()
            train_loss_1 += loss_1.data[0]
            train_loss_2 += loss_2.data[0]
            train_loss_3 += loss_3.data[0]
        train_elapsed_time = time.time() - train_start_time
        # accuracy_1 is per-label accuracy over the 7 tool classes.
        train_accuracy_1 = train_corrects_1 / num_train_all / 7
        train_accuracy_2 = train_corrects_2 / num_train_all
        train_average_loss_1 = train_loss_1 / num_train_all / 7
        train_average_loss_2 = train_loss_2 / num_train_all
        train_average_loss_3 = train_loss_3 / num_train_all
        # begin eval
        model.eval()
        val_loss_1 = 0.0
        val_loss_2 = 0.0
        val_loss_3 = 0.0
        val_corrects_1 = 0
        val_corrects_2 = 0
        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
            # Phase is only evaluated on the last frame of each sequence.
            labels_2 = labels_2[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda(), volatile=True)
                labels_1 = Variable(labels_1.cuda(), volatile=True)
                labels_2 = Variable(labels_2.cuda(), volatile=True)
            else:
                inputs = Variable(inputs, volatile=True)
                labels_1 = Variable(labels_1, volatile=True)
                labels_2 = Variable(labels_2, volatile=True)
            if crop_type == 0 or crop_type == 1:
                outputs_1, outputs_2 = model.forward(inputs)
            elif crop_type == 5:
                # 5-crop TTA: run every crop and average the logits.
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(5, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(5, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)
            elif crop_type == 10:
                # 10-crop TTA.
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(10, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(10, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)
            sig_output_1 = sigmoid_cuda(outputs_1)
            soft_output_2 = softmax_cuda(outputs_2)
            sig_output_1 = Variable(sig_output_1.data, requires_grad=False)
            soft_output_2 = Variable(soft_output_2.data, requires_grad=False)
            kl_output_1 = (kl_fc_t2p(sig_output_1))
            # Keep only the last frame of each sequence for the phase head.
            outputs_2 = outputs_2[sequence_length - 1::sequence_length]
            _, preds_2 = torch.max(outputs_2.data, 1)
            preds_1 = torch.cuda.ByteTensor(sig_output_1.data > 0.5)
            preds_1 = preds_1.long()
            val_corrects_1 += torch.sum(preds_1 == labels_1.data)
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)
            val_corrects_2 += torch.sum(preds_2 == labels_2.data)
            loss_3 = torch.abs(criterion_3(kl_output_1, soft_output_2))
            val_loss_1 += loss_1.data[0]
            val_loss_2 += loss_2.data[0]
            val_loss_3 += loss_3.data[0]
        val_elapsed_time = time.time() - val_start_time
        val_accuracy_1 = val_corrects_1 / (num_val_all * 7)
        # Phase accuracy/loss normalize by sequence count (one pred/sequence).
        val_accuracy_2 = val_corrects_2 / num_val_we_use
        val_average_loss_1 = val_loss_1 / (num_val_all * 7)
        val_average_loss_2 = val_loss_2 / num_val_we_use
        val_average_loss_3 = val_loss_3 / num_val_all
        print('epoch: {:3d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train accu_1: {:.4f}'
              ' train accu_2: {:.4f}'
              ' train loss_1: {:4.4f}'
              ' train loss_2: {:4.4f}'
              ' train loss_3: {:4.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_accuracy_1, train_accuracy_2, train_average_loss_1,
                  train_average_loss_2, train_average_loss_3))
        print('epoch: {:3d}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid accu_1: {:.4f}'
              ' valid accu_2: {:.4f}'
              ' valid loss_1: {:4.4f}'
              ' valid loss_2: {:4.4f}'
              ' valid loss_3: {:4.4f}'.format(epoch, val_elapsed_time // 60,
                                              val_elapsed_time % 60,
                                              val_accuracy_1, val_accuracy_2,
                                              val_average_loss_1,
                                              val_average_loss_2,
                                              val_average_loss_3))
        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2)
        # Model selection: primary key is phase accuracy (accu_2), gated on
        # tool accuracy > 0.95; ties broken by tool accuracy, then train stats.
        if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95:
            best_val_accuracy_2 = val_accuracy_2
            best_val_accuracy_1 = val_accuracy_1
            correspond_train_acc_1 = train_accuracy_1
            correspond_train_acc_2 = train_accuracy_2
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95:
            # NOTE(review): this branch does not update best_val_accuracy_1
            # when val_accuracy_1 improves — confirm whether intentional.
            if val_accuracy_1 > best_val_accuracy_1:
                correspond_train_acc_1 = train_accuracy_1
                correspond_train_acc_2 = train_accuracy_2
                best_model_wts = copy.deepcopy(model.state_dict())
            elif val_accuracy_1 == best_val_accuracy_1:
                if train_accuracy_2 > correspond_train_acc_2:
                    correspond_train_acc_2 = train_accuracy_2
                    correspond_train_acc_1 = train_accuracy_1
                    best_model_wts = copy.deepcopy(model.state_dict())
                elif train_accuracy_2 == correspond_train_acc_2:
                    # NOTE(review): comparing a TRAIN accuracy against
                    # best_VAL_accuracy_1 looks like it was meant to be
                    # correspond_train_acc_1 — verify.
                    if train_accuracy_1 > best_val_accuracy_1:
                        correspond_train_acc_1 = train_accuracy_1
                        best_model_wts = copy.deepcopy(model.state_dict())
        # Per-epoch metric log: train acc x2, train loss x3, val acc x2, val loss x3.
        record_np[epoch, 0] = train_accuracy_1
        record_np[epoch, 1] = train_accuracy_2
        record_np[epoch, 2] = train_average_loss_1
        record_np[epoch, 3] = train_average_loss_2
        record_np[epoch, 4] = train_average_loss_3
        record_np[epoch, 5] = val_accuracy_1
        record_np[epoch, 6] = val_accuracy_2
        record_np[epoch, 7] = val_average_loss_1
        record_np[epoch, 8] = val_average_loss_2
        record_np[epoch, 9] = val_average_loss_3
    print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format(
        best_val_accuracy_1, correspond_train_acc_1))
    print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format(
        best_val_accuracy_2, correspond_train_acc_2))
    # Encode the final accuracies (x10000) into the output file names.
    save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000))
    save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000))
    save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000))
    save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000))
    public_name = "cnn_lstm_klt2p" \
        + "_epoch_" + str(epochs) \
        + "_length_" + str(sequence_length) \
        + "_opt_" + str(optimizer_choice) \
        + "_mulopt_" + str(multi_optim) \
        + "_flip_" + str(use_flip) \
        + "_crop_" + str(crop_type) \
        + "_batch_" + str(train_batch_size) \
        + "_train1_" + str(save_train_1) \
        + "_train2_" + str(save_train_2) \
        + "_val1_" + str(save_val_1) \
        + "_val2_" + str(save_val_2)
    model_name = public_name + ".pth"
    torch.save(best_model_wts, model_name)
    record_name = public_name + ".npy"
    np.save(record_name, record_np)
    # Also persist the learned tool->phase matrix (moves kl_fc_t2p to CPU).
    kl_fc_t2p_name = public_name + "t2p.npy"
    kl_fc_t2p_np = kl_fc_t2p.cpu().weight.data.numpy()
    np.save(kl_fc_t2p_name, kl_fc_t2p_np)
def test_model(test_dataset, test_num_each):
    """Evaluate a trained pure-ResNet phase classifier on the test set.

    Args:
        test_dataset: dataset yielding (inputs, labels_1, labels_2); only
            labels_2 (phase class index) is used here.
        test_num_each: per-video frame counts (unused in this function).

    Side effects: prints loss/accuracy and pickles the per-frame predictions
    to '<model_pure_name>_test_<acc>_crop_<crop_type>.pkl'.  Relies on the
    module-level globals test_batch_size, workers, model_name, use_gpu,
    crop_type and model_pure_name.
    """
    num_test = len(test_dataset)
    # Sequential sampler over every frame, in dataset order.
    test_idx = [i for i in range(num_test)]
    print('num of test dataset: {:6d}'.format(num_test))
    test_loader = DataLoader(
        test_dataset,
        batch_size=test_batch_size,
        sampler=test_idx,
        num_workers=workers,
        pin_memory=False
    )
    model = pure_resnet()
    model = DataParallel(model)
    model.load_state_dict(torch.load(model_name))
    if use_gpu:
        model = model.cuda()
    # Summed loss; averaged manually below.
    criterion = nn.CrossEntropyLoss(size_average=False)
    model.eval()
    test_loss = 0.0
    test_corrects = 0
    all_preds = []
    test_start_time = time.time()
    for data in test_loader:
        inputs, labels_1, labels_2 = data
        if use_gpu:
            inputs = Variable(inputs.cuda(), volatile=True)
            labels = Variable(labels_2.cuda(), volatile=True)
        else:
            inputs = Variable(inputs, volatile=True)
            labels = Variable(labels_2, volatile=True)
        if crop_type == 0 or crop_type == 1:
            outputs = model.forward(inputs)
        elif crop_type == 5:
            # 5-crop TTA: fold the crop dimension into the batch, average logits.
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs = model.forward(inputs)
            outputs = outputs.view(5, -1, 7)
            outputs = torch.mean(outputs, 0)
        elif crop_type == 10:
            # 10-crop TTA.
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs = model.forward(inputs)
            outputs = outputs.view(10, -1, 7)
            outputs = torch.mean(outputs, 0)
        _, preds = torch.max(outputs.data, 1)
        # Collect per-frame predictions for the pickle dump below.
        for i in range(len(preds)):
            all_preds.append(preds[i])
        loss = criterion(outputs, labels)
        test_loss += loss.data[0]
        test_corrects += torch.sum(preds == labels.data)
        # print(test_corrects)
    test_elapsed_time = time.time() - test_start_time
    test_accuracy = test_corrects / num_test
    test_average_loss = test_loss / num_test
    # Encode accuracy (x10000) into the prediction file name.
    save_test = int("{:4.0f}".format(test_accuracy * 10000))
    pred_name = model_pure_name + '_test_' + str(save_test)+'_crop_' + str(crop_type) + '.pkl'
    with open(pred_name, 'wb') as f:
        pickle.dump(all_preds, f)
    print('test elapsed: {:2.0f}m{:2.0f}s'
          ' test loss: {:4.4f}'
          ' test accu: {:.4f}'
          .format(test_elapsed_time // 60,
                  test_elapsed_time % 60,
                  test_average_loss,
                  test_accuracy))
def main():
    """Entry point: train a cell-level protein-localization classifier.

    Builds output/log directories for the chosen fold, constructs the network
    from CLI args (optionally warm-started or resumed from a checkpoint),
    assembles cell-level datasets (with cherry-picked mitotic-spindle label
    overrides and optional minority upsampling), then runs the epoch loop of
    train/validate/save-checkpoint.  Tracks the best epoch by validation
    focal loss.
    """
    args = parser.parse_args()
    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')
    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)
    # set cuda visible device
    if not args.all_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False
    # set random seeds
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)
    # Network construction parameters collected from CLI args.
    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = args.num_classes
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder
    model = init_network(model_params)
    # Optional warm start from image-level pretrained weights.
    if args.load_state_dict_path is not None:
        if args.load_state_dict_path == 'use-img-level-densenet-ckpt':
            # Magic value selecting a known image-level DenseNet checkpoint.
            model_dir = '../output/models/densenet121_1024_all_data__obvious_neg__gradaccum_20__start_lr_3e6'
            pretrained_ckpt_path = os.path.join(f'{model_dir}',
                                                f'fold{args.fold}',
                                                'final.pth')
        else:
            pretrained_ckpt_path = args.load_state_dict_path
        init_pretrained = torch.load(pretrained_ckpt_path)
        model.load_state_dict(init_pretrained['state_dict'])
    if args.all_gpus:
        model = DataParallel(model)
    model.cuda()
    # define loss function (criterion)
    # NOTE(review): eval() on a CLI string — fine for trusted local runs, but
    # executes arbitrary code if args.loss is attacker-controlled.
    try:
        criterion = eval(args.loss)().cuda()
    except:
        raise (RuntimeError("Loss {} not available!".format(args.loss)))
    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_focal = float('inf')
    # define scheduler (same eval()-by-name pattern as the loss above)
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except:
        raise (RuntimeError("Scheduler {} not available!".format(
            args.scheduler)))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]
    # optionally resume from a checkpoint
    if args.resume:
        args.resume = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(args.resume):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            # Checkpoint key is 'best_map' for historical reasons; it holds
            # the best focal value tracked below.
            best_focal = checkpoint['best_map']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer_fpath = args.resume.replace('.pth', '_optim.pth')
            if os.path.exists(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                    optimizer_fpath))
                optimizer.load_state_dict(
                    torch.load(optimizer_fpath)['optimizer'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                args.resume, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(args.resume))
    # Data loading code
    train_transform = train_multi_augment2
    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]
    # Map image base path -> one-hot label vector, for train + public HPA data.
    train_df = get_train_df_ohe(clean_from_duplicates=True)
    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'], train_df.iloc[:,
                                                                     2:].values)
    }
    public_hpa_df_17 = get_public_df_ohe(clean_from_duplicates=True)
    public_basepath_2_ohe_vector = {
        img_path: vec
        for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                 public_hpa_df_17.iloc[:, 2:].values)
    }
    basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)
    # Drop fold paths whose images are not present in either dataframe.
    available_paths = set(
        np.concatenate((train_df['img_base_path'].values,
                        public_hpa_df_17['img_base_path'].values)))
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]
    labels_df = pd.read_hdf(args.cell_level_labels_path)
    # modifying minor class labels
    cherrypicked_mitotic_spindle = pd.read_csv(
        '../input/mitotic_cells_selection.csv')
    cherrypicked_mitotic_spindle_img_cell = set(
        cherrypicked_mitotic_spindle[['ID',
                                      'cell_i']].apply(tuple, axis=1).values)
    # CSV cell indices are 1-based; labels_df uses 0-based cell indices.
    cherrypicked_mitotic_spindle_img_cell = {
        (img, cell_i - 1)
        for img, cell_i in cherrypicked_mitotic_spindle_img_cell
    }
    class_names = get_class_names()
    mitotic_spindle_class_i = class_names.index('Mitotic spindle')
    if args.include_nn_mitotic:
        # Extra mitotic positives selected by a neural net (already 0-based).
        cherrypicked_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_pos_nn_added.csv')
        cherrypicked_mitotic_spindle_img_cell.update(
            set(cherrypicked_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values))
    print('len cherrypicked_mitotic_spindle_img_cell',
          len(cherrypicked_mitotic_spindle_img_cell))
    mitotic_bool_idx = labels_df.index.isin(
        cherrypicked_mitotic_spindle_img_cell)

    def modify_label(labels, idx, val):
        # Set one class entry in a label vector in place and return it
        # (used via .map on the 'image_level_pred' column).
        labels[idx] = val
        return labels

    # Force mitotic-spindle = 1 on cherry-picked cells.
    labels_df.loc[mitotic_bool_idx, 'image_level_pred'] = labels_df.loc[
        mitotic_bool_idx, 'image_level_pred'].map(
            lambda x: modify_label(x, mitotic_spindle_class_i, 1))
    if args.include_nn_mitotic:
        # And force mitotic-spindle = 0 on NN-selected negatives.
        cherrypicked_not_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_neg_nn_added.csv')
        cherrypicked_not_mitotic_spindle_based_on_nn = set(
            cherrypicked_not_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values)
        not_mitotic_bool_idx = labels_df.index.isin(
            cherrypicked_not_mitotic_spindle_based_on_nn)
        labels_df.loc[not_mitotic_bool_idx, 'image_level_pred'] = labels_df.loc[
            not_mitotic_bool_idx,
            'image_level_pred'].map(lambda x: modify_label(
                x, mitotic_spindle_class_i, 0))
    if args.ignore_negative:
        raise NotImplementedError
    if args.upsample_minorities:
        # Upsample mitotic-spindle cells plus confident Aggresome cells.
        cells_to_upsample = list(cherrypicked_mitotic_spindle_img_cell)
        aggresome_class_i = class_names.index('Aggresome')
        confident_aggresome_indices = list(
            labels_df.index[labels_df['image_level_pred'].map(
                lambda x: x[aggresome_class_i] > 0.9)])
        print('confident_aggresome_indices len',
              len(confident_aggresome_indices))
        print('confident_aggresome_indices[:5]',
              confident_aggresome_indices[:5])
        cells_to_upsample += confident_aggresome_indices
    else:
        cells_to_upsample = None
    train_dataset = ProteinDatasetCellSeparateLoading(
        trn_img_paths,
        labels_df=labels_df,
        cells_to_upsample=cells_to_upsample,
        img_size=args.img_size,
        in_channels=args.in_channels,
        transform=train_transform,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )
    # valid_dataset = ProteinDatasetCellLevel(val_img_paths,
    #                                         labels_df=labels_df,
    #                                         img_size=args.img_size,
    #                                         batch_size=64,
    #                                         is_trainset=True,
    #                                         in_channels=args.in_channels)
    valid_dataset = ProteinDatasetCellSeparateLoading(
        val_img_paths,
        labels_df=labels_df,
        img_size=args.img_size,
        in_channels=args.in_channels,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)
    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch iter rate | train_loss/acc | valid_loss/acc/map/focal |best_epoch/best_focal| min \n'
    )
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n'
    )
    start_epoch += 1
    if args.eval_at_start:
        # Baseline validation pass before any training (logged as epoch -1).
        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, -1, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, val_map_score,
             val_focal, best_epoch, best_focal, -1))
    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()
        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)
        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]
        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)
        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, epoch, log)
        # remember best loss and save checkpoint
        # Best model is selected by LOWEST validation focal loss.
        is_best = val_focal < best_focal
        best_loss = min(valid_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_focal = val_focal if is_best else best_focal
        print('\r', end='', flush=True)
        log.write('%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' % \
            (epoch, iter + 1, lr, train_loss, train_acc, valid_loss,
             valid_acc, val_map_score, val_focal, best_epoch, best_focal,
             (time.time() - end) / 60))
        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_focal)
shuffle=True) test_loader = torch.utils.data.DataLoader(data_loader(test_data_txt), batch_size=args.test_batch_size) make_if_not_exist(trained_model_dir) if args.dataset == 'PaviaU': num_cla = 9 elif args.dataset == 'Indian': num_cla = 16 else: num_cla = 13 model = DataParallel(dict[args.model_name](num_classes=num_cla, dropout_keep_prob=0)) if args.use_cuda: model.cuda() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=1e-5) start_epoch = 0 if args.restore and len(os.listdir(trained_model_dir)): model, start_epoch = model_restore(model, trained_model_dir) train_info_record = trained_model_dir + 'train_info_' + args.model_name + '.txt' for epoch in range(start_epoch + 1, args.epochs + 1): start = time.time() train(epoch, model, train_loader, optimizer, args)
torch.set_num_threads(8) cudnn.benchmark = True # cudnn.deterministic = False cudnn.enabled = True coco_val = Coco() val_loader = DataLoader(coco_val, batch_size=test_batch_size, shuffle=False, num_workers=8, pin_memory=False) pose_net = bninception(out_chn=2) model = DataParallel(pose_net) model.cuda() checkpoint = torch.load('models/m_129.pth') pretrained_dict = checkpoint['state_dict'] model.load_state_dict(pretrained_dict) model.eval() # total_loss = 0 start_time = time.time() det_loss = 0 scale_loss = 0 rec = [] t = 0 for _, (img, imgf, meta) in enumerate(tqdm(val_loader)): with torch.no_grad(): inputs = img.cuda(non_blocking=True) output = model(inputs)
class AdvTrainer(BaseTrainer):
    """Adversarial domain-QA trainer: alternates QA-model and discriminator
    updates each batch, with optional distributed (DDP) or DataParallel setup.
    """

    def __init__(self, args):
        super(AdvTrainer, self).__init__(args)

    def make_model_env(self, gpu, ngpus_per_node):
        """Build the DomainQA model and its two optimizers on the target
        device(s).

        Args:
            gpu: local process index (indexes into args.devices when
                distributed) or ignored in single-process mode.
            ngpus_per_node: number of GPUs on this node, used to derive the
                global rank and to split batch size / workers per process.
        """
        if self.args.distributed:
            self.args.gpu = self.args.devices[gpu]
        else:
            self.args.gpu = 0
        if self.args.use_cuda and self.args.distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            self.args.rank = self.args.rank * ngpus_per_node + gpu
            dist.init_process_group(backend=self.args.dist_backend,
                                    init_method=self.args.dist_url,
                                    world_size=self.args.world_size,
                                    rank=self.args.rank)
        self.model = DomainQA(self.args.bert_model, self.args.num_classes,
                              self.args.hidden_size, self.args.num_layers,
                              self.args.dropout, self.args.dis_lambda,
                              self.args.concat, self.args.anneal)
        if self.args.load_model is not None:
            print("Loading model from ", self.args.load_model)
            # map_location keeps checkpoint tensors on CPU until .cuda() below.
            self.model.load_state_dict(
                torch.load(self.args.load_model,
                           map_location=lambda storage, loc: storage))
        if self.args.freeze_bert:
            for param in self.model.bert.parameters():
                param.requires_grad = False
        # Total optimizer steps = ceil(longest dataset / batch) * epochs * #datasets.
        max_len = max([len(f) for f in self.features_lst])
        num_train_optimization_steps = math.ceil(
            max_len / self.args.batch_size) * self.args.epochs * len(
                self.features_lst)
        # Two optimizers: one for BERT + QA head, one for the discriminator.
        qa_params = list(self.model.bert.named_parameters()) + list(
            self.model.qa_outputs.named_parameters())
        dis_params = list(self.model.discriminator.named_parameters())
        self.qa_optimizer = get_opt(qa_params, num_train_optimization_steps,
                                    self.args)
        self.dis_optimizer = get_opt(dis_params, num_train_optimization_steps,
                                     self.args)
        if self.args.use_cuda:
            if self.args.distributed:
                torch.cuda.set_device(self.args.gpu)
                self.model.cuda(self.args.gpu)
                # Split the global batch size and workers across processes.
                self.args.batch_size = int(self.args.batch_size /
                                           ngpus_per_node)
                self.args.workers = int(
                    (self.args.workers + ngpus_per_node - 1) / ngpus_per_node)
                self.model = DistributedDataParallel(
                    self.model,
                    device_ids=[self.args.gpu],
                    find_unused_parameters=True)
            else:
                self.model.cuda()
                self.model = DataParallel(self.model,
                                          device_ids=self.args.devices)
        cudnn.benchmark = True

    def train(self):
        """Run the adversarial training loop.

        Per batch: one QA-loss backward + qa_optimizer step, then one
        discriminator-loss backward + dis_optimizer step.  Periodically
        evaluates mid-epoch, and saves/evaluates at each epoch end (rank 0
        only when distributed).
        """
        step = 1
        avg_qa_loss = 0
        avg_dis_loss = 0
        iter_lst = [self.get_iter(self.features_lst, self.args)]
        num_batches = sum([len(iterator[0]) for iterator in iter_lst])
        for epoch in range(self.args.start_epoch,
                           self.args.start_epoch + self.args.epochs):
            start = time.time()
            self.model.train()
            batch_step = 1
            for data_loader, sampler in iter_lst:
                if self.args.distributed:
                    # Reshuffle the distributed sampler deterministically per epoch.
                    sampler.set_epoch(epoch)
                for i, batch in enumerate(data_loader, start=1):
                    input_ids, input_mask, seg_ids, start_positions, end_positions, labels = batch
                    # remove unnecessary pad token
                    # (trim every sequence to the longest non-pad length in
                    # the batch; sign() marks non-zero token ids)
                    seq_len = torch.sum(torch.sign(input_ids), 1)
                    max_len = torch.max(seq_len)
                    input_ids = input_ids[:, :max_len].clone()
                    input_mask = input_mask[:, :max_len].clone()
                    seg_ids = seg_ids[:, :max_len].clone()
                    start_positions = start_positions.clone()
                    end_positions = end_positions.clone()
                    if self.args.use_cuda:
                        input_ids = input_ids.cuda(self.args.gpu,
                                                   non_blocking=True)
                        input_mask = input_mask.cuda(self.args.gpu,
                                                     non_blocking=True)
                        seg_ids = seg_ids.cuda(self.args.gpu,
                                               non_blocking=True)
                        start_positions = start_positions.cuda(
                            self.args.gpu, non_blocking=True)
                        end_positions = end_positions.cuda(self.args.gpu,
                                                           non_blocking=True)
                    # dtype="qa" selects the QA loss branch inside DomainQA.
                    qa_loss = self.model(input_ids,
                                         seg_ids,
                                         input_mask,
                                         start_positions,
                                         end_positions,
                                         labels,
                                         dtype="qa",
                                         global_step=step)
                    qa_loss = qa_loss.mean()
                    qa_loss.backward()
                    # update qa model
                    avg_qa_loss = self.cal_running_avg_loss(
                        qa_loss.item(), avg_qa_loss)
                    self.qa_optimizer.step()
                    self.qa_optimizer.zero_grad()
                    # update discriminator
                    dis_loss = self.model(input_ids,
                                          seg_ids,
                                          input_mask,
                                          start_positions,
                                          end_positions,
                                          labels,
                                          dtype="dis",
                                          global_step=step)
                    dis_loss = dis_loss.mean()
                    dis_loss.backward()
                    avg_dis_loss = self.cal_running_avg_loss(
                        dis_loss.item(), avg_dis_loss)
                    self.dis_optimizer.step()
                    self.dis_optimizer.zero_grad()
                    step += 1
                    # Mid-epoch evaluation every 2000 batches (skipped on epoch 0).
                    if epoch != 0 and i % 2000 == 0:
                        result_dict = self.evaluate_model(i)
                        for dev_file, f1 in result_dict.items():
                            print("GPU/CPU {} evaluated {}: {:.2f}".format(
                                self.args.gpu, dev_file, f1),
                                  end="\n")
                    batch_step += 1
                    msg = "{}/{} {} - ETA : {} - QA loss: {:.4f}, DIS loss: {:.4f}" \
                        .format(batch_step, num_batches,
                                progress_bar(batch_step, num_batches),
                                eta(start, batch_step, num_batches),
                                avg_qa_loss, avg_dis_loss)
                    print(msg, end="\r")
            print(
                "[GPU Num: {}, Epoch: {}, Final QA loss: {:.4f}, Final DIS loss: {:.4f}]"
                .format(self.args.gpu, epoch, avg_qa_loss, avg_dis_loss))
            # save model
            # NOTE(review): block structure reconstructed from collapsed
            # source — evaluation is assumed to be nested under the rank-0
            # guard along with save_model; confirm against the original.
            if not self.args.distributed or self.args.rank == 0:
                self.save_model(epoch, avg_qa_loss)
                if self.args.do_valid:
                    result_dict = self.evaluate_model(epoch)
                    for dev_file, f1 in result_dict.items():
                        print("GPU/CPU {} evaluated {}: {:.2f}".format(
                            self.args.gpu, dev_file, f1),
                              end="\n")
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    """Fine-tune multi_lstm_p2t from a pre-trained multi_lstm checkpoint.

    Builds sequence-window samplers over both datasets, transfers the
    shared CNN / LSTM / fc weights from a saved multi_lstm, freezes the
    separately-loaded fc_p2t branch, and trains with three losses:
    tool BCE (criterion_1), phase cross-entropy (criterion_2) and a KL
    consistency term against the frozen branch (criterion_3, weighted by
    ``alpha``).  Saves the best weights (.pth) and a per-epoch 12-column
    metric record (.npy).

    Relies on module-level config: sequence_length, num_gpu, workers,
    train_batch_size, val_batch_size, use_gpu, optimizer_choice,
    multi_optim, learning_rate, momentum, dampening, weight_decay,
    use_nesterov, sgd_adjust_lr, sgd_step, sgd_gamma, epochs, alpha,
    use_flip, crop_type.
    """
    num_train = len(train_dataset)
    num_val = len(val_dataset)
    # Start indices from which a full window of sequence_length frames fits.
    train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)
    # Truncate to a multiple of num_gpu so DataParallel gets equal shards.
    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 800
    # num_val_we_use = 80
    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]
    # Expand each start index into a contiguous window of frame indices.
    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)
    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)
    num_train_all = len(train_idx)
    num_val_all = len(val_idx)
    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))
    train_loader = DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        sampler=train_idx,
        num_workers=workers,
        pin_memory=False
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=val_batch_size,
        sampler=val_idx,
        num_workers=workers,
        pin_memory=False
    )
    model_old = multi_lstm()
    model_old = DataParallel(model_old)
    model_old.load_state_dict(torch.load(
        "cnn_lstm_1_epoch_25_length_10_opt_1_mulopt_1_flip_0_crop_1_batch_300_train1_9991_train2_9958_val1_9725_val2_8864.pth"))
    # Transfer the pre-trained trunk into the new p2t model.
    model = multi_lstm_p2t()
    model.share = model_old.module.share
    model.lstm = model_old.module.lstm
    model.fc = model_old.module.fc
    model.fc2 = model_old.module.fc2
    model.fc3 = model_old.module.fc3
    model = DataParallel(model)
    # fc_p2t has its own checkpoint and stays frozen during this run.
    for param in model.module.fc_p2t.parameters():
        param.requires_grad = False
    model.module.fc_p2t.load_state_dict(torch.load(
        "fc_epoch_25_length_4_opt_1_mulopt_1_flip_0_crop_1_batch_800_train1_9951_train2_9713_val1_9686_val2_7867_p2t.pth"))
    if use_gpu:
        model = model.cuda()
        model.module.fc_p2t = model.module.fc_p2t.cuda()
    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)  # multi-label tool head
    criterion_2 = nn.CrossEntropyLoss(size_average=False)   # phase head
    criterion_3 = nn.KLDivLoss(size_average=False)          # consistency vs fc_p2t
    sigmoid_cuda = nn.Sigmoid().cuda()
    if multi_optim == 0:
        # One learning rate for every parameter group.
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters()},
                {'params': model.module.fc.parameters()},
                {'params': model.module.fc2.parameters()},
                {'params': model.module.fc3.parameters()}],
                lr=learning_rate,
                momentum=momentum,
                dampening=dampening,
                weight_decay=weight_decay,
                nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(
                    optimizer, step_size=sgd_step, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters()},
                {'params': model.module.fc.parameters()},
                {'params': model.module.fc2.parameters()},
                {'params': model.module.fc3.parameters()}],
                lr=learning_rate)
    elif multi_optim == 1:
        # Pre-trained CNN trunk gets a 10x smaller learning rate.
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
                {'params': model.module.fc2.parameters(), 'lr': learning_rate},
                {'params': model.module.fc3.parameters(), 'lr': learning_rate},
            ],
                lr=learning_rate / 10,
                momentum=momentum,
                dampening=dampening,
                weight_decay=weight_decay,
                nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(
                    optimizer, step_size=sgd_step, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
                {'params': model.module.fc2.parameters(), 'lr': learning_rate},
                {'params': model.module.fc3.parameters(), 'lr': learning_rate},
            ], lr=learning_rate / 10)
    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy_1 = 0.0
    best_val_accuracy_2 = 0.0
    correspond_train_acc_1 = 0.0
    correspond_train_acc_2 = 0.0
    # Per-epoch record: 3 train accuracies, 3 train losses, 3 val accuracies,
    # 3 val losses -> 12 columns.
    record_np = np.zeros([epochs, 12])
    for epoch in range(epochs):
        # np.random.seed(epoch)
        # Re-shuffle windows each epoch (sampler is a fixed index list).
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)
        train_loader = DataLoader(
            train_dataset,
            batch_size=train_batch_size,
            sampler=train_idx,
            num_workers=workers,
            pin_memory=False
        )
        model.train()
        train_loss_1 = 0.0
        train_loss_2 = 0.0
        train_loss_3 = 0.0
        train_corrects_1 = 0
        train_corrects_2 = 0
        train_corrects_3 = 0
        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels_1 = Variable(labels_1.cuda())
                labels_2 = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels_1 = Variable(labels_1)
                labels_2 = Variable(labels_2)
            optimizer.zero_grad()
            outputs_1, outputs_2, outputs_3 = model.forward(inputs)
            _, preds_2 = torch.max(outputs_2.data, 1)
            train_corrects_2 += torch.sum(preds_2 == labels_2.data)
            sig_output_1 = sigmoid_cuda(outputs_1)
            sig_output_3 = sigmoid_cuda(outputs_3)
            # Averaged prediction of the trainable and frozen tool heads.
            sig_average = (sig_output_1.data + sig_output_3.data) / 2
            preds_1 = torch.cuda.ByteTensor(sig_output_1.data > 0.5)
            preds_1 = preds_1.long()
            train_corrects_1 += torch.sum(preds_1 == labels_1.data)
            preds_3 = torch.cuda.ByteTensor(sig_average > 0.5)
            preds_3 = preds_3.long()
            train_corrects_3 += torch.sum(preds_3 == labels_1.data)
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)
            # The frozen branch output is a fixed target for the KL term.
            sig_output_3 = Variable(sig_output_3.data, requires_grad=False)
            loss_3 = torch.abs(criterion_3(sig_output_1, sig_output_3))
            loss = loss_1 + loss_2 + loss_3 * alpha
            loss.backward()
            optimizer.step()
            train_loss_1 += loss_1.data[0]
            train_loss_2 += loss_2.data[0]
            train_loss_3 += loss_3.data[0]
        train_elapsed_time = time.time() - train_start_time
        # 7 = per-frame label width of the multi-label head (TODO confirm).
        train_accuracy_1 = train_corrects_1 / num_train_all / 7
        train_accuracy_2 = train_corrects_2 / num_train_all
        train_accuracy_3 = train_corrects_3 / num_train_all / 7
        train_average_loss_1 = train_loss_1 / num_train_all / 7
        train_average_loss_2 = train_loss_2 / num_train_all
        train_average_loss_3 = train_loss_3 / num_train_all
        # begin eval
        model.eval()
        val_loss_1 = 0.0
        val_loss_2 = 0.0
        val_loss_3 = 0.0
        val_corrects_1 = 0
        val_corrects_2 = 0
        val_corrects_3 = 0
        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
            # Keep only the last frame of each window for the phase head.
            labels_2 = labels_2[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda(), volatile=True)
                labels_1 = Variable(labels_1.cuda(), volatile=True)
                labels_2 = Variable(labels_2.cuda(), volatile=True)
            else:
                inputs = Variable(inputs, volatile=True)
                labels_1 = Variable(labels_1, volatile=True)
                labels_2 = Variable(labels_2, volatile=True)
            outputs_1, outputs_2, outputs_3 = model.forward(inputs)
            outputs_2 = outputs_2[(sequence_length - 1)::sequence_length]
            _, preds_2 = torch.max(outputs_2.data, 1)
            val_corrects_2 += torch.sum(preds_2 == labels_2.data)
            sig_output_1 = sigmoid_cuda(outputs_1)
            sig_output_3 = sigmoid_cuda(outputs_3)
            sig_average = (sig_output_1.data + sig_output_3.data) / 2
            preds_1 = torch.cuda.ByteTensor(sig_output_1.data > 0.5)
            preds_1 = preds_1.long()
            val_corrects_1 += torch.sum(preds_1 == labels_1.data)
            preds_3 = torch.cuda.ByteTensor(sig_average > 0.5)
            preds_3 = preds_3.long()
            val_corrects_3 += torch.sum(preds_3 == labels_1.data)
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)
            sig_output_3 = Variable(sig_output_3.data, requires_grad=False)
            loss_3 = torch.abs(criterion_3(sig_output_1, sig_output_3))
            val_loss_1 += loss_1.data[0]
            val_loss_2 += loss_2.data[0]
            val_loss_3 += loss_3.data[0]
        val_elapsed_time = time.time() - val_start_time
        val_accuracy_1 = val_corrects_1 / (num_val_all * 7)
        val_accuracy_2 = val_corrects_2 / num_val_we_use
        val_accuracy_3 = val_corrects_3 / (num_val_all * 7)
        val_average_loss_1 = val_loss_1 / (num_val_all * 7)
        val_average_loss_2 = val_loss_2 / num_val_we_use
        val_average_loss_3 = val_loss_3 / num_val_all
        print('epoch: {:3d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train accu_1: {:.4f}'
              ' train accu_3: {:.4f}'
              ' train accu_2: {:.4f}'
              ' train loss_1: {:4.4f}'
              ' train loss_2: {:4.4f}'
              ' train loss_3: {:4.4f}'
              .format(epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                      train_accuracy_1, train_accuracy_3, train_accuracy_2,
                      train_average_loss_1, train_average_loss_2,
                      train_average_loss_3))
        print('epoch: {:3d}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid accu_1: {:.4f}'
              ' valid accu_3: {:.4f}'
              ' valid accu_2: {:.4f}'
              ' valid loss_1: {:4.4f}'
              ' valid loss_2: {:4.4f}'
              ' valid loss_3: {:4.4f}'
              .format(epoch, val_elapsed_time // 60, val_elapsed_time % 60,
                      val_accuracy_1, val_accuracy_3, val_accuracy_2,
                      val_average_loss_1, val_average_loss_2,
                      val_average_loss_3))
        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                # Plateau scheduler keyed on the combined validation loss.
                exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2
                                      + alpha * val_average_loss_3)
        # Best-model selection: primary key val_accuracy_2, then
        # val_accuracy_1, then the corresponding train accuracies as
        # tie-breakers; a 0.95 floor on val_accuracy_1 gates everything.
        if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95:
            best_val_accuracy_2 = val_accuracy_2
            best_val_accuracy_1 = val_accuracy_1
            correspond_train_acc_1 = train_accuracy_1
            correspond_train_acc_2 = train_accuracy_2
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95:
            if val_accuracy_1 > best_val_accuracy_1:
                # FIX: keep the tracked best in sync (was never updated here,
                # so later comparisons used a stale value).
                best_val_accuracy_1 = val_accuracy_1
                correspond_train_acc_1 = train_accuracy_1
                correspond_train_acc_2 = train_accuracy_2
                best_model_wts = copy.deepcopy(model.state_dict())
            elif val_accuracy_1 == best_val_accuracy_1:
                if train_accuracy_2 > correspond_train_acc_2:
                    correspond_train_acc_2 = train_accuracy_2
                    correspond_train_acc_1 = train_accuracy_1
                    best_model_wts = copy.deepcopy(model.state_dict())
                elif train_accuracy_2 == correspond_train_acc_2:
                    # FIX: compare the train accuracy against the stored train
                    # accuracy, not against the validation best.
                    if train_accuracy_1 > correspond_train_acc_1:
                        correspond_train_acc_1 = train_accuracy_1
                        best_model_wts = copy.deepcopy(model.state_dict())
        record_np[epoch, 0] = train_accuracy_1
        record_np[epoch, 1] = train_accuracy_3
        record_np[epoch, 2] = train_accuracy_2
        record_np[epoch, 3] = train_average_loss_1
        record_np[epoch, 4] = train_average_loss_2
        record_np[epoch, 5] = train_average_loss_3
        record_np[epoch, 6] = val_accuracy_1
        record_np[epoch, 7] = val_accuracy_3
        # FIX: column 7 was written twice (clobbering val_accuracy_3) and
        # column 8 was always zero; val_accuracy_2 belongs in column 8.
        record_np[epoch, 8] = val_accuracy_2
        record_np[epoch, 9] = val_average_loss_1
        record_np[epoch, 10] = val_average_loss_2
        record_np[epoch, 11] = val_average_loss_3
    print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format(
        best_val_accuracy_1, correspond_train_acc_1))
    print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format(
        best_val_accuracy_2, correspond_train_acc_2))
    save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000))
    save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000))
    save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000))
    save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000))
    public_name = "cnn_lstm1_p2t" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train1_" + str(save_train_1) \
                  + "_train2_" + str(save_train_2) \
                  + "_val1_" + str(save_val_1) \
                  + "_val2_" + str(save_val_2)
    model_name = public_name + ".pth"
    torch.save(best_model_wts, model_name)
    record_name = public_name + ".npy"
    np.save(record_name, record_np)
def main():
    """Train the BN-Inception pose network on COCO keypoints.

    Seeds RNGs with a fresh random seed, builds train/val loaders from the
    COCO person-keypoints annotations, trains with Adam (lr dropped to 1e-4
    at epoch 100), appends per-epoch losses to text files under ``losses/``,
    and saves a full checkpoint per epoch under ``models/``.
    Requires CUDA (model is moved to GPU unconditionally).
    """
    args = parse_args()
    # __________________ Params ___________________
    sample_size = args.sample_size
    train_batch_size = args.train_batch_size
    test_batch_size = 128
    num_epochs = args.num_epochs
    num_workers = args.num_worker
    lr = args.learning_rate
    start_epoch = 0
    # _____________________________________________
    # Random (not fixed) seed: runs are not reproducible by design here.
    manual_seed = random.randint(1, 10000)
    random.seed(manual_seed)
    torch.manual_seed(manual_seed)
    torch.cuda.manual_seed_all(manual_seed)
    torch.set_num_threads(num_workers + 1)
    cudnn.benchmark = True
    # cudnn.deterministic = False
    cudnn.enabled = True
    # ImageNet normalization constants.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    # NOTE(review): the validation set is also built with mode 'train' —
    # confirm this is intentional.
    coco_train = Coco(
        'datasets/data/coco_data/person_keypoints_train2017.json', 'train',
        sample_size, transforms.Compose([normalize]))
    coco_val = Coco('datasets/data/coco_data/person_keypoints_val2017.json',
                    'train', sample_size, transforms.Compose([normalize]))
    train_loader = DataLoader(coco_train,
                              batch_size=train_batch_size,
                              shuffle=True,
                              num_workers=num_workers,
                              pin_memory=False)
    val_loader = DataLoader(coco_val,
                            batch_size=test_batch_size,
                            shuffle=False,
                            num_workers=num_workers,
                            pin_memory=False)
    pose_net = bninception(out_chn=2)
    model = DataParallel(pose_net)
    model.cuda()
    #checkpoint = torch.load('models/m_100.pth')
    #pretrained_dict = checkpoint['state_dict']
    #model.load_state_dict(pretrained_dict)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    #os.makedirs(args.log, exist_ok=True)
    for epoch in range(start_epoch, num_epochs):
        # Hard-coded LR drop at epoch 100.
        if epoch == 100:
            for param_group in optimizer.param_groups:
                param_group['lr'] = 1e-4
        dloss, scale_loss = train(train_loader, model, optimizer)
        tloss = 'det_loss ' + str(dloss) + ' scale loss ' + str(scale_loss)
        dloss, scale_loss = train_test(val_loader, model)
        test_loss = 'det_loss ' + str(dloss) + ' scale loss ' + str(scale_loss)
        # Append epoch losses to plain-text logs (dirs must already exist).
        with open('losses/train_loss_384.txt', 'a') as the_file:
            the_file.write(str(tloss) + '\n')
        with open('losses/test_loss_384.txt', 'a') as the_file:
            the_file.write(str(test_loss) + '\n')
        # Full checkpoint: epoch counter, optimizer and model state.
        ckpt = {
            'epoch': epoch,
            'optimizer': optimizer.state_dict(),
            'state_dict': model.state_dict()
        }
        #os.makedirs(args.model_save_path, exist_ok=True)
        #ckpt_name = os.path.join(args.model_save_path, 'epoch_%d.ckpt' % epoch)
        torch.save(ckpt, 'models/m_384_' + str(epoch) + '.pth')
def run(args):
    """Train an NCRF grid-patch model from a JSON config.

    Loads the config and a starting checkpoint, trains with SGD using a
    hard-coded LR schedule (new LR at epochs 5/10/15 from cfg), saves a
    checkpoint per epoch plus a ``best.pkl`` whenever validation loss
    improves, and logs metrics to TensorBoard and ``valid.log``.
    """
    with open(args.cfg_path) as f:
        cfg = json.load(f)
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)
    batch_size_train = cfg['batch_size']
    # Larger validation batches: no gradients to store.
    batch_size_valid = cfg['batch_size'] * 3
    num_workers = args.num_workers
    grid_size = cfg['grid_size']
    # File logger for validation results.
    logger = logging.getLogger("valid")
    logger.setLevel(logging.DEBUG)
    fileHanlder = logging.FileHandler('valid.log')
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fileHanlder.setFormatter(formatter)
    logger.addHandler(fileHanlder)
    model = MODELS[cfg['model']](num_nodes=grid_size, use_crf=cfg['use_crf'])
    model = DataParallel(model, device_ids=None)  # None -> all visible GPUs
    # Resume model weights from the given checkpoint (optimizer starts fresh).
    checkpoint = torch.load(args.load_path)
    model.load_state_dict(checkpoint['state_dict'])
    model = model.cuda()
    loss_fn = BCEWithLogitsLoss().cuda()
    optimizer = SGD(model.parameters(), lr=cfg['lr'], momentum=cfg['momentum'])
    dataset_train = GridPatchTrainDataset(csv_file=cfg['data_path_train'],
                                          root_dir=cfg['root_dir'])
    dataset_valid = GridPatchValidnDataset(csv_file=cfg['data_path_valid'],
                                           root_dir=cfg['root_dir'])
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=batch_size_train,
                                  shuffle=True,
                                  num_workers=num_workers)
    dataloader_valid = DataLoader(dataset_valid,
                                  batch_size=batch_size_valid,
                                  num_workers=num_workers)
    summary_train = {'epoch': 0, 'step': 0}
    summary_valid = {'loss': float('inf'), 'acc': 0}
    summary_writer = SummaryWriter(comment='NCRF')
    loss_valid_best = 1000.0
    for epoch in range(cfg['epoch']):
        # Step-wise LR schedule hard-coded at epochs 5, 10 and 15.
        if epoch == 5:
            for param_group in optimizer.param_groups:
                param_group['lr'] = cfg['lr_1']
        if epoch == 10:
            for param_group in optimizer.param_groups:
                param_group['lr'] = cfg['lr_2']
        if epoch == 15:
            for param_group in optimizer.param_groups:
                param_group['lr'] = cfg['lr_3']
        summary_train = train_epoch(summary_train, summary_writer, cfg, model,
                                    loss_fn, optimizer, dataloader_train)
        # Save a per-epoch checkpoint regardless of validation result.
        fileName = 'train_' + str(summary_train['epoch']) + '.pkl'
        torch.save(
            {
                'epoch': summary_train['epoch'],
                'step': summary_train['step'],
                'state_dict': model.state_dict()
            }, os.path.join(args.save_path, fileName))
        time_now = time.time()
        summary_valid = valid_epoch(summary_valid, cfg, model, loss_fn,
                                    dataloader_valid)
        time_spent = time.time() - time_now
        logger.info('{}, Epoch : {}, Step : {}, Validation Loss : {:.5f}, '
                    'Validation Acc : {:.3f}, Run Time : {:.2f}'.format(
                        time.strftime("%Y-%m-%d %H:%M:%S"),
                        summary_train['epoch'], summary_train['step'],
                        summary_valid['loss'], summary_valid['acc'],
                        time_spent))
        summary_writer.add_scalar('valid/loss', summary_valid['loss'],
                                  summary_train['step'])
        summary_writer.add_scalar('valid/acc', summary_valid['acc'],
                                  summary_train['step'])
        # Keep the best-so-far model by validation loss.
        if summary_valid['loss'] < loss_valid_best:
            loss_valid_best = summary_valid['loss']
            torch.save(
                {
                    'epoch': summary_train['epoch'],
                    'step': summary_train['step'],
                    'state_dict': model.state_dict()
                }, os.path.join(args.save_path, 'best.pkl'))
    summary_writer.close()
class Train:
    """Callable training driver for the LaneNet model.

    Stores a model, a ``{'train': iterable}`` data source and loss/optimizer
    configuration; invoking the instance runs the full training loop, writing
    per-step losses to a log file and a state-dict snapshot per epoch.
    """

    def __init__(self, model, data, epoch, batch_size, loss, loss_params,
                 ops_params, lr=5e-4, optimizer='adam', mode='parallel',
                 continue_train=False, save=None):
        # Plain attribute capture; nothing is validated or moved to a device
        # until _train() runs.
        self.model = model
        self.data = data
        self.epoch = epoch
        self.batch_size = batch_size
        self.loss = loss
        self.loss_params = loss_params
        self.ops_params = ops_params
        self.lr = lr
        self.optimizer = optimizer
        self.mode = mode
        self.continue_train = continue_train
        self.save = save

    def _train(self):
        """Place the model, build the optimizer, and run the epoch loop."""
        # --- device placement (single GPU or DataParallel) ---
        if self.mode == 'gpu':
            device = torch.device('cuda', 0)
            if self.continue_train == True:
                self.model.load_state_dict(torch.load(self.save))
            self.model = self.model.to(device)
        elif self.mode == 'parallel':
            gpu_count = torch.cuda.device_count()
            self.model = DataParallel(self.model,
                                      device_ids=list(range(gpu_count)))
            if self.continue_train == True:
                self.model.load_state_dict(torch.load(self.save))
            self.model = self.model.cuda()
        self.model = self.model.train()

        params = self.model.parameters()
        optimizer_cls = self._create_optimizer()
        optimizer = optimizer_cls(params, lr=self.lr, **self.ops_params)

        start_time = int(time.time())
        log = open('./logs/loggings/LaneNet_{}.txt'.format(start_time), 'w')
        step = 0
        for epoch_idx in range(self.epoch):
            for sample in self.data['train']:
                tic = time.time()
                image_batch = sample[0]
                seg_mask = sample[1]
                instance_mask = sample[2]
                image_batch = image_batch.cuda()
                seg_mask = seg_mask.cuda()
                instance_mask = instance_mask.cuda()
                predictions, embeddings = self.model(image_batch)
                # self.loss builds a callable whose invocation yields the
                # three loss terms.
                total_loss = self.loss(self.batch_size, predictions, seg_mask,
                                       embeddings, instance_mask,
                                       **self.loss_params)
                total_loss, segmentation_loss, discriminative_loss = total_loss()
                log.write(
                    'Steps:{}, Total Loss:{}, Segmentation Loss:{}, Discriminative Loss:{}\n'
                    .format(step, total_loss, segmentation_loss,
                            discriminative_loss))
                log.flush()
                optimizer.zero_grad()
                total_loss.backward()
                # Clip gradients element-wise to stabilize training.
                clip_grad_value_(params, clip_value=5.)
                optimizer.step()
                step += 1
                toc = time.time()
                print(
                    "step time:{}, seg_loss:{:.6f}, dis_loss:{:.6f}\n".format(
                        toc - tic, segmentation_loss, discriminative_loss))
            # One snapshot per epoch, tagged with the run's start timestamp.
            torch.save(
                self.model.state_dict(),
                os.path.join('./logs/models',
                             'model_1_{}_{}.pkl'.format(start_time,
                                                        epoch_idx)))
        log.close()

    def _create_optimizer(self):
        """Map the configured optimizer name onto its torch class."""
        return {'adam': torch.optim.Adam,
                'sgd': torch.optim.SGD}.get(self.optimizer)

    def __call__(self):
        self._train()
def test_model(test_dataset, test_num_each):
    """Evaluate the resnet_lstm phase classifier on the test set.

    Builds sequence windows, averages the 7-way outputs over each window,
    accumulates loss/accuracy, pickles the per-window predictions to
    ``pred_name`` and prints a summary.  Uses module-level config:
    sequence_length, test_batch_size, workers, use_gpu, model_name,
    pred_name.
    """
    num_test = len(test_dataset)
    # Start indices from which a full window of sequence_length frames fits.
    test_useful_start_idx = get_useful_start_idx(sequence_length,
                                                 test_num_each)
    num_test_we_use = len(test_useful_start_idx)
    # num_test_we_use = 804
    # num_test_we_use = len(test_useful_start_idx) // (test_batch_size // sequence_length) * (
    #     test_batch_size // sequence_length)
    test_we_use_start_idx = test_useful_start_idx[0:num_test_we_use]
    # Expand each start index into a contiguous window of frame indices.
    test_idx = []
    for i in range(num_test_we_use):
        for j in range(sequence_length):
            test_idx.append(test_we_use_start_idx[i] + j)
    num_test_all = len(test_idx)
    print('num test start idx : {:6d}'.format(len(test_useful_start_idx)))
    print('last idx test start: {:6d}'.format(test_useful_start_idx[-1]))
    print('num of test dataset: {:6d}'.format(num_test))
    print('num of test we use : {:6d}'.format(num_test_we_use))
    print('num of all test use: {:6d}'.format(num_test_all))
    test_loader = DataLoader(test_dataset,
                             batch_size=test_batch_size,
                             sampler=test_idx,
                             num_workers=workers,
                             pin_memory=False)
    model = resnet_lstm()
    model.load_state_dict(torch.load(model_name))
    # NOTE(review): `model` is a fresh resnet_lstm(), not a DataParallel
    # wrapper, so `.module` likely raises AttributeError unless resnet_lstm
    # defines that attribute itself — verify before running.
    model = model.module
    model = DataParallel(model)
    if use_gpu:
        model = model.cuda()
    # should be able to run multi-GPU directly
    # model = model.module  # needs testing
    criterion = nn.CrossEntropyLoss(size_average=False)
    model.eval()
    test_loss = 0.0
    test_corrects = 0
    test_start_time = time.time()
    all_preds = []
    for data in test_loader:
        inputs, labels_1, labels_2 = data
        # Keep only the last frame's phase label for each window.
        labels_2 = labels_2[(sequence_length - 1)::sequence_length]
        if use_gpu:
            inputs = Variable(inputs.cuda(), volatile=True)
            labels = Variable(labels_2.cuda(), volatile=True)
        else:
            inputs = Variable(inputs, volatile=True)
            labels = Variable(labels_2, volatile=True)
        outputs = model.forward(inputs)
        # Average the 7-way logits over each window of frames.
        outputs = outputs.view(-1, sequence_length, 7)
        outputs = torch.mean(outputs, 1)
        _, preds = torch.max(outputs.data, 1)
        for i in range(len(preds)):
            all_preds.append(preds[i])
        print(len(all_preds))
        loss = criterion(outputs, labels)
        test_loss += loss.data[0]
        test_corrects += torch.sum(preds == labels.data)
    test_elapsed_time = time.time() - test_start_time
    test_accuracy = test_corrects / num_test_we_use
    test_average_loss = test_loss / num_test_we_use
    print('type of all_preds:', type(all_preds))
    print('leng of all preds:', len(all_preds))
    # Persist per-window predictions for downstream analysis.
    with open(pred_name, 'wb') as f:
        pickle.dump(all_preds, f)
    print('test elapsed: {:2.0f}m{:2.0f}s'
          ' test loss: {:4.4f}'
          ' test accu: {:.4f}'.format(test_elapsed_time // 60,
                                      test_elapsed_time % 60,
                                      test_average_loss, test_accuracy))
    def train(self):
        """Source-free domain adaptation loop with a prototype generator.

        Phase 1 (epochs < generator_epoch): trains the conditional feature
        generator against the frozen source classifier.  Phase 2: generates
        class-prototype features each epoch, derives pseudo labels and
        confidence weights for the target data, and adapts the source
        network with a discriminator and a feature contrastor.  Saves the
        generator once and the best-accuracy model throughout.
        Assumes two GPUs (device_ids=[0, 1]) — TODO confirm.
        """
        torch.multiprocessing.set_sharing_strategy('file_system')
        path = self.args.data_path
        label_file = self.args.label_path
        self.logger.info('original train process')
        time_stamp_launch = time.strftime('%Y%m%d') + '-' + time.strftime(
            '%H%M')
        self.logger.info(path.split('/')[-2] + time_stamp_launch)
        best_acc = 0
        model_root = './model_' + path.split('/')[-2]
        if not os.path.exists(model_root):
            os.mkdir(model_root)
        cuda = True
        cudnn.benchmark = True
        batch_size = self.args.batchsize
        batch_size_g = batch_size * 2  # generator uses double-size batches
        image_size = (224, 224)
        num_cls = self.args.num_class
        self.generator_epoch = self.args.generator_epoch
        self.warm_epoch = 10
        n_epoch = self.args.max_epoch
        weight_decay = 1e-6
        momentum = 0.9
        # Random (not fixed) seed; runs are not reproducible by design here.
        manual_seed = random.randint(1, 10000)
        random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        #######################
        #     load data       #
        #######################
        target_train = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomCrop((224, 224)),
            transforms.RandomHorizontalFlip(),
            AutoAugment(),
            transforms.ToTensor(),
            transforms.Normalize((0.435, 0.418, 0.396),
                                 (0.284, 0.308, 0.335)),  # grayscale mean/std
        ])
        dataset_train = visDataset_target(path,
                                          label_file,
                                          train=True,
                                          transform=target_train)
        dataloader_train = torch.utils.data.DataLoader(dataset=dataset_train,
                                                       batch_size=batch_size,
                                                       shuffle=True,
                                                       num_workers=3)
        transform_test = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize((0.435, 0.418, 0.396),
                                 (0.284, 0.308, 0.335)),  # grayscale mean/std
        ])
        # NOTE(review): the evaluation loader is built with train=True —
        # confirm it is intended to score the (un-augmented) train split.
        test_dataset = visDataset_target(path,
                                         label_file,
                                         train=True,
                                         transform=transform_test)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=3)
        #####################
        #    load model     #
        #####################
        # Memory bank over all target samples; elr_loss is hard-coded to 12
        # classes — TODO confirm vs self.args.num_class.
        self.lemniscate = LinearAverage(2048, test_dataset.__len__(), 0.05,
                                        0.00).cuda()
        self.elr_loss = elr_loss(num_examp=test_dataset.__len__(),
                                 num_classes=12).cuda()
        generator = generator_fea_deconv(class_num=num_cls)
        discriminator = Discriminator_fea()
        source_net = torch.load(self.args.source_model_path)
        source_classifier = Classifier(num_classes=num_cls)
        fea_contrastor = contrastor()
        # load pre-trained source classifier (copy matching keys only)
        fc_dict = source_classifier.state_dict()
        pre_dict = source_net.state_dict()
        pre_dict = {k: v for k, v in pre_dict.items() if k in fc_dict}
        fc_dict.update(pre_dict)
        source_classifier.load_state_dict(fc_dict)
        generator = DataParallel(generator, device_ids=[0, 1])
        discriminator = DataParallel(discriminator, device_ids=[0, 1])
        fea_contrastor = DataParallel(fea_contrastor, device_ids=[0, 1])
        source_net = DataParallel(source_net, device_ids=[0, 1])
        source_classifier = DataParallel(source_classifier, device_ids=[0, 1])
        source_classifier.eval()
        for p in generator.parameters():
            p.requires_grad = True
        for p in source_net.parameters():
            p.requires_grad = True
        # freezing the source classifier (fc layers of the source net)
        for name, value in source_net.named_parameters():
            if name[:9] == 'module.fc':
                value.requires_grad = False
        # setup optimizer parameter groups with per-module learning rates:
        # discriminator 3x, contrastor 5x, extractor 1x.
        params = filter(lambda p: p.requires_grad, source_net.parameters())
        discriminator_group = []
        for k, v in discriminator.named_parameters():
            discriminator_group += [{'params': v, 'lr': self.lr * 3}]
        model_params = []
        for v in params:
            model_params += [{'params': v, 'lr': self.lr}]
        contrastor_para = []
        for k, v in fea_contrastor.named_parameters():
            contrastor_para += [{'params': v, 'lr': self.lr * 5}]
        #####################
        #  setup optimizer  #
        #####################
        # only train the extractor
        optimizer = optim.SGD(model_params + discriminator_group +
                              contrastor_para,
                              momentum=momentum,
                              weight_decay=weight_decay)
        optimizer_g = optim.SGD(generator.parameters(),
                                lr=self.lr,
                                momentum=momentum,
                                weight_decay=weight_decay)
        loss_gen_ce = torch.nn.CrossEntropyLoss()
        if cuda:
            source_net = source_net.cuda()
            generator = generator.cuda()
            discriminator = discriminator.cuda()
            fea_contrastor = fea_contrastor.cuda()
            loss_gen_ce = loss_gen_ce.cuda()
            source_classifier = source_classifier.cuda()
        #############################
        #     training network      #
        #############################
        len_dataloader = len(dataloader_train)
        self.logger.info('the step of one epoch: ' + str(len_dataloader))
        current_step = 0
        for epoch in range(n_epoch):
            source_net.train()
            discriminator.train()
            fea_contrastor.train()
            data_train_iter = iter(dataloader_train)
            if epoch < self.generator_epoch:
                # Phase 1: train the prototype generator only.
                generator.train()
                self.train_prototype_generator(epoch, batch_size_g, num_cls,
                                               optimizer_g, generator,
                                               source_classifier, loss_gen_ce)
            if epoch >= self.generator_epoch:
                if epoch == self.generator_epoch:
                    # Snapshot the generator once, at the phase switch.
                    torch.save(
                        generator, model_root + '/generator_' +
                        path.split('/')[-2] + '.pkl')
                # prototype generation (generator frozen for this phase)
                generator.eval()
                z = Variable(torch.rand(self.args.num_class * 2, 100)).cuda()
                # Get labels ranging from 0 to n_classes for n rows
                label_t = torch.linspace(0, num_cls - 1, steps=num_cls).long()
                for ti in range(self.args.num_class * 2 // num_cls - 1):
                    label_t = torch.cat([
                        label_t,
                        torch.linspace(0, num_cls - 1, steps=num_cls).long()
                    ])
                labels = Variable(label_t).cuda()
                z = z.contiguous()
                labels = labels.contiguous()
                images = generator(z, labels)
                # Anneal alpha from 0.9 down to 0.7 over the adaptation phase.
                self.alpha = 0.9 - (epoch - self.generator_epoch) / (
                    n_epoch - self.generator_epoch) * 0.2
                # obtain the target pseudo label and confidence weight
                pseudo_label, pseudo_label_acc, all_indx, confidence_weight = self.obtain_pseudo_label_and_confidence_weight(
                    test_loader, source_net)
                i = 0
                while i < len_dataloader:
                    ###################################
                    #      prototype adaptation       #
                    ###################################
                    # GRL-style schedule factor in (0, 1).
                    p = float(i + (epoch - self.generator_epoch) *
                              len_dataloader) / (
                                  n_epoch -
                                  self.generator_epoch) / len_dataloader
                    self.p = 2. / (1. + np.exp(-10 * p)) - 1
                    # NOTE(review): .next() is the removed Python-2 iterator
                    # protocol — works only on old torch loaders.
                    data_target_train = data_train_iter.next()
                    s_img, s_label, s_indx = data_target_train
                    batch_size_s = len(s_label)
                    input_img_s = torch.FloatTensor(batch_size_s, 3,
                                                    image_size[0],
                                                    image_size[1])
                    class_label_s = torch.LongTensor(batch_size_s)
                    if cuda:
                        s_img = s_img.cuda()
                        s_label = s_label.cuda()
                        input_img_s = input_img_s.cuda()
                        class_label_s = class_label_s.cuda()
                    input_img_s.resize_as_(s_img).copy_(s_img)
                    class_label_s.resize_as_(s_label).copy_(s_label)
                    target_inputv_img = Variable(input_img_s)
                    target_classv_label = Variable(class_label_s)
                    # learning rate decay
                    optimizer = self.exp_lr_scheduler(optimizer=optimizer,
                                                      step=current_step)
                    loss, contrastive_loss = self.adaptation_step(
                        target_inputv_img, pseudo_label, images.detach(),
                        labels, s_indx.numpy(), source_net, discriminator,
                        fea_contrastor, optimizer, epoch,
                        confidence_weight.float())
                    # visualization on tensorboard
                    self.writer.add_scalar('contrastive_loss',
                                           contrastive_loss,
                                           global_step=current_step)
                    self.writer.add_scalar('overall_loss',
                                           loss,
                                           global_step=current_step)
                    self.writer.add_scalar('pseudo_label_acc',
                                           pseudo_label_acc,
                                           global_step=current_step)
                    i += 1
                    current_step += 1
                self.logger.info('epoch: %d' % epoch)
                self.logger.info('contrastive_loss: %f' % (contrastive_loss))
                self.logger.info('loss: %f' % loss)
                accu, ac_list = val_pclass(source_net, test_loader)
                self.writer.add_scalar('test_acc',
                                       accu,
                                       global_step=current_step)
                self.logger.info(ac_list)
                # Keep the best model by overall accuracy (>= keeps latest tie).
                if accu >= best_acc:
                    self.logger.info('saving the best model!')
                    torch.save(
                        source_net, model_root + '/' + time_stamp_launch +
                        '_best_model_' + path.split('/')[-2] + '.pkl')
                    best_acc = accu
                self.logger.info('acc is : %.04f, best acc is : %.04f' %
                                 (accu, best_acc))
                self.logger.info(
                    '================================================')
        self.logger.info('training done! ! !')
def main():
    """Entry point for training a steel-defect segmentation model.

    Parses CLI args, prepares log/model output directories, builds the
    network (optionally with an EMA shadow copy), optionally resumes from a
    checkpoint, builds stratified train/valid loaders from train.csv, then
    runs the epoch loop: train -> validate -> track best dice -> save
    checkpoints. All results go under RESULT_DIR/{logs,models}/<out_dir>/fold<k>.
    """
    args = parser.parse_args()

    # Per-fold log directory and append-mode training log.
    log_out_dir = opj(RESULT_DIR, 'logs', args.out_dir, f'fold{args.fold}')
    if not ope(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(opj(log_out_dir, 'log.train.txt'), mode='a')

    # Per-fold checkpoint directory.
    model_out_dir = opj(RESULT_DIR, 'models', args.out_dir, f'fold{args.fold}')
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not ope(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True

    # set random seeds (re-seeded per epoch below, so this mainly fixes setup)
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    model_params = {}
    model_params['architecture'] = args.arch
    model = init_network(model_params)

    # move network to gpu
    model = DataParallel(model)
    model.cuda()

    # Optional exponential-moving-average copy of the model, validated/saved
    # separately from the raw model.
    if args.ema:
        ema_model = copy.deepcopy(model)
        ema_model.cuda()
    else:
        ema_model = None

    # define loss function (criterion)
    # NOTE(review): eval() on a CLI string — assumes args.loss names a class
    # defined in this module; unsafe for untrusted input.
    try:
        criterion = eval(args.loss)().cuda()
    except:
        raise (RuntimeError("Loss {} not available!".format(args.loss)))

    start_epoch = 0
    best_epoch = 0
    best_dice = 0
    best_dice_arr = np.zeros(3)  # scores of the top-3 checkpoints kept on disk

    # define scheduler (also instantiated via eval of a CLI string)
    try:
        scheduler = eval(args.scheduler)()
    except:
        raise (RuntimeError("Scheduler {} not available!".format(
            args.scheduler)))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]

    # optionally resume from a checkpoint
    if args.resume:
        model_fpath = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(model_fpath):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(model_fpath))
            checkpoint = torch.load(model_fpath)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            best_dice_arr = checkpoint['best_dice_arr']
            best_dice = np.max(best_dice_arr)
            model.module.load_state_dict(checkpoint['state_dict'])

            # Optimizer state is stored next to the model as *_optim.pth.
            optimizer_fpath = model_fpath.replace('.pth', '_optim.pth')
            if ope(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                    optimizer_fpath))
                optimizer.load_state_dict(
                    torch.load(optimizer_fpath)['optimizer'])
            # EMA weights are stored as *_ema.pth.
            if args.ema:
                ema_model_fpath = model_fpath.replace('.pth', '_ema.pth')
                if ope(ema_model_fpath):
                    log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                        ema_model_fpath))
                    ema_model.module.load_state_dict(
                        torch.load(ema_model_fpath)['state_dict'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                model_fpath, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(model_fpath))

    # Data loading code
    train_transform = eval(args.train_transform)
    steel_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    # 'ImageId_ClassId' is "<image>_<class>"; split into two columns.
    steel_df['ImageId'], steel_df['ClassId'] = zip(
        *steel_df['ImageId_ClassId'].apply(lambda x: x.split('_')))
    # One row per image, one RLE-mask column per defect class (NaN if absent).
    steel_df = pd.pivot_table(steel_df,
                              index='ImageId',
                              columns='ClassId',
                              values='EncodedPixels',
                              aggfunc=lambda x: x,
                              dropna=False)
    steel_df = steel_df.reset_index()
    steel_df.columns = [str(i) for i in steel_df.columns.values]
    # Number of defect classes present per image; used to build a
    # stratification label for the train/valid split.
    steel_df['class_count'] = steel_df[['1', '2', '3', '4']].count(axis=1)
    steel_df['split_label'] = steel_df[['1', '2', '3', '4', 'class_count'
                                        ]].apply(lambda x: make_split_label(x),
                                                 axis=1)
    train_idx, valid_idx, _, _ = train_test_split(steel_df.index,
                                                  steel_df['split_label'],
                                                  test_size=0.2,
                                                  random_state=43)
    train_dataset = SteelDataset(
        steel_df.iloc[train_idx],
        img_size=args.img_size,
        mask_size=args.img_size,
        transform=train_transform,
        return_label=True,
        dataset='train',
    )
    # Either class-balanced oversampling or plain random sampling.
    if args.is_balance:
        train_sampler = BalanceClassSampler(
            train_dataset, args.sample_times * len(train_dataset))
    else:
        train_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.batch_size,
        drop_last=True,
        num_workers=args.workers,
        pin_memory=True,
    )
    # valid_split_file = opj(DATA_DIR, args.split_type, args.split_name, 'random_valid_cv%d.csv' % args.fold)
    valid_dataset = SteelDataset(
        steel_df.iloc[valid_idx],
        img_size=args.img_size,
        mask_size=args.img_size,
        transform=None,
        return_label=True,
        dataset='val',
    )
    # Validation uses half the train batch size (at least 1), no shuffling.
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=max(int(args.batch_size // 2), 1),
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)

    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch iter rate | smooth_loss/dice | valid_loss/dice | best_epoch/best_score | min \n'
    )
    log.write(
        '------------------------------------------------------------------------------------------------\n'
    )

    start_epoch += 1
    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()

        # set manual seeds per epoch (reproducible augmentation per epoch)
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]

        # train for one epoch on train set
        iter, train_loss, train_dice = train(train_loader,
                                             model,
                                             ema_model,
                                             criterion,
                                             optimizer,
                                             epoch,
                                             args,
                                             lr=lr)
        # Validate the EMA model when enabled, otherwise the raw model.
        with torch.no_grad():
            if args.ema:
                valid_loss, valid_dice = validate(valid_loader, ema_model,
                                                  criterion, epoch)
            else:
                valid_loss, valid_dice = validate(valid_loader, model,
                                                  criterion, epoch)

        # remember best loss and save checkpoint (ties count as improvement)
        is_best = valid_dice >= best_dice
        if is_best:
            best_epoch = epoch
            best_dice = valid_dice

        # Keep the top-scoring epochs on disk; best_dice_arr is updated from
        # the non-EMA call's return value.
        if args.ema:
            save_top_epochs(model_out_dir,
                            ema_model,
                            best_dice_arr,
                            valid_dice,
                            best_epoch,
                            epoch,
                            best_dice,
                            ema=True)
        best_dice_arr = save_top_epochs(model_out_dir,
                                        model,
                                        best_dice_arr,
                                        valid_dice,
                                        best_epoch,
                                        epoch,
                                        best_dice,
                                        ema=False)

        print('\r', end='', flush=True)
        log.write('%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f | %6.1f %6.4f | %3.1f min \n' % \
            (epoch, iter + 1, lr, train_loss, train_dice, valid_loss,
             valid_dice, best_epoch, best_dice, (time.time() - end) / 60))

        # Per-epoch checkpoint (EMA and raw variants).
        model_name = '%03d' % epoch
        if args.ema:
            save_model(ema_model,
                       model_out_dir,
                       epoch,
                       model_name,
                       best_dice_arr,
                       is_best=is_best,
                       optimizer=optimizer,
                       best_epoch=best_epoch,
                       best_dice=best_dice,
                       ema=True)
        save_model(model,
                   model_out_dir,
                   epoch,
                   model_name,
                   best_dice_arr,
                   is_best=is_best,
                   optimizer=optimizer,
                   best_epoch=best_epoch,
                   best_dice=best_dice,
                   ema=False)
def train(args, model, optimizer, dataloader, dataloader_val, dataset_size,
          num_epochs, save_path):
    """Train an angle-regression model and checkpoint the best validator.

    Runs ``num_epochs`` epochs; each epoch does a training pass (L1 loss on
    predicted vs. target angles) followed by a validation pass via ``val``.
    Whenever the validation error improves, the model's state_dict is saved
    to ``save_path`` as ``3DUnet_<epoch>_<error>.pth``.

    Args:
        args: namespace with at least ``gpu_devices`` (comma-separated ids).
        model: network to train (wrapped in DataParallel when >1 GPU).
        optimizer: optimizer stepping the model's parameters.
        dataloader: training loader yielding dicts with 'image', 'angle', 'name'.
        dataloader_val: validation loader passed through to ``val``.
        dataset_size: number of training samples (for loss averaging / tqdm).
        num_epochs: total epochs to run.
        save_path: directory for best-model checkpoints.

    Side effects: appends to log/regAASCE.log, prints progress, writes
    checkpoint files. Returns None.
    """
    # Lowest validation error seen so far (lower is better).
    best_error = float('inf')
    logging.basicConfig(level=logging.INFO,
                        format='%(message)s',
                        filename='log/regAASCE.log',
                        filemode='a')
    logging.info(f'regAASCE Using Densent\n')
    gpu_nums = len(args.gpu_devices.split(','))
    print('Using %d gpus' % gpu_nums)
    if gpu_nums > 1:
        model = DataParallel(model)
    model.cuda()

    for epoch in range(num_epochs):
        s1 = time.time()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            if phase == 'train':
                with tqdm(total=dataset_size,
                          desc=f'Epoch{epoch + 1}/{num_epochs}',
                          unit='img') as pbar:
                    for i_batch, sample_batched in enumerate(dataloader):
                        inputs = Variable(
                            sample_batched['image'].float()).cuda()
                        label = Variable(sample_batched['angle']).cuda()
                        optimizer.zero_grad()
                        # forward: mean absolute error on the angles
                        outputs = model(inputs)
                        loss = torch.mean(torch.abs(label - outputs))
                        # backward
                        loss.backward()
                        optimizer.step()
                        pbar.set_postfix(**{'loss (batch)': loss.item()})
                        pbar.update(sample_batched['image'].shape[0])
                        # BUGFIX: accumulate the detached scalar, not the
                        # tensor — accumulating `loss` kept every batch's
                        # autograd graph alive for the whole epoch (memory
                        # leak) and logged a tensor instead of a number.
                        running_loss += loss.item() * inputs.size(0)
                epoch_loss = running_loss / dataset_size
                print('epoch: %d, loss %.5f' % (epoch, epoch_loss))
                s2 = time.time()
                print('Train complete in %.0f m %.0f s' %
                      ((s2 - s1) // 60, (s2 - s1) % 60))
                logging.info(f'{epoch + 1}/{num_epochs} loss: {epoch_loss}')
            else:
                # Validation: `val` returns the mean angle error.
                error = val(model, dataloader_val, args)
                printline = 'Mean error: %.4f' % error
                print(printline)
                logging.info(printline)
                if error < best_error:
                    best_error = error
                    # Unwrap DataParallel so the checkpoint loads without it.
                    if gpu_nums > 1:
                        torch.save(
                            model.module.state_dict(),
                            os.path.join(
                                save_path, '3DUnet_%d_%.4f.pth' %
                                (epoch + 1, best_error)))
                    else:
                        torch.save(
                            model.state_dict(),
                            os.path.join(
                                save_path, '3DUnet_%d_%.4f.pth' %
                                (epoch + 1, best_error)))
def main():
    """Entry point for batch inference / submission generation (SIIM).

    Loads a trained checkpoint (optionally the EMA variant), selects a test
    split by ``args.dataset``, and runs ``predict`` once per requested
    test-time augmentation, writing outputs under a per-augmentation
    subdirectory of the submission folder.
    """
    args = parser.parse_args()

    # Per-fold log directory and append-mode submission log.
    log_out_dir = opj(RESULT_DIR, 'logs', args.out_dir, 'fold%d' % args.fold)
    if not ope(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(opj(log_out_dir, 'log.submit.txt'), mode='a')

    # Pick EMA or raw checkpoint for the requested epoch.
    if args.ema:
        network_path = opj(RESULT_DIR, 'models', args.out_dir,
                           'fold%d' % args.fold,
                           '%s_ema.pth' % args.predict_epoch)
    else:
        network_path = opj(RESULT_DIR, 'models', args.out_dir,
                           'fold%d' % args.fold, '%s.pth' % args.predict_epoch)

    submit_out_dir = opj(RESULT_DIR, 'submissions', args.out_dir,
                         'fold%d' % args.fold,
                         'epoch_%s' % args.predict_epoch)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        submit_out_dir))
    if not ope(submit_out_dir):
        os.makedirs(submit_out_dir)

    # setting up the visible GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

    # Validate requested test-time augmentations before any heavy work.
    args.augment = args.augment.split(',')
    for augment in args.augment:
        if augment not in augment_list:
            raise ValueError(
                'Unsupported or unknown test augmentation: {}!'.format(
                    augment))

    model_params = {}
    model_params['architecture'] = args.arch
    model = init_network(model_params)

    log.write(">> Loading network:\n>>>> '{}'\n".format(network_path))
    checkpoint = torch.load(network_path)
    model.load_state_dict(checkpoint['state_dict'])
    log.write(">>>> loaded network:\n>>>> epoch {}\n".format(
        checkpoint['epoch']))

    # moving network to gpu and eval mode
    model = DataParallel(model)
    model.cuda()
    model.eval()

    # Data loading code: map the dataset name to its split CSV.
    dataset = args.dataset
    if dataset == 'train':
        test_split_file = opj(DATA_DIR, args.split_type, 'train.csv')
    elif dataset == 'test':
        test_split_file = opj(DATA_DIR, args.split_type, 'test.csv')
    elif dataset == 'val':
        test_split_file = opj(DATA_DIR, args.split_type, args.split_name,
                              'random_valid_cv%d.csv' % args.fold)
    elif dataset == 'nih':
        test_split_file = opj(DATA_DIR, args.split_type, 'nih_112120.csv')
    elif dataset == 'chexpert':
        test_split_file = opj(DATA_DIR, args.split_type,
                              'chexpert_188521.csv')
    else:
        raise ValueError('Unsupported or unknown dataset: {}!'.format(dataset))
    test_dataset = SiimDataset(
        test_split_file,
        img_size=args.img_size,
        mask_size=args.img_size,
        transform=None,
        return_label=False,
        crop_version=args.crop_version,
        dataset=args.dataset,
        predict_pos=args.predict_pos,
    )
    test_loader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )
    # One prediction pass per augmentation; the dataset's transform is
    # swapped in place and the matching un-augment function undoes it on
    # the predicted masks. NOTE(review): eval() resolves the augment
    # functions by name — assumes augment_*/unaugment_* exist in scope.
    for augment in args.augment:
        test_loader.dataset.transform = eval('augment_%s' % augment)
        unaugment_func = eval('unaugment_%s' % augment)
        sub_submit_out_dir = opj(submit_out_dir, augment)
        if not ope(sub_submit_out_dir):
            os.makedirs(sub_submit_out_dir)
        with torch.no_grad():
            predict(test_loader,
                    model,
                    sub_submit_out_dir,
                    dataset,
                    args,
                    unaugment_func=unaugment_func)
def main():
    """Entry point for training a binary mitotic-spindle cell classifier.

    Builds the model (optionally warm-started from an image-level DenseNet
    checkpoint with a re-initialized logit head), assembles cell-level
    positive/negative label sets from cherry-picked CSVs plus out-of-fold
    predictions, and runs the train/validate loop tracking the best
    validation PR-AUC.
    """
    args = parser.parse_args()

    # Per-fold log directory and append-mode training log.
    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')

    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    if not args.all_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False

    # set random seeds (re-seeded per epoch in the loop below)
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    # Single-logit (binary) network.
    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = 1
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder
    model = init_network(model_params)
    if args.load_state_dict_path is not None:
        if args.load_state_dict_path == 'use-img-level-densenet-ckpt':
            # Shortcut value selecting a known image-level DenseNet checkpoint.
            model_dir = '../output/models/densenet121_1024_all_data__obvious_neg__gradaccum_20__start_lr_3e6'
            pretrained_ckpt_path = os.path.join(f'{model_dir}',
                                                f'fold{args.fold}',
                                                'final.pth')
        else:
            pretrained_ckpt_path = args.load_state_dict_path
        init_pretrained = torch.load(pretrained_ckpt_path)
        if args.load_as_is:
            model.load_state_dict(init_pretrained['state_dict'])
        else:
            # Load everything except the classification head; the logit
            # weight/bias get placeholder random tensors (shape assumes a
            # 1024-dim feature — TODO confirm matches args.arch), then the
            # weight is properly re-initialized below.
            model.load_state_dict({
                key: (val if key not in {'logit.weight', 'logit.bias'} else
                      torch.rand([1, 1024] if key ==
                                 'logit.weight' else [1]))
                for key, val in init_pretrained['state_dict'].items()
            })
            torch.nn.init.xavier_uniform(model.logit.weight)

    if args.all_gpus:
        model = DataParallel(model)
    model.cuda()

    # define loss function (criterion)
    # NOTE(review): eval() on CLI strings here and for the scheduler —
    # assumes they name classes defined in this module.
    try:
        criterion = eval(args.loss)().cuda()
    except:
        raise (RuntimeError("Loss {} not available!".format(args.loss)))

    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_val_pr_auc_score = 0

    # define scheduler
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except:
        raise (RuntimeError("Scheduler {} not available!".format(
            args.scheduler)))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]

    # Data loading code
    train_transform = train_multi_augment2
    # Precomputed image-level fold split: fold -> (train paths, valid paths).
    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]

    # Map image base path -> one-hot label vector (train + public HPA data).
    train_df = get_train_df_ohe(clean_from_duplicates=True)
    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'],
                            train_df.iloc[:, 2:].values)
    }

    public_hpa_df_17 = get_public_df_ohe(clean_from_duplicates=True)
    public_basepath_2_ohe_vector = {
        img_path: vec
        for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                 public_hpa_df_17.iloc[:, 2:].values)
    }
    basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)

    # Keep only fold paths that still exist in the combined dataframes.
    available_paths = set(
        np.concatenate((train_df['img_base_path'].values,
                        public_hpa_df_17['img_base_path'].values)))
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]

    labels_df = pd.read_hdf(args.cell_level_labels_path)

    # modifying minor class labels: merge manually cherry-picked mitotic
    # cells with NN-suggested ones into one (ID, cell_i) set.
    cherrypicked_mitotic_spindle = pd.read_csv(
        '../input/mitotic_cells_selection.csv')
    cherrypicked_mitotic_spindle_img_cell = set(
        cherrypicked_mitotic_spindle[['ID',
                                      'cell_i']].apply(tuple, axis=1).values)
    # CSV cell indices are 1-based; shift to 0-based to match labels_df.
    cherrypicked_mitotic_spindle_img_cell = {
        (img, cell_i - 1)
        for img, cell_i in cherrypicked_mitotic_spindle_img_cell
    }

    class_names = get_class_names()
    mitotic_spindle_class_i = class_names.index('Mitotic spindle')

    cherrypicked_mitotic_spindle_based_on_nn = pd.read_csv(
        '../input/mitotic_pos_nn_added.csv')
    cherrypicked_mitotic_spindle_img_cell.update(
        set(cherrypicked_mitotic_spindle_based_on_nn[['ID', 'cell_i'
                                                      ]].apply(tuple,
                                                               axis=1).values))

    # Negatives: every labeled cell not in the cherry-picked mitotic set.
    mitotic_bool_idx = labels_df.index.isin(
        cherrypicked_mitotic_spindle_img_cell)
    negative_img_ids_cell = labels_df.index[np.logical_not(
        mitotic_bool_idx)].values

    # Positives: cells with low out-of-fold mitotic prediction score.
    # NOTE(review): '< 0.6' selecting *positives* looks inverted — confirm
    # the semantics of the 'pred' column.
    dfs = []
    for fold in range(5):
        dfs.append(pd.read_csv(f'../output/mitotic_pred_fold_{fold}.csv'))
    pred_df = pd.concat(dfs)
    pred_df.set_index(['ID', 'cell_i'], inplace=True)
    positive_img_ids_cell = pred_df.index[pred_df['pred'] < 0.6].values

    if args.ignore_negative:
        raise NotImplementedError

    train_dataset = ProteinMitoticDatasetCellSeparateLoading(
        trn_img_paths,
        positive_img_ids_cell,
        negative_img_ids_cell,
        in_channels=args.in_channels,
        transform=train_transform,
        target_raw_img_size=args.target_raw_img_size)

    # Balanced sampling over the heavily skewed positive/negative classes.
    train_loader = DataLoader(
        train_dataset,
        sampler=MitoticBalancingSubSampler(train_dataset.img_ids_cell,
                                           train_dataset.id_cell_2_y),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )
    # Validation uses all positives plus a fixed 10k random negative sample.
    valid_dataset = ProteinMitoticDatasetCellSeparateLoading(
        val_img_paths,
        positive_img_ids_cell,
        sample(list(negative_img_ids_cell), 10000),
        img_size=args.img_size,
        in_channels=args.in_channels,
        target_raw_img_size=args.target_raw_img_size)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)

    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch iter rate | train_loss/acc | valid_loss/acc/pr_auc/--- |best_epoch/best_pr_auc| min \n'
    )
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n'
    )

    start_epoch += 1
    # Optional baseline evaluation before any training (logged as epoch -1).
    if args.eval_at_start:
        with torch.no_grad():
            valid_loss, valid_acc, val_pr_auc_score = validate(
                valid_loader, model, criterion, -1, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, val_pr_auc_score, -1,
             best_epoch, -1, -1))

    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()

        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]

        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)

        with torch.no_grad():
            valid_loss, valid_acc, val_pr_auc_score = validate(
                valid_loader, model, criterion, epoch, log)

        # remember best loss and save checkpoint; "best" is driven by PR-AUC
        is_best = val_pr_auc_score > best_val_pr_auc_score
        best_loss = min(valid_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_val_pr_auc_score = val_pr_auc_score if is_best else best_val_pr_auc_score

        print('\r', end='', flush=True)
        log.write('%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' % \
            (epoch, iter + 1, lr, train_loss, train_acc, valid_loss,
             valid_acc, val_pr_auc_score, -1, best_epoch,
             best_val_pr_auc_score, (time.time() - end) / 60))

        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_val_pr_auc_score)
class Session:
    """Training session for EMANet semantic segmentation.

    Owns the data loader, network (DataParallel-wrapped with synchronized-BN
    replication callback), SGD optimizer with per-parameter-group learning
    rates, TensorBoard writer, and checkpoint save/load. ``train_batch``
    performs one optimization step and the EMA update of the EMA-attention
    bases.
    """

    def __init__(self, dt_split):
        # Fixed seeds for reproducible runs.
        torch.manual_seed(66)
        torch.cuda.manual_seed_all(66)
        torch.cuda.set_device(settings.DEVICE)

        self.log_dir = settings.LOG_DIR
        self.model_dir = settings.MODEL_DIR
        ensure_dir(self.log_dir)
        ensure_dir(self.model_dir)
        logger.info('set log dir as %s' % self.log_dir)
        logger.info('set model dir as %s' % self.model_dir)

        # Global step counter (restored from checkpoints).
        self.step = 1
        self.writer = SummaryWriter(osp.join(self.log_dir, 'train.events'))
        dataset = TrainDataset(split=dt_split)
        self.dataloader = DataLoader(dataset,
                                     batch_size=settings.BATCH_SIZE,
                                     pin_memory=True,
                                     num_workers=settings.NUM_WORKERS,
                                     shuffle=True,
                                     drop_last=True)
        # Defined but unused in train_batch (the net computes its own loss);
        # kept for external callers — TODO confirm.
        self.crit = nn.CrossEntropyLoss(ignore_index=settings.IGNORE_LABEL,
                                        reduction='mean')

        self.net = EMANet(settings.N_CLASSES, settings.N_LAYERS)
        # Three parameter groups: backbone weights with weight decay,
        # backbone biases/norms without decay, and head params at 2x LR
        # (grouping by get_params key — semantics defined elsewhere).
        self.opt = SGD(params=[{
            'params': get_params(self.net, key='1x'),
            'lr': 1 * settings.LR,
            'weight_decay': settings.WEIGHT_DECAY,
        }, {
            'params': get_params(self.net, key='1y'),
            'lr': 1 * settings.LR,
            'weight_decay': 0,
        }, {
            'params': get_params(self.net, key='2x'),
            'lr': 2 * settings.LR,
            'weight_decay': 0.0,
        }],
                       momentum=settings.LR_MOM)
        self.net = DataParallel(self.net, device_ids=settings.DEVICES)
        # Sync-BN replication hook for DataParallel replicas.
        patch_replication_callback(self.net)
        self.net = self.net.cuda()

    def write(self, out):
        """Log scalar metrics to TensorBoard and the text logger.

        Mutates ``out`` by adding the current lr and step before printing.
        """
        for k, v in out.items():
            self.writer.add_scalar(k, v, self.step)

        out['lr'] = self.opt.param_groups[0]['lr']
        out['step'] = self.step
        outputs = ['{}: {:.4g}'.format(k, v) for k, v in out.items()]
        logger.info(' '.join(outputs))

    def save_checkpoints(self, name):
        """Save the unwrapped network weights and step counter to model_dir."""
        ckp_path = osp.join(self.model_dir, name)
        obj = {
            'net': self.net.module.state_dict(),
            'step': self.step,
        }
        torch.save(obj, ckp_path)

    def load_checkpoints(self, name):
        """Restore weights and step from a checkpoint; no-op if missing."""
        ckp_path = osp.join(self.model_dir, name)
        try:
            # Map storages straight onto the GPU.
            obj = torch.load(ckp_path,
                             map_location=lambda storage, loc: storage.cuda())
            logger.info('Load checkpoint %s' % ckp_path)
        except FileNotFoundError:
            logger.error('No checkpoint %s!' % ckp_path)
            return
        self.net.module.load_state_dict(obj['net'])
        self.step = obj['step']

    def train_batch(self, image, label):
        """One optimization step; returns the scalar loss value.

        The network returns (loss, mu); mu is averaged over the batch and
        folded into the EMA-attention bases with momentum EM_MOM, outside
        the autograd graph.
        """
        loss, mu = self.net(image, label)
        with torch.no_grad():
            # Moving-average update of the EMA unit's bases (order matters:
            # scale old bases, then add the new batch mean).
            mu = mu.mean(dim=0, keepdim=True)
            momentum = settings.EM_MOM
            self.net.module.emau.mu *= momentum
            self.net.module.emau.mu += mu * (1 - momentum)

        # DataParallel returns one loss per GPU; reduce to a scalar.
        loss = loss.mean()
        self.opt.zero_grad()
        #loss = self.crit(pred, label.long())
        loss.backward()
        self.opt.step()

        return loss.item()
# --- VCR training setup: loaders, model, optimizer, scheduler, resume ---
# Build loaders for all three splits from the already-constructed datasets.
train_loader = VCRLoader.from_dataset(train, **loader_params)
val_loader = VCRLoader.from_dataset(val, **loader_params)
test_loader = VCRLoader.from_dataset(test, **loader_params)

# How often (in batches) running stats are reset — used by the training
# loop elsewhere in this file.
ARGS_RESET_EVERY = 100
print("Loading {} for {}".format(params['model'].get('type', 'WTF?'),
                                 'rationales' if args.rationale else 'answer'),
      flush=True)
model = Model.from_params(vocab=train.vocab, params=params['model'])

# Freeze the detector backbone's BatchNorm layers entirely: no running-stat
# updates and no gradient flow through their affine parameters.
for submodule in model.detector.backbone.modules():
    if isinstance(submodule, BatchNorm2d):
        submodule.track_running_stats = False
        for p in submodule.parameters():
            p.requires_grad = False

model = DataParallel(model).cuda() if NUM_GPUS > 1 else model.cuda()
# Optimize only parameters that still require grad (frozen BN excluded).
optimizer = Optimizer.from_params(
    [x for x in model.named_parameters() if x[1].requires_grad],
    params['trainer']['optimizer'])

# LR scheduler is optional; pop() so the trainer params dict no longer
# carries the key downstream.
lr_scheduler_params = params['trainer'].pop("learning_rate_scheduler", None)
scheduler = LearningRateScheduler.from_params(
    optimizer, lr_scheduler_params) if lr_scheduler_params else None

# Resume from the serialization folder if it already exists.
if os.path.exists(args.folder):
    print("Found folder! restoring", flush=True)
    start_epoch, val_metric_per_epoch = restore_checkpoint(
        model,
        optimizer,
        serialization_dir=args.folder,
        learning_rate_scheduler=scheduler)
def main():
    """Entry point for steel-defect inference / submission generation.

    Loads a trained checkpoint (optionally the EMA variant), builds the test
    (or validation) image list, and runs ``predict`` once per requested
    test-time augmentation into per-augmentation output subdirectories.
    """
    args = parser.parse_args()

    # Per-fold log directory and append-mode submission log.
    log_out_dir = opj(RESULT_DIR, 'logs', args.out_dir, f'fold{args.fold}')
    if not ope(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(opj(log_out_dir, 'log.submit.txt'), mode='a')

    # Pick EMA or raw checkpoint for the requested epoch.
    if args.ema:
        network_path = opj(RESULT_DIR, 'models', args.out_dir,
                           f'fold{args.fold}', f'{args.predict_epoch}_ema.pth')
    else:
        network_path = opj(RESULT_DIR, 'models', args.out_dir,
                           f'fold{args.fold}', f'{args.predict_epoch}.pth')

    submit_out_dir = opj(RESULT_DIR, 'submissions', args.out_dir,
                         f'fold{args.fold}', f'epoch_{args.predict_epoch}')
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        submit_out_dir))
    if not ope(submit_out_dir):
        os.makedirs(submit_out_dir)

    # setting up the visible GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

    # Validate requested test-time augmentations before any heavy work.
    args.augment = args.augment.split(',')
    for augment in args.augment:
        if augment not in augment_list:
            raise ValueError(
                'Unsupported or unknown test augmentation: {}!'.format(
                    augment))

    model_params = {}
    model_params['architecture'] = args.arch
    model = init_network(model_params)

    log.write(">> Loading network:\n>>>> '{}'\n".format(network_path))
    checkpoint = torch.load(network_path)
    model.load_state_dict(checkpoint['state_dict'])
    log.write(">>>> loaded network:\n>>>> epoch {}\n".format(
        checkpoint['epoch']))

    # moving network to gpu and eval mode
    model = DataParallel(model)
    model.cuda()
    model.eval()

    # Data loading code: choose the image list for the requested split.
    dataset = args.dataset
    if dataset == 'test':
        steel_test_df = pd.read_csv(opj('..', 'input',
                                        'sample_submission.csv'))
    elif dataset == 'val':
        steel_test_df = pd.read_csv(
            opj(DATA_DIR, args.split_type, args.split_name,
                f'random_valid_cv{args.fold}.csv'))
    else:
        raise ValueError('Unsupported or unknown dataset: {}!'.format(dataset))

    # 'ImageId_ClassId' is "<image>_<class>"; keep one row per unique image.
    steel_test_df['ImageId'], steel_test_df['ClassId'] = zip(
        *steel_test_df['ImageId_ClassId'].apply(lambda x: x.split('_')))
    imageId = pd.DataFrame(steel_test_df['ImageId'].unique(),
                           columns=['ImageId'])
    test_dataset = SteelDataset(
        imageId,
        img_size=args.img_size,
        mask_size=args.img_size,
        transform=None,
        return_label=False,
        dataset=args.dataset,
    )
    test_loader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )
    # One prediction pass per augmentation; the dataset transform is swapped
    # in place and the matching un-augment function undoes it on outputs.
    # NOTE(review): eval() resolves augment_*/unaugment_* by name — assumes
    # they exist in this module's scope.
    for augment in args.augment:
        test_loader.dataset.transform = eval('augment_%s' % augment)
        unaugment_func = eval('unaugment_%s' % augment)
        sub_submit_out_dir = opj(submit_out_dir, augment)
        if not ope(sub_submit_out_dir):
            os.makedirs(sub_submit_out_dir)
        with torch.no_grad():
            predict(test_loader,
                    model,
                    sub_submit_out_dir,
                    dataset,
                    args,
                    unaugment_func=unaugment_func)
class BaseTrainer(object):
    """BERT question-answering trainer with optional distributed training
    and EWC (elastic weight consolidation) regularization.

    Lifecycle: __init__ loads/caches SQuAD-style features; make_model_env
    builds the (possibly DistributedDataParallel) model and optimizer;
    make_run_env assigns dev files per rank; train runs the epoch loop with
    gradient accumulation and optionally consolidates Fisher information.
    """

    def __init__(self, args):
        self.args = args
        self.set_random_seed(random_seed=args.random_seed)
        self.tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
        if args.debug:
            print("Debugging mode on.")
        # One feature list per .gz training file (cached as pickles).
        self.features_lst = self.get_features(self.args.train_folder,
                                              self.args.debug)

    def make_model_env(self, gpu, ngpus_per_node):
        """Build the model, optimizer, and (optionally) the DDP/DataParallel
        wrappers for the process assigned to ``gpu``."""
        if self.args.distributed:
            self.args.gpu = self.args.devices[gpu]
        else:
            self.args.gpu = 0

        if self.args.use_cuda and self.args.distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            self.args.rank = self.args.rank * ngpus_per_node + gpu
            dist.init_process_group(backend=self.args.dist_backend,
                                    init_method=self.args.dist_url,
                                    world_size=self.args.world_size,
                                    rank=self.args.rank)

        # Load baseline model
        self.model = BertForQuestionAnswering.from_pretrained(
            self.args.bert_model)

        if self.args.load_model is not None:
            print("Loading model from ", self.args.load_model)
            # Load to CPU first; moved to the right device below.
            self.model.load_state_dict(
                torch.load(self.args.load_model,
                           map_location=lambda storage, loc: storage))

        # Total optimizer steps, sized by the largest per-file feature set.
        max_len = max([len(f) for f in self.features_lst])
        num_train_optimization_steps = math.ceil(
            max_len / self.args.batch_size) * self.args.epochs * len(
                self.features_lst)

        if self.args.freeze_bert:
            for param in self.model.bert.parameters():
                param.requires_grad = False

        self.optimizer = get_opt(list(self.model.named_parameters()),
                                 num_train_optimization_steps, self.args)

        if self.args.use_cuda:
            if self.args.distributed:
                torch.cuda.set_device(self.args.gpu)
                self.model.cuda(self.args.gpu)
                # Split the global batch and the workers across processes.
                self.args.batch_size = int(self.args.batch_size /
                                           ngpus_per_node)
                self.args.workers = int(
                    (self.args.workers + ngpus_per_node - 1) / ngpus_per_node)
                self.model = DistributedDataParallel(
                    self.model,
                    device_ids=[self.args.gpu],
                    find_unused_parameters=True)
            else:
                self.model.cuda()
                self.model = DataParallel(self.model,
                                          device_ids=self.args.devices)

        cudnn.benchmark = True

    def make_run_env(self):
        """Assign dev files to this process (round-robin by rank when
        distributed, otherwise all files)."""
        if self.args.distributed:
            # distributing dev file evaluation task
            self.dev_files = []
            gpu_num = len(self.args.devices)
            files = os.listdir(self.args.dev_folder)
            for i in range(len(files)):
                if i % gpu_num == self.args.rank:
                    self.dev_files.append(files[i])
            print("GPU {}".format(self.args.gpu), self.dev_files)
        else:
            self.dev_files = os.listdir(self.args.dev_folder)
            print(self.dev_files)

    def get_features(self, train_folder, debug=False):
        """Read every .gz training file in ``train_folder`` and return a
        list of per-file feature lists, using on-disk pickle caches keyed by
        model name and skip_no_ans setting."""
        pickled_folder = self.args.pickled_folder + "_{}_{}".format(
            self.args.bert_model, str(self.args.skip_no_ans))

        features_lst = []
        files = [f for f in os.listdir(train_folder) if f.endswith(".gz")]
        print("Number of data set:{}".format(len(files)))
        for filename in files:
            data_name = filename.split(".")[0]
            # Check whether pkl file already exists
            pickle_file_name = '{}.pkl'.format(data_name)
            pickle_file_path = os.path.join(pickled_folder, pickle_file_name)
            if os.path.exists(pickle_file_path):
                with open(pickle_file_path, 'rb') as pkl_f:
                    print("Loading {} file as pkl...".format(data_name))
                    features_lst.append(pickle.load(pkl_f))
            else:
                print("processing {} file".format(data_name))
                file_path = os.path.join(train_folder, filename)
                train_examples = read_squad_examples(file_path, debug=debug)

                train_features = convert_examples_to_features(
                    examples=train_examples,
                    tokenizer=self.tokenizer,
                    max_seq_length=self.args.max_seq_length,
                    max_query_length=self.args.max_query_length,
                    doc_stride=self.args.doc_stride,
                    is_training=True,
                    skip_no_ans=self.args.skip_no_ans)
                features_lst.append(train_features)

                # Save feature lst as pickle (For reuse & fast loading)
                # Only rank 0 writes the cache to avoid concurrent writes.
                if not debug and self.args.rank == 0:
                    with open(pickle_file_path, 'wb') as pkl_f:
                        print("Saving {} file from pkl file...".format(
                            data_name))
                        pickle.dump(train_features, pkl_f)
        return features_lst

    def get_iter(self, features_lst, args):
        """Concatenate all feature lists into one TensorDataset and return
        (data_loader, sampler). The 6th tensor labels each example with the
        index of its source file; non-distributed runs use a weighted
        sampler balancing those source classes."""
        all_input_ids = []
        all_input_mask = []
        all_segment_ids = []
        all_start_positions = []
        all_end_positions = []
        all_labels = []
        for i, train_features in enumerate(features_lst):
            all_input_ids.append(
                torch.tensor([f.input_ids for f in train_features],
                             dtype=torch.long))
            all_input_mask.append(
                torch.tensor([f.input_mask for f in train_features],
                             dtype=torch.long))
            all_segment_ids.append(
                torch.tensor([f.segment_ids for f in train_features],
                             dtype=torch.long))
            start_positions = torch.tensor(
                [f.start_position for f in train_features], dtype=torch.long)
            end_positions = torch.tensor(
                [f.end_position for f in train_features], dtype=torch.long)
            all_start_positions.append(start_positions)
            all_end_positions.append(end_positions)
            # Source-dataset label: i for every example of file i.
            all_labels.append(i * torch.ones_like(start_positions))

        all_input_ids = torch.cat(all_input_ids, dim=0)
        all_input_mask = torch.cat(all_input_mask, dim=0)
        all_segment_ids = torch.cat(all_segment_ids, dim=0)
        all_start_positions = torch.cat(all_start_positions, dim=0)
        all_end_positions = torch.cat(all_end_positions, dim=0)
        all_labels = torch.cat(all_labels, dim=0)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions, all_labels)

        if args.distributed:
            train_sampler = DistributedSampler(train_data)
            data_loader = DataLoader(train_data,
                                     num_workers=args.workers,
                                     pin_memory=True,
                                     sampler=train_sampler,
                                     batch_size=args.batch_size)
        else:
            # Weighted sampling to balance the per-source-file class sizes.
            weights = make_weights_for_balanced_classes(
                all_labels.detach().cpu().numpy().tolist(),
                self.args.num_classes)
            weights = torch.DoubleTensor(weights)
            train_sampler = torch.utils.data.sampler.WeightedRandomSampler(
                weights, len(weights))
            # NOTE(review): worker_init_fn receives the *return value* of
            # set_random_seed (None), so workers are seeded here at call
            # time, not per-worker — confirm intent.
            data_loader = torch.utils.data.DataLoader(
                train_data,
                batch_size=args.batch_size,
                shuffle=None,
                sampler=train_sampler,
                num_workers=args.workers,
                worker_init_fn=self.set_random_seed(self.args.random_seed),
                pin_memory=True,
                drop_last=True)

        return data_loader, train_sampler

    def save_model(self, epoch, loss):
        """Save model weights and config, tagged with epoch and loss."""
        loss = round(loss, 3)
        model_type = ("adv" if self.args.adv else "base")
        save_file = os.path.join(
            self.args.save_dir,
            "{}_{}_{:.3f}.pt".format(model_type, epoch, loss))
        save_file_config = os.path.join(
            self.args.save_dir,
            "{}_config_{}_{:.3f}.json".format(model_type, epoch, loss))

        # Unwrap DataParallel/DDP so the checkpoint loads without wrappers.
        model_to_save = self.model.module if hasattr(
            self.model, 'module') else self.model
        torch.save(model_to_save.state_dict(), save_file)
        model_to_save.config.to_json_file(save_file_config)

    def train(self, consolidate=True, fisher_estimation_sample_size=1024):
        """Run the training loop with gradient accumulation, periodic
        in-epoch evaluation, per-epoch checkpointing (rank 0), and optional
        final Fisher-information consolidation for EWC."""
        step = 1
        avg_loss = 0
        global_step = 1
        iter_lst = [self.get_iter(self.features_lst, self.args)]
        num_batches = sum([len(iterator[0]) for iterator in iter_lst])
        for epoch in range(self.args.start_epoch,
                           self.args.start_epoch + self.args.epochs):
            self.model.train()
            start = time.time()
            batch_step = 1
            for data_loader, sampler in iter_lst:
                if self.args.distributed:
                    # Reshuffle shards differently each epoch.
                    sampler.set_epoch(epoch)
                for i, batch in enumerate(data_loader, start=1):
                    input_ids, input_mask, seg_ids, start_positions, end_positions, _ = batch

                    # remove unnecessary pad token: trim to the longest
                    # non-pad sequence in the batch.
                    seq_len = torch.sum(torch.sign(input_ids), 1)
                    max_len = torch.max(seq_len)

                    input_ids = input_ids[:, :max_len].clone()
                    input_mask = input_mask[:, :max_len].clone()
                    seg_ids = seg_ids[:, :max_len].clone()
                    start_positions = start_positions.clone()
                    end_positions = end_positions.clone()

                    if self.args.use_cuda:
                        input_ids = input_ids.cuda(self.args.gpu,
                                                   non_blocking=True)
                        input_mask = input_mask.cuda(self.args.gpu,
                                                     non_blocking=True)
                        seg_ids = seg_ids.cuda(self.args.gpu,
                                               non_blocking=True)
                        start_positions = start_positions.cuda(
                            self.args.gpu, non_blocking=True)
                        end_positions = end_positions.cuda(self.args.gpu,
                                                           non_blocking=True)

                    loss = self.model(input_ids, seg_ids, input_mask,
                                      start_positions, end_positions)
                    loss = loss.mean()
                    # Scale for gradient accumulation, then add the EWC
                    # penalty from the consolidated Fisher information.
                    loss = loss / self.args.gradient_accumulation_steps
                    ewc_loss = self.model.module.ewc_loss()
                    loss = loss + ewc_loss
                    loss.backward()
                    avg_loss = self.cal_running_avg_loss(
                        loss.item() * self.args.gradient_accumulation_steps,
                        avg_loss)
                    # Step the optimizer only every N accumulated batches.
                    if step % self.args.gradient_accumulation_steps == 0:
                        self.optimizer.step()
                        self.optimizer.zero_grad()

                    # Periodic mid-epoch evaluation (skipped on epoch 0).
                    if epoch != 0 and i % 2000 == 0:
                        result_dict = self.evaluate_model(i)
                        for dev_file, f1 in result_dict.items():
                            print("GPU/CPU {} evaluated {}: {:.2f}".format(
                                self.args.gpu, dev_file, f1),
                                  end="\n")
                    global_step += 1
                    batch_step += 1
                    msg = "{}/{} {} - ETA : {} - loss: {:.4f}" \
                        .format(batch_step, num_batches,
                                progress_bar(batch_step, num_batches),
                                eta(start, batch_step, num_batches),
                                avg_loss)
                    print(msg, end="\r")

            print("[GPU Num: {}, epoch: {}, Final loss: {:.4f}]".format(
                self.args.gpu, epoch, avg_loss))
            # save model (rank 0 only, to avoid concurrent writes)
            if self.args.rank == 0:
                self.save_model(epoch, avg_loss)
            if self.args.do_valid:
                result_dict = self.evaluate_model(epoch)
                for dev_file, f1 in result_dict.items():
                    print("GPU/CPU {} evaluated {}: {:.2f}".format(
                        self.args.gpu, dev_file, f1),
                          end="\n")

        if consolidate:
            # estimate the fisher information of the parameters and consolidate
            # them in the network.
            print(
                '=> Estimating diagonals of the fisher information matrix...',
                flush=True,
                end='',
            )
            # ATTENTION!!! the data_loader should entire training set!!!!
            self.model.consolidate(
                self.model.estimate_fisher(data_loader,
                                           fisher_estimation_sample_size))
            print('EWC Loaded!')

    def evaluate_model(self, epoch):
        """Evaluate on every assigned dev file; append F1 scores to a text
        report and return {dev_file: f1}."""
        # result directory
        result_file = os.path.join(self.args.result_dir,
                                   "dev_eval_{}.txt".format(epoch))
        fw = open(result_file, "a")
        result_dict = dict()
        for dev_file in self.dev_files:
            file_name = dev_file.split(".")[0]
            prediction_file = os.path.join(
                self.args.result_dir,
                "epoch_{}_{}.json".format(epoch, file_name))
            file_path = os.path.join(self.args.dev_folder, dev_file)
            metrics = eval_qa(self.model,
                              file_path,
                              prediction_file,
                              args=self.args,
                              tokenizer=self.tokenizer,
                              batch_size=self.args.batch_size)
            f1 = metrics["f1"]
            fw.write("{} : {}\n".format(file_name, f1))
            result_dict[dev_file] = f1
        fw.close()
        return result_dict

    @staticmethod
    def cal_running_avg_loss(loss, running_avg_loss, decay=0.99):
        """Exponential moving average of the loss (first call passes through)."""
        if running_avg_loss == 0:
            return loss
        else:
            running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
        return running_avg_loss

    @staticmethod
    def set_random_seed(random_seed):
        """Seed Python/NumPy/PyTorch and force deterministic cuDNN.

        No-op when ``random_seed`` is None. Returns None (relevant where it
        is passed as a worker_init_fn).
        """
        if random_seed is not None:
            print("Set random seed as {}".format(random_seed))
            os.environ['PYTHONHASHSEED'] = str(random_seed)
            random.seed(random_seed)
            np.random.seed(random_seed)
            torch.manual_seed(random_seed)
            torch.cuda.manual_seed_all(random_seed)
            torch.set_num_threads(1)
            cudnn.benchmark = False
            cudnn.deterministic = True
            warnings.warn('You have chosen to seed training. '
                          'This will turn on the CUDNN deterministic setting, '
                          'which can slow down your training considerably! '
                          'You may see unexpected behavior when restarting '
                          'from checkpoints.')
def run():
    """Evaluate WSI-level classification with a patch classifier plus an
    auxiliary patch-discriminator, then report accuracy and ROC/AUC.

    Per slide: classify every patch (sigmoid >= 0.5), weight the kept patch
    scores by the auxiliary discriminator's accept/reject mask, and call the
    slide positive when the weighted-score / accepted-patch-count ratio
    reaches 0.5. Results are logged, written to TensorBoard, and a ROC curve
    is plotted at the end.
    """
    args = get_parser()
    with open(args.cfg_path) as f:
        cfg = json.load(f)
    logger = logging.getLogger("test")
    logger.setLevel(logging.DEBUG)
    fileHanlder = logging.FileHandler('test_auxiliary.log')
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fileHanlder.setFormatter(formatter)
    logger.addHandler(fileHanlder)
    dataset = ScoringTestDataset(csv_file=args.test_root,
                                 root_dir=args.base_root)
    summary = {'count': 0, 'correct': 0, 'acc': 0}
    summary_writer = SummaryWriter(comment='TEST_AUXILIARY')
    # Patch classifier, restored from checkpoint and frozen in eval mode.
    model_cls = MODELS[cfg['model']](num_nodes=cfg['grid_size'],
                                     use_crf=cfg['use_crf'])
    model_cls = DataParallel(model_cls, device_ids=None)
    checkpoint = torch.load(args.load_path_cls)
    model_cls.load_state_dict(checkpoint['state_dict'])
    model_cls = model_cls.cuda()
    model_cls.eval()
    # Auxiliary discriminator that decides which patches count at all.
    model_auxiliary = DiscriminatePatch()
    model_auxiliary = DataParallel(model_auxiliary, device_ids=None)
    checkpoint = torch.load(args.load_path_auxiliary)
    model_auxiliary.load_state_dict(checkpoint['state_dict'])
    model_auxiliary = model_auxiliary.cuda()
    model_auxiliary.eval()
    time_now = time.time()
    y_label = []
    y_pred = []
    # One iteration per slide: image_list holds that slide's patches.
    for iteration, (image_list, image_auxiliary_list, score_list,
                    label) in enumerate(dataset):
        pred_label = []
        auxiliary_score = []
        # Manual batching over the patch list; `remain` is the ragged tail.
        batch_num = int(len(image_list) / args.batch_size)
        remain = int(len(image_list) % args.batch_size)
        image_cls_score = torch.stack(score_list)
        for index in range(batch_num):
            image_list_set = image_list[index * args.batch_size:(index + 1) *
                                        args.batch_size]
            image_set = torch.stack(image_list_set, 0)
            image_set = image_set.cuda()
            outputs = model_cls(image_set)
            probs = outputs.sigmoid()
            prediction = probs.ge(0.5)
            pred_label.append(prediction.cpu())
            image_auxiliary_list_set = image_auxiliary_list[
                index * args.batch_size:(index + 1) * args.batch_size]
            image_auxiliary_set = torch.stack(image_auxiliary_list_set, 0)
            image_auxiliary_set = image_auxiliary_set.cuda()
            patch_score = model_auxiliary(image_auxiliary_set)
            # Binarize the discriminator output into an accept mask.
            patch_score = patch_score.ge(0.5)
            auxiliary_score.append((patch_score.cpu()))
        if remain != 0:
            image_list_set = image_list[batch_num * args.batch_size:]
            image_set = torch.stack(image_list_set, 0)
            image_set = image_set.cuda()
            outputs = model_cls(image_set)
            probs = outputs.sigmoid()
            prediction = probs.ge(0.5)
            image_auxiliary_list_set = image_auxiliary_list[batch_num *
                                                            args.batch_size:]
            image_auxiliary_set = torch.stack(image_auxiliary_list_set, 0)
            image_auxiliary_set = image_auxiliary_set.cuda()
            patch_score = model_auxiliary(image_auxiliary_set)
            patch_score = patch_score.ge(0.5)
            # A 1-sample batch loses its batch dim somewhere upstream;
            # restore it so torch.cat below sees consistent ranks.
            if remain == 1:
                prediction = prediction.unsqueeze(0)
                patch_score = patch_score.unsqueeze(0)
            pred_label.append(prediction.cpu())
            auxiliary_score.append((patch_score.cpu()))
        pred_cls_label = torch.cat(pred_label, dim=0).float()
        patch_auxiliary_score = torch.cat(auxiliary_score, dim=0).float()
        patch_discriminate_num = float(torch.sum(patch_auxiliary_score))
        # NOTE: `patch_score` is reused here for a different quantity than the
        # loop variable above — per-patch class score masked by prediction.
        patch_score = torch.mul(pred_cls_label, image_cls_score)
        score = torch.sum(patch_score, dim=1)
        score = score.unsqueeze(dim=1)
        finally_score = torch.sum(torch.mul(score, patch_auxiliary_score))
        # NOTE(review): divides by the number of accepted patches — raises
        # ZeroDivisionError if the discriminator rejects every patch.
        ratio = float(finally_score) / float(patch_discriminate_num)
        if ratio >= 0.5:
            cls_label = torch.ones((1, ), dtype=torch.uint8)
        else:
            cls_label = torch.zeros((1, ), dtype=torch.uint8)
        if torch.equal(cls_label, label):
            summary['correct'] += 1
        logger.info(cls_label)
        logger.info(label)
        logger.info('score: {:.4f} / patch_num: {} = {:.4f}'.format(
            float(finally_score), patch_discriminate_num, ratio))
        summary['count'] += 1
        summary['acc'] = float(summary['correct']) / float(summary['count'])
        summary_writer.add_scalar('test/acc', summary['acc'],
                                  summary['count'])
        logger.info(
            '{}, Numbers of all WSI: {}, Number of the correct WSI classification: {}, '
            'Accuracy: {:.4f}'.format(time.strftime("%Y-%m-%d %H:%M:%S"),
                                      summary['count'], summary['correct'],
                                      summary['acc']))
        y_label.append(label.cpu())
        y_pred.append(float(ratio))
    # ROC over the per-slide ratios against the ground-truth labels.
    y_label_array = numpy.array(y_label)
    y_pred_array = numpy.array(y_pred)
    fpr, tpr, threshold = metrics.roc_curve(y_true=y_label_array,
                                            y_score=y_pred_array,
                                            pos_label=1)
    auc = metrics.auc(fpr, tpr)
    logger.info('AUC = {:.4f}'.format(auc))
    plt.figure()
    # NOTE(review): the legend label is missing its closing ')' — cosmetic.
    plt.plot(fpr,
             tpr,
             color='darkorange',
             label='ROC Curve(area = %0.4f' % auc)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
class UNetTrainer(object):
    """UNet trainer.

    Owns a UNet, a CrossEntropy loss, and the checkpoint directory; drives
    the per-epoch train/validate loop and saves periodic checkpoints plus
    sample prediction images.

    NOTE(review): this class uses legacy PyTorch idioms (`Variable`,
    `volatile=True`, `loss.data[0]`) that were removed after torch 0.4 —
    modernizing them (`torch.no_grad()`, `.item()`) would be required on a
    current PyTorch before any behavior change is considered.
    """

    def __init__(self,
                 start_epoch=0,
                 save_dir='',
                 resume="",
                 devices_num=2,
                 num_classes=2,
                 color_dim=1):
        # color_dim: input channels; num_classes: segmentation classes.
        self.net = UNet(color_dim=color_dim, num_classes=num_classes)
        # Epochs are 1-based; a 0 start is promoted to 1.
        self.start_epoch = start_epoch if start_epoch != 0 else 1
        self.save_dir = os.path.join('../models', save_dir)
        self.loss = CrossEntropyLoss()
        self.num_classes = num_classes

        if resume:
            checkpoint = torch.load(resume)
            # NOTE(review): unreachable — start_epoch was already coerced to
            # >= 1 above, so a resumed epoch is never picked up here.
            if self.start_epoch == 0:
                self.start_epoch = checkpoint['epoch'] + 1
            if not self.save_dir:
                self.save_dir = checkpoint['save_dir']
            # Checkpoint key is 'state_dir' (sic) — matches the save below.
            self.net.load_state_dict(checkpoint['state_dir'])

        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        self.net.cuda()
        self.loss.cuda()

        if devices_num == 2:
            self.net = DataParallel(self.net, device_ids=[0, 1])
            #self.loss = DataParallel(self.loss, device_ids=[0, 1])

    def train(self,
              train_loader,
              val_loader,
              lr=0.001,
              weight_decay=1e-4,
              epochs=200,
              save_freq=10):
        """Run the full training loop: one train_ + validate_ pass per epoch.

        Redirects stdout into a log file under save_dir for the whole run.
        NOTE(review): `lr` is stored but not passed to Adam (commented out),
        so Adam runs at its default learning rate.
        """
        self.logfile = os.path.join(self.save_dir, 'log')
        sys.stdout = Logger(self.logfile)
        self.epochs = epochs
        self.lr = lr
        optimizer = torch.optim.Adam(
            self.net.parameters(),
            #lr,
            #momentum=0.9,
            weight_decay=weight_decay)
        for epoch in range(self.start_epoch, epochs + 1):
            self.train_(train_loader, epoch, optimizer, save_freq)
            self.validate_(val_loader, epoch)

    def train_(self, data_loader, epoch, optimizer, save_freq=10):
        """One training epoch: forward/backward over data_loader, save a
        sample image for the first batch, checkpoint every save_freq epochs.
        """
        start_time = time.time()
        self.net.train()
        #lr = self.get_lr(epoch)
        #for param_group in optimizer.param_groups:
        #    param_group['lr'] = lr
        metrics = []
        for i, (data, target) in enumerate(tqdm(data_loader)):
            data_t, target_t = data, target  # keep CPU copies for imaging
            data = Variable(data.cuda(non_blocking=True))
            target = Variable(target.cuda(non_blocking=True))
            output = self.net(data)  # UNet output
            # NCHW -> NHWC -> flatten to (N*H*W, num_classes) for CE loss.
            output = output.transpose(1, 3).transpose(1, 2).contiguous().view(
                -1, self.num_classes)
            target = target.view(-1)
            loss_output = self.loss(output, target)
            optimizer.zero_grad()
            loss_output.backward()  # backpropagate the loss
            optimizer.step()
            loss_output = loss_output.data[0]  # scalar loss value (legacy API)
            acc = accuracy(output, target)
            metrics.append([loss_output, acc])
            if i == 0:
                batch_size = data.size(0)
                _, output = output.data.max(dim=1)
                # Hard-coded 320x480 output resolution — assumed to match the
                # dataset; confirm if input size ever changes.
                output = output.view(batch_size, 1, 1, 320,
                                     480).cpu()  # predicted mask image
                data_t = data_t[0, 0].unsqueeze(0).unsqueeze(0)  # input image
                target_t = target_t[0].unsqueeze(0)  # ground-truth mask
                # First arg is a list; concatenates the 3 images side by side.
                t = torch.cat([output[0].float(), data_t,
                               target_t.float()], 0)
                #show_list = []
                #for j in range(10):
                #    show_list.append(data_t[j, 0].unsqueeze(0).unsqueeze(0))
                #    show_list.append(target_t[j].unsqueeze(0))
                #    show_list.append(output[j].float())
                #
                #t = torch.cat(show_list, 0)
                torchvision.utils.save_image(t,
                                             "temp_image/%02d_train.jpg" %
                                             epoch,
                                             nrow=3)
            #if i == 20:
            #    break
        if epoch % save_freq == 0:
            # Unwrap DataParallel so the checkpoint loads on a bare model.
            if 'module' in dir(self.net):
                state_dict = self.net.module.state_dict()
            else:
                state_dict = self.net.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()
            torch.save(
                {
                    'epoch': epoch,
                    'save_dir': self.save_dir,
                    'state_dir': state_dict
                }, os.path.join(self.save_dir, '%03d.ckpt' % epoch))
        end_time = time.time()
        metrics = np.asarray(metrics, np.float32)
        self.print_metrics(metrics, 'Train', end_time - start_time, epoch)

    def validate_(self, data_loader, epoch):
        """One validation epoch; mirrors train_ without the backward pass."""
        start_time = time.time()
        self.net.eval()
        metrics = []
        for i, (data, target) in enumerate(data_loader):
            data_t, target_t = data, target
            # volatile=True is the pre-0.4 no-grad mechanism.
            data = Variable(data.cuda(non_blocking=True), volatile=True)
            target = Variable(target.cuda(non_blocking=True), volatile=True)
            output = self.net(data)
            output = output.transpose(1, 3).transpose(1, 2).contiguous().view(
                -1, self.num_classes)
            target = target.view(-1)
            loss_output = self.loss(output, target)
            loss_output = loss_output.data[0]
            acc = accuracy(output, target)
            metrics.append([loss_output, acc])
            if i == 0:
                batch_size = data.size(0)
                _, output = output.data.max(dim=1)
                output = output.view(batch_size, 1, 1, 320, 480).cpu()
                data_t = data_t[0, 0].unsqueeze(0).unsqueeze(0)
                target_t = target_t[0].unsqueeze(0)
                t = torch.cat([output[0].float(), data_t,
                               target_t.float()], 0)
                # show_list = []
                # for j in range(10):
                #     show_list.append(data_t[j, 0].unsqueeze(0).unsqueeze(0))
                #     show_list.append(target_t[j].unsqueeze(0))
                #     show_list.append(output[j].float())
                #
                # t = torch.cat(show_list, 0)
                torchvision.utils.save_image(t,
                                             "temp_image/%02d_val.jpg" %
                                             epoch,
                                             nrow=3)
            #if i == 10:
            #    break
        end_time = time.time()
        metrics = np.asarray(metrics, np.float32)
        self.print_metrics(metrics, 'Validation', end_time - start_time)

    def print_metrics(self, metrics, phase, time, epoch=-1):
        """metrics: [loss, acc] rows; prints their means plus elapsed time.

        NOTE: parameter `time` shadows the time module inside this method.
        """
        if epoch != -1:
            print("Epoch: {}".format(epoch), )
        print(phase, )
        print('loss %2.4f, accuracy %2.4f, time %2.2f' %
              (np.mean(metrics[:, 0]), np.mean(metrics[:, 1]), time))
        if phase != 'Train':
            # NOTE(review): bare `print` is a no-op expression in Python 3 —
            # probably meant `print()` for a blank line.
            print

    def get_lr(self, epoch):
        """Step learning-rate schedule: full lr for the first half of
        training, 0.1x up to 80%, then 0.01x. (Currently unused — see the
        commented-out block in train_.)
        """
        if epoch <= self.epochs * 0.5:
            lr = self.lr
        elif epoch <= self.epochs * 0.8:
            lr = 0.1 * self.lr
        else:
            lr = 0.01 * self.lr
        return lr

    def save_py_files(self, path):
        """copy .py files in exps dir, cfgs dir and current dir into
        save_dir, and keep the files structure
        """
        #exps dir
        pyfiles = [f for f in os.listdir(path) if f.endswith('.py')]
        # NOTE(review): `path` is truncated to its last two components here
        # and then used as the *source* in shutil.copy below — this only
        # works when the truncated path is valid relative to the CWD; the
        # original absolute path is lost. Looks like a bug — verify.
        path = "/".join(path.split('/')[-2:])
        exp_save_path = os.path.join(self.save_dir, path)
        mkdir(exp_save_path)
        for f in pyfiles:
            shutil.copy(os.path.join(path, f), os.path.join(exp_save_path, f))

        #current dir
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(self.save_dir, f))

        #cfgs dir
        # NOTE(review): copytree fails if the destination already exists
        # (pre-3.8 semantics without dirs_exist_ok).
        shutil.copytree('./cfgs', os.path.join(self.save_dir, 'cfgs'))
def main():
    """Train a classifier for one fold: build datasets/model/optimizer from
    the global `config`, run the epoch loop with a FocalLoss criterion, and
    checkpoint after each epoch (tracking the best top-1 precision).
    """
    # Per-fold output directories: weights / logs / submissions.
    weight_path = config.weights + config.model_name + os.sep + config.description + os.sep + str(
        config.fold) + os.sep
    if not os.path.exists(weight_path):
        os.makedirs(weight_path)
    log_path = config.logs + config.model_name + os.sep + config.description + os.sep + str(
        config.fold) + os.sep
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    submit_path = config.submit + config.model_name + os.sep + config.description + os.sep + str(
        config.fold) + os.sep
    if not os.path.exists(submit_path):
        os.makedirs(submit_path)
    config.write_to_log(log_path + os.sep + 'log.txt')

    #dataset preparing
    train_dataset = customDataset(config.train_data, train=True)
    # NOTE(review): validation set is built from config.test_data with
    # train=True — confirm this is intentional (train-mode transforms on the
    # evaluation split).
    val_dataset = customDataset(config.test_data, train=True)
    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset,
                            batch_size=config.batch_size * 2,
                            shuffle=False,
                            pin_memory=False)

    #model preparing
    model = get_net(config.num_classes)
    model = DataParallel(model.cuda(), device_ids=config.gpus)
    model.train()

    #optimizer preparing
    optimizer = optim.Adam(model.parameters(),
                           lr=config.lr,
                           amsgrad=True,
                           weight_decay=config.weight_decay)
    # Decay lr by 10x every 10 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    #loss preparing
    #criterion = nn.CrossEntropyLoss().cuda()
    criterion = FocalLoss(config.num_classes).cuda()

    train_loss = AverageMeter()
    train_top1 = AverageMeter()
    valid_loss = [np.inf, 0, 0]
    best_precision = 0
    for epoch in range(config.epochs):
        # NOTE(review): scheduler.step(epoch) before optimizer.step() is the
        # pre-1.1 PyTorch ordering; on newer torch this triggers a warning
        # and skips the first lr value.
        scheduler.step(epoch)
        train_progressor = ProgressBar(log_path,
                                       mode="Train",
                                       epoch=epoch,
                                       total_epoch=config.epochs,
                                       model_name=config.model_name,
                                       total=len(train_loader))
        for index, (data, label) in enumerate(train_loader):
            train_progressor.current = index
            # Variable() is a legacy no-op wrapper on modern torch.
            data = Variable(data).cuda()
            label = Variable(torch.from_numpy(np.asarray(label))).cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            # Top-1 and top-2 precision for this batch.
            precision1_train, precision2_train = accuracy(output,
                                                          label,
                                                          topk=(1, 2))
            train_loss.update(loss.item(), data.size(0))
            train_top1.update(precision1_train[0], data.size(0))
            train_progressor.current_loss = train_loss.avg
            train_progressor.current_top1 = train_top1.avg
            train_progressor()
            #print('train epoch %d iteration %d: loss: %.3f' % (epoch + 1, index + 1, loss.data))
        train_progressor.done()
        val_loss, val_top1 = evaluate(epoch, model, val_loader, criterion,
                                      log_path)
        is_best = val_top1 > best_precision
        #print(bool(is_best))
        best_precision = max(val_top1, best_precision)
        # NOTE(review): 'valid_loss' saved here is the initial placeholder
        # list, not the epoch's val_loss — verify against save_checkpoint.
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "best_precision1": best_precision,
                "optimizer": optimizer.state_dict(),
                "fold": config.fold,
                "valid_loss": valid_loss,
            }, is_best, weight_path, log_path, epoch)
def run():
    """Evaluate WSI classification with a patch CNN plus a logistic-regression
    head over the (positive, negative) patch-count histogram; log per-slide
    accuracy and a final ROC/AUC plot.
    """
    args = get_parser()
    with open(args.cfg_path) as f:
        cfg = json.load(f)
    logger = logging.getLogger("test")
    logger.setLevel(logging.DEBUG)
    fileHanlder = logging.FileHandler('test_lr_auc.log')
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fileHanlder.setFormatter(formatter)
    logger.addHandler(fileHanlder)
    dataset = ImageTestDataset(csv_file=args.test_root,
                               root_dir=args.base_root)
    # Patch-level CNN, restored and frozen.
    model_cnn = MODELS[cfg['model']](num_nodes=cfg['grid_size'],
                                     use_crf=cfg['use_crf'])
    model_cnn = DataParallel(model_cnn, device_ids=None)
    checkpoint_cnn = torch.load(args.load_path_cnn)
    model_cnn.load_state_dict(checkpoint_cnn['state_dict'])
    model_cnn = model_cnn.cuda()
    model_cnn.eval()
    # Slide-level logistic regression over the patch-count histogram.
    model_lr = LogistRegression()
    model_lr = DataParallel(model_lr, device_ids=None)
    checkpoint_lr = torch.load(args.load_path_lr)
    model_lr.load_state_dict(checkpoint_lr['state_dict'])
    model_lr.cuda()
    model_lr.eval()
    summary = {'count': 0, 'correct': 0, 'acc': 0}
    y_pred = []
    y_label = []
    # One iteration per slide.
    for iteration, (image_list, label) in enumerate(dataset):
        time_now = time.time()
        pred_label = []
        label = label.cuda()
        # Manual batching; `remain` is the ragged tail batch.
        batch_num = int(len(image_list) / args.batch_size)
        remain = int(len(image_list) % args.batch_size)
        for index in range(batch_num):
            image_list_set = image_list[index * args.batch_size:(index + 1) *
                                        args.batch_size]
            image_set = torch.stack(image_list_set, 0)
            image_set = image_set.cuda()
            outputs = model_cnn(image_set)
            probs = outputs.sigmoid()
            prediction = probs.ge(0.5)
            pred_label.append(prediction.cpu())
        if remain != 0:
            image_list_set = image_list[batch_num * args.batch_size:]
            image_set = torch.stack(image_list_set, 0)
            image_set = image_set.cuda()
            outputs = model_cnn(image_set)
            probs = outputs.sigmoid()
            prediction = probs.ge(0.5)
            # Restore the batch dim lost for a single-sample tail batch.
            if remain == 1:
                prediction = prediction.unsqueeze(0)
            pred_label.append(prediction.cpu())
        pred_cls_label = torch.cat(pred_label, dim=0)
        grid_num_sum = pred_cls_label.size()[0] * pred_cls_label.size()[1]
        score = torch.sum(pred_cls_label)
        # Histogram of positive (A) vs negative (B) grid predictions.
        number_A = int(score)
        number_B = grid_num_sum - number_A
        A = torch.full((1, ), number_A)
        B = torch.full((1, ), number_B)
        histogram = torch.cat((A, B), dim=0)
        # NOTE(review): histogram stays on CPU while model_lr was moved to
        # CUDA — confirm DataParallel handles the transfer here.
        output = model_lr(histogram)
        pred_cls = output.ge(0.5)
        summary['count'] += 1
        if torch.equal(pred_cls, label):
            summary['correct'] += 1
        time_spent = time.time() - time_now
        summary['acc'] = float(summary['correct']) / float(summary['count'])
        logger.info(
            '{}, Numbers of all WSI: {}, Number of the correct WSI classification: {}, '
            'Accuracy: {:.4f}, Running time: {:.4f}'.format(
                time.strftime("%Y-%m-%d %H:%M:%S"), summary['count'],
                summary['correct'], summary['acc'], time_spent))
        y_pred.append(float(output.cpu()))
        y_label.append(label.cpu())
    # ROC over the raw LR outputs.
    y_pred_array = numpy.array(y_pred)
    y_label_array = numpy.array(y_label)
    fpr, tpr, threshold = metrics.roc_curve(y_true=y_label_array,
                                            y_score=y_pred_array,
                                            pos_label=1)
    auc = metrics.auc(fpr, tpr)
    logger.info('AUC = {:.4f}'.format(auc))
    plt.figure()
    # NOTE(review): legend label is missing its closing ')' — cosmetic.
    plt.plot(fpr,
             tpr,
             color='darkorange',
             label='ROC Curve(area = %0.4f' % auc)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
def trainer(self,
            train_path,
            val_path,
            cur_fold,
            output_dir=None,
            log_dir=None,
            optimizer='Adam',
            loss_fun='Cross_Entropy',
            class_weight=None,
            lr_scheduler=None):
    """Train this fold end-to-end: seed RNGs, prepare per-fold output/log
    dirs, build augmentations and the DataLoader, then run the epoch loop
    with TensorBoard logging, early stopping, and best-val-loss
    checkpointing.

    Parameters (by usage in this body): train_path/val_path are sample path
    lists for this fold; cur_fold names the fold<k> subdirectories;
    optimizer/loss_fun/lr_scheduler are string keys resolved by the
    class's _get_* helpers; class_weight is forwarded to the loss factory.
    """
    # Fixed seeds for reproducibility. NOTE: benchmark=True below partially
    # undoes cudnn determinism — kept as-is from the original.
    torch.manual_seed(1000)
    np.random.seed(1000)
    torch.cuda.manual_seed_all(1000)
    print('Device:{}'.format(self.device))
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    output_dir = os.path.join(output_dir, "fold" + str(cur_fold))
    log_dir = os.path.join(log_dir, "fold" + str(cur_fold))
    # Wipe per-fold dirs unless resuming from a pre-trained checkpoint.
    if os.path.exists(log_dir):
        if not self.pre_trained:
            shutil.rmtree(log_dir)
            os.makedirs(log_dir)
    else:
        os.makedirs(log_dir)
    if os.path.exists(output_dir):
        if not self.pre_trained:
            shutil.rmtree(output_dir)
            os.makedirs(output_dir)
    else:
        os.makedirs(output_dir)

    self.step_pre_epoch = len(train_path) // self.batch_size
    self.writer = SummaryWriter(log_dir)
    # NOTE(review): step count here uses len(train_path[0]) while
    # step_pre_epoch above uses len(train_path) — confirm which shape
    # train_path actually has; these look inconsistent.
    self.global_step = self.start_epoch * math.ceil(
        len(train_path[0]) / self.batch_size)

    net = self.net
    # only for deeplab
    if self.freeze is not None and 'deeplab' in self.net_name:
        if self.freeze == 'backbone':
            net.freeze_backbone()
        elif self.freeze == 'classifier':
            net.freeze_classifier()
    lr = self.lr
    loss = self._get_loss(loss_fun, class_weight)
    if len(self.device.split(',')) > 1:
        net = DataParallel(net)

    # dataloader setting — 'cls' mode keeps flip/adjust augmentations; the
    # other mode swaps them for noise.
    if self.mode == 'cls':
        train_transformer = transforms.Compose([
            Trunc_and_Normalize(self.scale),
            CropResizeHalf(dim=self.input_shape,num_class=self.num_classes,crop=self.crop),
            RandomEraseHalf(scale_flag=False),
            RandomDistortHalf(),
            RandomTranslationRotationZoomHalf(num_class=self.num_classes),
            RandomFlipHalf(mode='hv'),
            RandomAdjustHalf(),
            To_Tensor(num_class=self.num_classes)
        ])
    else:
        train_transformer = transforms.Compose([
            Trunc_and_Normalize(self.scale),
            CropResizeHalf(dim=self.input_shape,num_class=self.num_classes,crop=self.crop),
            RandomEraseHalf(scale_flag=False),
            RandomDistortHalf(),
            RandomTranslationRotationZoomHalf(num_class=self.num_classes),
            # RandomFlipHalf(mode='hv'),
            # RandomAdjustHalf(),
            RandomNoiseHalf(),
            To_Tensor(num_class=self.num_classes)
        ])

    train_dataset = DataGenerator(train_path,
                                  roi_number=self.roi_number,
                                  num_class=self.num_classes,
                                  transform=train_transformer,
                                  seq_len=self.seq_len)

    train_loader = DataLoader(train_dataset,
                              batch_size=self.batch_size,
                              shuffle=True,
                              num_workers=self.num_workers,
                              pin_memory=True)

    # copy to gpu
    net = net.cuda()
    loss = loss.cuda()

    # optimizer setting — note: `optimizer` goes from string key to the
    # actual optimizer object here.
    optimizer = self._get_optimizer(optimizer, net, lr)
    if self.pre_trained and self.ckpt_point:
        checkpoint = torch.load(self.weight_path)
        optimizer.load_state_dict(checkpoint['optimizer'])

    if lr_scheduler is not None:
        lr_scheduler = self._get_lr_scheduler(lr_scheduler, optimizer)

    # loss_threshold = 1.0
    early_stopping = EarlyStopping(patience=20,verbose=True,monitor='val_loss',op_type='min')

    for epoch in range(self.start_epoch, self.n_epoch):
        train_loss, train_dice, train_acc = self._train_on_epoch(
            epoch, net, loss, optimizer, train_loader)
        val_loss, val_dice, val_acc = self._val_on_epoch(
            epoch, net, loss, val_path)

        # Scheduler is stepped on validation loss (ReduceLROnPlateau-style).
        if lr_scheduler is not None:
            lr_scheduler.step(val_loss)

        torch.cuda.empty_cache()

        print('epoch:{},train_loss:{:.5f},val_loss:{:.5f}'.format(
            epoch, train_loss, val_loss))
        print('epoch:{},train_dice:{:.5f},val_dice:{:.5f}'.format(
            epoch, train_dice, val_dice))

        self.writer.add_scalars('data/loss', {
            'train': train_loss,
            'val': val_loss
        }, epoch)
        self.writer.add_scalars('data/dice', {
            'train': train_dice,
            'val': val_dice
        }, epoch)
        self.writer.add_scalars('data/acc', {
            'train': train_acc,
            'val': val_acc
        }, epoch)
        self.writer.add_scalar('data/lr', optimizer.param_groups[0]['lr'],
                               epoch)

        early_stopping(val_loss)

        #save — only when validation loss improves on the running best.
        if val_loss <= self.loss_threshold:
            self.loss_threshold = val_loss

            if len(self.device.split(',')) > 1:
                state_dict = net.module.state_dict()
            else:
                state_dict = net.state_dict()

            saver = {
                'epoch': epoch,
                'save_dir': output_dir,
                'state_dict': state_dict,
                'optimizer': optimizer.state_dict()
            }

            file_name = 'epoch:{}-train_loss:{:.5f}-train_dice:{:.5f}-train_acc:{:.5f}-val_loss:{:.5f}-val_dice:{:.5f}-val_acc:{:.5f}.pth'.format(
                epoch, train_loss, train_dice, train_acc, val_loss, val_dice,
                val_acc)
            save_path = os.path.join(output_dir, file_name)
            print("Save as %s" % file_name)

            torch.save(saver, save_path)

        if early_stopping.early_stop:
            print('Early Stopping!')
            break

    self.writer.close()
def train(args):
    """Full training driver: build model/optimizer/scheduler from args, run
    an optional initial validation, then train with periodic mid-epoch
    validation every args.iter_val steps, saving the latest and best
    checkpoints.
    """
    print('start training...')
    model, model_file = create_model(args)
    #model = model.cuda()

    if torch.cuda.device_count() > 1:
        # Preserve the custom .name attribute across the DataParallel wrap.
        model_name = model.name
        model = DataParallel(model)
        model.name = model_name
    model = model.cuda()

    if args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=0.0001)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=0.0001)

    if args.lrs == 'plateau':
        # mode='max' — the plateau scheduler tracks top-1 accuracy below.
        lr_scheduler = ReduceLROnPlateau(optimizer,
                                         mode='max',
                                         factor=args.factor,
                                         patience=args.patience,
                                         min_lr=args.min_lr)
    else:
        lr_scheduler = CosineAnnealingLR(optimizer,
                                         args.t_max,
                                         eta_min=args.min_lr)
    #ExponentialLR(optimizer, 0.9, last_epoch=-1) #CosineAnnealingLR(optimizer, 15, 1e-7)

    # Only the val loader is needed up front; train loaders are rebuilt
    # (resampled) each epoch inside the loop below.
    if args.balanced:
        _, val_loader = get_balanced_train_val_loaders(
            num_classes=args.num_classes,
            start_index=args.start_index,
            batch_size=args.batch_size,
            val_batch_size=args.val_batch_size,
            val_num=args.val_num,
            other=args.other)
    else:
        _, val_loader = get_train_val_loaders(
            num_classes=args.num_classes,
            start_index=args.start_index,
            batch_size=args.batch_size,
            val_batch_size=args.val_batch_size,
            val_num=args.val_num,
            other=args.other)

    best_top1_acc = 0.

    print(
        'epoch |    lr    | %             | loss  | avg   | loss  | top1    | top10   |  best  | time | save |'
    )

    if not args.no_first_val:
        top10_acc, best_top1_acc, total_loss = validate(
            args, model, val_loader)
        print(
            'val   |          |               |       |       | {:.4f} | {:.4f}  | {:.4f}  | {:.4f} |        |        |'
            .format(total_loss, best_top1_acc, top10_acc, best_top1_acc))

    if args.val:
        return

    model.train()

    if args.lrs == 'plateau':
        lr_scheduler.step(best_top1_acc)
    else:
        lr_scheduler.step()

    train_iter = 0

    for epoch in range(args.start_epoch, args.epochs):
        # Fresh (re-sampled) loaders every epoch.
        if args.balanced:
            train_loader, val_loader = get_balanced_train_val_loaders(
                num_classes=args.num_classes,
                start_index=args.start_index,
                batch_size=args.batch_size,
                dev_mode=args.dev_mode,
                val_batch_size=args.val_batch_size,
                val_num=args.val_num,
                other=args.other)
        else:
            train_loader, val_loader = get_train_val_loaders(
                num_classes=args.num_classes,
                start_index=args.start_index,
                batch_size=args.batch_size,
                dev_mode=args.dev_mode,
                val_batch_size=args.val_batch_size,
                val_num=args.val_num,
                other=args.other)

        train_loss = 0

        current_lr = get_lrs(
            optimizer)  #optimizer.state_dict()['param_groups'][2]['lr']
        bg = time.time()
        for batch_idx, data in enumerate(train_loader):
            train_iter += 1
            img, target = data
            img, target = img.cuda(), target.cuda()
            # The model returns a per-sample loss; .sum()/batch gives the mean
            # (DataParallel gathers one partial loss per GPU).
            loss = model(img, target).sum() / img.size(0)
            #loss = criterion(args, output, target)
            #(img.size(0) * loss).backward()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()
            print('\r {:4d} | {:.6f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, float(current_lr[0]), args.batch_size * (batch_idx + 1),
                train_loader.num, loss.item(), train_loss / (batch_idx + 1)),
                  end='')

            # Mid-epoch validation + checkpointing every iter_val steps.
            if train_iter > 0 and train_iter % args.iter_val == 0:
                if isinstance(model, DataParallel):
                    torch.save(model.module.state_dict(),
                               model_file + '_latest')
                else:
                    torch.save(model.state_dict(), model_file + '_latest')

                top10_acc, top1_acc, total_loss = validate(
                    args, model, val_loader)

                _save_ckp = ''
                if args.always_save or top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    if isinstance(model, DataParallel):
                        torch.save(model.module.state_dict(), model_file)
                    else:
                        torch.save(model.state_dict(), model_file)
                    _save_ckp = '*'
                print(' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |'.
                      format(total_loss, top1_acc, top10_acc, best_top1_acc,
                             (time.time() - bg) / 60, _save_ckp))

                model.train()

                if args.lrs == 'plateau':
                    lr_scheduler.step(top1_acc)
                else:
                    lr_scheduler.step()

                current_lr = get_lrs(optimizer)
# 1. torch.Size([8, 1, 38, 252]) # 2. torch.Size([8, 50, 34, 228]) # 3. torch.Size([8, 50, 34, 76]) # 4. torch.Size([8, 50, 32, 72]) # 5. torch.Size([8, 50, 32, 24]) # 6. torch.Size([8, 38400]) # 7. torch.Size([8, 19200]) # 8. torch.Size([8, 9600]) # In[7]: net, loss = get_model() net = DataParallel(net) net = net.cuda() loss = loss.cuda() # In[29]: data_dir = 'someprocesseddata' dataset = data_loader(data_dir, win_width, kernel_size, overlap=True, phase='train') train_loader = DataLoader( dataset, batch_size=batch_size, shuffle=True, num_workers=n_workers,
def run(config):
    """Dispatch to test, validate, or train based on config flags.

    Test: wraps the model in DataParallel and runs Tester over the 'test'
    split. Val: a single validation pass at the configured start epoch.
    Train: the epoch loop with optional hard-mining resampling, dumping the
    per-epoch sample lists/weights as JSON and saving an emergency
    checkpoint on Ctrl-C.
    """
    from datasets import myDataset
    config, model, loss, warp, trainer, train_data, val_data, train_loader, val_loader = prepare(
        config)
    # print(model)
    # data, gt_prob_fpn, gt_coord_prob_fpn, gt_coord_diff_fpn, gt_diff_fpn, gt_connects_fpn, self.cases[idx] = train_data[0]
    # print(data.shape)
    # exit()
    if config.test:
        print('Start testing')
        #if hasattr(model, 'test'):
        #    model.forward = model.test
        model = DataParallel(model.cuda())
        tester = Tester(model, config)
        val_data = myDataset(config, 'test')
        # collate_fn=lambda x: x keeps raw per-case samples (no batching
        # transform); batch_size=1 — one case per step.
        test_loader = DataLoader(val_data,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=3,
                                 pin_memory=True,
                                 collate_fn=lambda x: x)
        tester.test(test_loader)
        return
    elif config.val:
        print('Start Val')
        start_epoch = config.train['start_epoch']
        trainer.validate(start_epoch, val_loader, save=True)
    else:
        start_epoch = config.train['start_epoch']
        epoch = config.train["epoch"]
        print('Start training from %d-th epoch' % start_epoch)
        epoch2loss = {}
        for i in range(start_epoch, epoch + 1):
            try:
                # no hardming
                if 'hardmining' in config.prepare and config.prepare[
                        'hardmining']:
                    # Resample the training set toward hard examples and dump
                    # the chosen samples/weights for this epoch as JSON.
                    train_loader.dataset.resample3()
                    json.dump(
                        [str(item) for item in train_loader.dataset.samples],
                        open(
                            os.path.join(trainer.save_dir,
                                         'sample_%d.json' % (i)), 'w'),
                        indent=2)
                    json.dump(
                        {
                            k: str(v)
                            for k, v in
                            train_loader.dataset.sample_weights.items()
                        },
                        open(
                            os.path.join(trainer.save_dir,
                                         'sample_weights_%d.json' % (i)),
                            'w'),
                        indent=2)
                    #json.dump({k: str(v) for k, v in train_loader.dataset.neg_sample_weights.items()}, open(os.path.join(trainer.save_dir, 'neg_sample_weights_%d.json'%(i)), 'w'), indent=2)
                loss_list = trainer.train(i, train_loader)
                epoch2loss[i] = list(loss_list)
                trainer.validate(i, val_loader)
            except KeyboardInterrupt as e:
                # Save an interrupt checkpoint (loss sentinel 1e10) and exit.
                traceback.print_exc()
                trainer.ioer.save_file(trainer.net, i, trainer.args, 1e10,
                                       isbreak=True)
                sys.exit(0)
        print(epoch2loss)
        with open('./epoch_loss.json', 'w') as f:
            f.write(json.dumps(epoch2loss))
def run(args):
    """Fine-tune a CNN (single-logit head) from a JSON config: train each
    epoch, validate, log to TensorBoard, and keep train.ckpt (latest) plus
    best.ckpt (lowest validation loss).
    """
    with open(args.cnn_path, 'r') as f:
        cnn = json.load(f)
    # NOTE(review): os.mkdir fails if the parent of save_path is missing;
    # os.makedirs would be more robust.
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)
    # Snapshot the config next to the results for reproducibility.
    with open(os.path.join(args.save_path, 'cnn.json'), 'w') as f:
        json.dump(cnn, f, indent=1)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device_ids
    # Scale batch sizes and worker counts with the number of visible GPUs.
    num_GPU = len(args.device_ids.split(','))
    batch_size_train = cnn['batch_size'] * num_GPU
    batch_size_valid = cnn['batch_size'] * num_GPU
    num_workers = args.num_workers * num_GPU

    model = chose_model(cnn)
    fc_features = model.fc.in_features
    # Replace the classifier head with a single logit (binary task,
    # paired with BCEWithLogitsLoss below).
    model.fc = nn.Linear(fc_features, 1)
    model = DataParallel(model, device_ids=None)
    model = model.cuda()
    loss_fn = BCEWithLogitsLoss().cuda()
    optimizer = SGD(model.parameters(), lr=cnn['lr'], momentum=cnn['momentum'])

    # dataset_train = ImageFolder(cnn['data_path_train'])
    # dataset_valid = ImageFolder(cnn['data_path_valid'])
    dataset_train = ImageDataset(cnn['data_path_train'],
                                 cnn['image_size'],
                                 cnn['crop_size'],
                                 cnn['normalize'])
    dataset_valid = ImageDataset(cnn['data_path_valid'],
                                 cnn['image_size'],
                                 cnn['crop_size'],
                                 cnn['normalize'])

    # NOTE(review): the training loader has no shuffle=True — confirm
    # whether ImageDataset shuffles internally; otherwise epochs see the
    # data in a fixed order.
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=batch_size_train,
                                  num_workers=num_workers)
    dataloader_valid = DataLoader(dataset_valid,
                                  batch_size=batch_size_valid,
                                  num_workers=num_workers)

    summary_train = {'epoch': 0, 'step': 0}
    summary_valid = {'loss': float('inf'), 'acc': 0}
    summary_writer = SummaryWriter(args.save_path)
    loss_valid_best = float('inf')
    for epoch in range(cnn['epoch']):
        summary_train = train_epoch(summary_train, summary_writer, cnn,
                                    model, loss_fn, optimizer,
                                    dataloader_train)
        # Always overwrite the rolling "latest" checkpoint (unwrapped via
        # .module so it loads on a non-DataParallel model).
        torch.save({'epoch': summary_train['epoch'],
                    'step': summary_train['step'],
                    'state_dict': model.module.state_dict()},
                   os.path.join(args.save_path, 'train.ckpt'))

        time_now = time.time()
        summary_valid = valid_epoch(summary_valid, model, loss_fn,
                                    dataloader_valid)
        time_spent = time.time() - time_now

        logging.info('{}, Epoch: {}, step: {}, Validation Loss: {:.5f}, '
                     'Validation ACC: {:.3f}, Run Time: {:.2f}'
                     .format(time.strftime("%Y-%m-%d %H:%M:%S"),
                             summary_train['epoch'], summary_train['step'],
                             summary_valid['loss'], summary_valid['acc'],
                             time_spent))

        summary_writer.add_scalar('valid/loss', summary_valid['loss'],
                                  summary_train['step'])
        summary_writer.add_scalar('valid/acc', summary_valid['acc'],
                                  summary_train['step'])

        # Keep the checkpoint with the best (lowest) validation loss.
        if summary_valid['loss'] < loss_valid_best:
            loss_valid_best = summary_valid['loss']
            torch.save({'epoch': summary_train['epoch'],
                        'step': summary_train['step'],
                        'state_dict': model.module.state_dict()},
                       os.path.join(args.save_path, 'best.ckpt'))

    summary_writer.close()