def _make_loader(self, batch_size=1):
    """
    Read the dataset root directories and split the file paths into
    a training set and a test set.
    :return: train_loader, test_loader
    """
    input_root = self.input_root
    ground_root = self.ground_root
    path_dict = {
        "img": input_root,
        "label": ground_root,
    }
    input_data_paths = list(os.listdir(input_root))
    training_set = [input_data_paths]
    training_len = math.ceil(self.k_fold * len(input_data_paths))
    self.joint_shuffle(training_set)
    train_img_paths = training_set[0][:training_len]
    test_img_paths = training_set[0][training_len:]
    train_dataset = KaggleDataset(path_dict, train_img_paths, ground_root)
    test_dataset = KaggleDataset(path_dict, test_img_paths, ground_root)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, test_loader
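# `joint_shuffle` is called above but not defined in this snippet. A minimal sketch of
# one plausible implementation (hypothetical helper on the same class, not the
# project's actual code): shuffle several parallel lists with one shared permutation
# so that images and their labels stay aligned.
import random

def joint_shuffle(self, lists, seed=None):
    # Shuffle every list in `lists` in place, using a single shared order.
    rng = random.Random(seed)
    order = list(range(len(lists[0])))
    rng.shuffle(order)
    for i, lst in enumerate(lists):
        lists[i] = [lst[j] for j in order]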
def prepare_dataset(args, category=None):
    if args.mode == 'train':
        train_csv = re.sub(r'\.csv$', '', args.in_train_csv)
        dataset_train = KaggleDataset(train_csv, transform=Compose(),
                                      img_folder=args.in_train_img,
                                      category=category, resize_scale=[128, 128])
        return dataset_train
    if args.mode == 'valid':
        valid_csv = re.sub(r'\.csv$', '', args.in_valid_csv)
        dataset_valid = KaggleDataset(valid_csv, transform=Compose(),
                                      img_folder=args.in_valid_img,
                                      category=category, resize_scale=[128, 128])
        return dataset_valid
    if args.mode == 'test':
        test_csv = re.sub(r'\.csv$', '', args.in_test_csv)
        dataset_test = KaggleDataset(test_csv, transform=Compose(),
                                     img_folder=args.in_test_img,
                                     category=category, resize_scale=[128, 128])
        return dataset_test
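# A minimal usage sketch for prepare_dataset (the argument names are taken from the
# function above; the CSV and image-folder paths are placeholders):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', default='train', choices=['train', 'valid', 'test'])
parser.add_argument('--in_train_csv', default='data/train.csv')
parser.add_argument('--in_train_img', default='data/train_images')
args = parser.parse_args([])  # use the defaults above for this sketch

dataset_train = prepare_dataset(args, category=None)
print(len(dataset_train))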
def main(tocsv=False, save=False, mask=False, valid_train=False, toiou=False):
    model_name = config['param']['model']
    resize = not config['valid'].getboolean('pred_orig_size')

    if model_name == 'unet_vgg16':
        model = UNetVgg16(3, 1, fixed_vgg=True)
    elif model_name == 'dcan':
        model = DCAN(3, 1)
    elif model_name == 'caunet':
        model = CAUNet()
    elif model_name == 'camunet':
        model = CAMUNet()
    else:
        model = UNet()
    if torch.cuda.is_available():
        model = model.cuda()
        # model = torch.nn.DataParallel(model).cuda()

    # Sets the model in evaluation mode.
    model.eval()

    epoch = load_ckpt(model)
    if epoch == 0:
        print("Aborted: checkpoint not found!")
        return

    # prepare dataset
    compose = Compose(augment=False, resize=resize)
    data_dir = 'data/stage1_train' if valid_train else 'data/stage1_test'
    dataset = KaggleDataset(data_dir, transform=compose)
    iter = predict(model, dataset, compose, resize)

    if tocsv:
        with open('result.csv', 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['ImageId', 'EncodedPixels'])
            for uid, _, y, y_c, y_m, _, _, _, _ in iter:
                for rle in prob_to_rles(y, y_c, y_m):
                    writer.writerow([uid, ' '.join([str(i) for i in rle])])
    elif toiou and valid_train:
        with open('iou.csv', 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['ImageId', 'IoU'])
            for uid, _, y, y_c, y_m, gt, _, _, _ in tqdm(iter):
                iou = get_iou(y, y_c, y_m, gt)
                writer.writerow([uid, iou])
    else:
        for uid, x, y, y_c, y_m, gt, gt_s, gt_c, gt_m in tqdm(iter):
            if valid_train:
                show_groundtruth(uid, x, y, y_c, y_m, gt, gt_s, gt_c, gt_m, save)
            elif mask:
                save_mask(uid, y, y_c, y_m)
            else:
                show(uid, x, y, y_c, y_m, save)
def main(resume=True, n_epoch=None, learn_rate=None):
    model_name = config['param']['model']
    if learn_rate is None:
        learn_rate = config['param'].getfloat('learn_rate')
    width = config.getint(model_name, 'width')
    weight_map = config['param'].getboolean('weight_map')
    c = config['train']
    log_name = c.get('log_name')
    n_batch = c.getint('n_batch')
    n_worker = c.getint('n_worker')
    n_cv_epoch = c.getint('n_cv_epoch')
    if n_epoch is None:
        n_epoch = c.getint('n_epoch')
    balance_group = c.getboolean('balance_group')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = build_model(model_name)
    model = model.to(device)

    # define optimizer
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=learn_rate,
        weight_decay=1e-6
    )

    # dataloader workers are forked processes, so we need an IPC manager
    # to keep the cache in the same memory space
    manager = Manager()
    cache = manager.dict()
    compose = Compose()

    # prepare dataset
    if os.path.exists('data/valid'):
        # advance mode: use valid folder as CV
        train_dataset = KaggleDataset('data/train', transform=compose, cache=cache)
        valid_dataset = KaggleDataset('data/valid', transform=compose, cache=cache)
    else:
        # auto mode: split part of train dataset as CV
        train_dataset = KaggleDataset('data/train', transform=compose, cache=cache, use_filter=True)
        train_dataset, valid_dataset = train_dataset.split()

    # decide whether to balance training set
    if balance_group:
        weights, ratio = train_dataset.class_weight()
        # len(weights) is the number of samples in one original epoch.
        # After oversampling for balance, the majority class is effectively under-sampled,
        # so multiplying by the ratio gives each sample a chance to be visited
        # at least once per epoch.
        sampler = WeightedRandomSampler(weights, int(len(weights) * ratio))
    else:
        sampler = RandomSampler(train_dataset)

    # data loader
    train_loader = DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=n_batch,
        num_workers=n_worker,
        pin_memory=torch.cuda.is_available())
    valid_loader = DataLoader(
        valid_dataset,
        shuffle=False,
        batch_size=n_batch,
        num_workers=n_worker)

    # resume checkpoint
    start_epoch = iou_tr = iou_cv = 0
    if resume:
        start_epoch = load_ckpt(model, optimizer)
    if start_epoch == 0:
        print('Brand new training ...')

    # put model to GPU
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    # decide log directory name
    log_dir = os.path.join(
        'logs', log_name, '{}-{}'.format(model_name, width),
        'ep_{},{}-lr_{}'.format(
            start_epoch,
            n_epoch + start_epoch,
            learn_rate,
        )
    )

    with SummaryWriter(log_dir) as writer:
        if start_epoch == 0 and False:
            # dump graph only for very first training, disabled by default
            dump_graph(model, writer, n_batch, width)
        print('Training started...')
        for epoch in range(start_epoch + 1, n_epoch + start_epoch + 1):  # 1-based
            iou_tr = train(train_loader, model, optimizer, epoch, writer)
            if len(valid_dataset) > 0 and epoch % n_cv_epoch == 0:
                with torch.no_grad():
                    iou_cv = valid(valid_loader, model, epoch, writer, len(train_loader))
            save_ckpt(model, optimizer, epoch, iou_tr, iou_cv)
        print('Training finished...')
def run_submit(args):
    augment = ['null']
    out_dir = args.out_dir + f'/{args.model_name}'
    initial_checkpoint = args.initial_checkpoint
    batch_size = args.batch_size

    ## setup out_dir
    os.makedirs(out_dir + '/submit', exist_ok=True)

    log = Logger()
    log.open(out_dir + '/log.submit.txt', mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')
    log.write('\tSEED = %u\n' % SEED)
    log.write('\t__file__ = %s\n' % __file__)
    log.write('\tout_dir = %s\n' % out_dir)
    log.write('\n')
    log.write('submitting .... @ %s\n' % str(augment))
    log.write('initial_checkpoint = %s\n' % initial_checkpoint)
    log.write('\n')

    if 1:  # save
        log.write('** dataset setting **\n')

        files_train = [f'train_image_data_{fid}.feather' for fid in range(4)]
        data = read_data(args.data_dir, files_train)
        df = pd.read_csv(args.df_path)
        valid_split = np.load(args.data_dir + '/valid_b_fold1_15985.npy').tolist()
        valid_df = df[df['image_id'].isin(valid_split)]

        test_dataset = KaggleDataset(
            df=df,
            data=data,
            idx=valid_df.index.values,
            augment=valid_augment,
        )
        log.write('\n')

        ## net
        log.write('** net setting **\n')
        if args.model_name == 'serex50':
            net = Serex50_Net().cuda()
        elif args.model_name == 'effnetb3':
            net = EfficientNet_3().cuda()
        else:
            raise NotImplementedError

        net.load_state_dict(
            torch.load(initial_checkpoint, map_location=lambda storage, loc: storage),
            strict=True)

        image_id, truth, probability = do_evaluate(net, test_dataset, batch_size, augment)

    if 1:  # save
        write_list_to_file(out_dir + '/submit/image_id.txt', image_id)
        write_pickle_to_file(out_dir + '/submit/probability.pickle', probability)
        write_pickle_to_file(out_dir + '/submit/truth.pickle', truth)

    if 1:
        image_id = read_list_from_file(out_dir + '/submit/image_id.txt')
        probability = read_pickle_from_file(out_dir + '/submit/probability.pickle')
        truth = read_pickle_from_file(out_dir + '/submit/truth.pickle')
    num_test = len(image_id)

    if 1:
        recall, average_recall = compute_kaggle_metric(probability, truth)
        log.write('average_recall : %f\n' % (average_recall))
        for i, name in enumerate(TASK_NAME):
            log.write('%28s %f\n' % (name, recall[i]))
        log.write('\n')
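# compute_kaggle_metric is defined elsewhere. For reference, the Bengali.AI competition
# metric is a weighted average of per-task macro recalls (grapheme root weighted 2,
# vowel and consonant diacritics weighted 1). A sketch with sklearn, assuming
# `probability` holds per-task class probabilities and `truth` per-task integer labels
# in that order (names and ordering are assumptions, not the project's exact code):
import numpy as np
from sklearn.metrics import recall_score

def compute_kaggle_metric_sketch(probability, truth):
    recalls = [recall_score(t, p.argmax(-1), average='macro')
               for p, t in zip(probability, truth)]
    average_recall = np.average(recalls, weights=[2, 1, 1])
    return recalls, average_recall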
def main(tocsv=False, save=False, mask=False, valid_train=False, toiou=False, submit_folder=False):
    model_name = config['param']['model']
    resize = not config['valid'].getboolean('pred_orig_size')

    if model_name == 'unet_vgg16':
        model = UNetVgg16(3, 1, fixed_vgg=True)
    elif model_name == 'dcan':
        model = DCAN(3, 1)
    elif model_name == 'caunet':
        model = CAUNet()
    elif model_name == 'camunet':
        model = CAMUNet()
    else:
        model = UNet()
    if torch.cuda.is_available():
        model = model.cuda()
        # model = torch.nn.DataParallel(model).cuda()

    # Sets the model in evaluation mode.
    model.eval()

    epoch = load_ckpt(model)
    if epoch == 0:
        print("Aborted: checkpoint not found!")
        return

    # prepare dataset
    compose = Compose(augment=False, resize=resize)
    # data_dir = 'data/stage1_train' if valid_train else 'data/stage1_test'
    # data_dir = 'data/stage1_train' if valid_train else '../bowl_classifier/stage2_test'
    data_dir = 'data/stage1_train' if valid_train else config['param']['CSV_PATH']
    print(data_dir)
    data_dir = re.sub(r'\.csv', '', data_dir)
    dataset = KaggleDataset(data_dir, transform=compose,
                            img_folder=config['param']['img_folder'])
    iter = predict(model, dataset, compose, resize)

    if tocsv:
        if valid_train:
            print('Saving %s/train_result.csv ...' % submit_folder)
            with open('%s/train_result.csv' % submit_folder, 'w') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['ImageId', 'EncodedPixels'])
                for uid, _, y, y_c, y_m, _, _, _, _ in iter:
                    for rle in prob_to_rles(y, y_c, y_m):
                        writer.writerow([uid, ' '.join([str(i) for i in rle])])
        else:
            print('Saving %s/test_result.csv ...' % submit_folder)
            with open('%s/test_result.csv' % submit_folder, 'w') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['ImageId', 'EncodedPixels'])
                for uid, _, y, y_c, y_m, _, _, _, _ in iter:
                    for rle in prob_to_rles(y, y_c, y_m):
                        writer.writerow([uid, ' '.join([str(i) for i in rle])])
    elif toiou and valid_train:
        print('Saving %s/iou_train.csv ...' % submit_folder)
        with open('%s/iou_train.csv' % submit_folder, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['ImageId', 'IoU'])
            for uid, _, y, y_c, y_m, gt, _, _, _ in tqdm(iter):
                iou = get_iou(y, y_c, y_m, gt)
                writer.writerow([uid, iou])
    else:
        for uid, x, y, y_c, y_m, gt, gt_s, gt_c, gt_m in tqdm(iter):
            if valid_train:
                show_groundtruth(uid, x, y, y_c, y_m, gt, gt_s, gt_c, gt_m, save)
            elif mask:
                save_mask(uid, y, y_c, y_m)
            else:
                show(uid, x, y, y_c, y_m, save)

    if valid_train:
        data_dir = 'data/stage1_valid'
        if not os.path.exists(data_dir):
            print('%s does not exist; %s/iou_valid.csv will not be generated.'
                  % (data_dir, submit_folder))
        else:
            dataset = KaggleDataset(data_dir, transform=compose)
            iter = predict(model, dataset, compose, resize)
            if toiou and valid_train:
                print('Saving %s/iou_valid.csv ...' % submit_folder)
                with open('%s/iou_valid.csv' % submit_folder, 'w') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow(['ImageId', 'IoU'])
                    for uid, _, y, y_c, y_m, gt, _, _, _ in tqdm(iter):
                        iou = get_iou(y, y_c, y_m, gt)
                        writer.writerow([uid, iou])
if torch.cuda.is_available():
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu
    device = torch.device("cuda", int(gpu))
    pos_weights = pos_weights.to(device)

if not os.path.exists(csv_output):
    os.mkdir(csv_output)

# TODO: replace these with the real variables before use
model = RecognizeModel()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
writer = tensorboard.SummaryWriter(training_log_dir)
picker = DataPicker(path_dict["img"], path_dict["label"], k_fold)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weights)
Dataset = KaggleDataset(path_dict)
DataLoader = None
iter_test_vals = []
iter_test_accuracy = []
model.to(device)

if not os.path.exists(model_output):
    os.mkdir(model_output)

for iter in range(iteration):
    train_loader, test_loader = picker.get_loader(batch_size=10)
    temp_test_vals = []
    temp_test_accuracy = []
    start = 0
def main(args):
    wandb.init(project="kaggle_cassava_leaf_disease_classification")
    wandb.run.name = args.config
    config = importlib.import_module(f"stage1.{args.config}")
    wandb.save(f"configs/{args.config}.py")
    os.makedirs(f"./result/{args.config}", exist_ok=True)
    config.fold_num = args.fold_num
    print(config.fold_num)

    if config.use_prev_data:
        df = pd.read_csv("./data/split_df.csv")
        # df = df_[~df_.image_id.isin(invalid_ids)].copy()
        # df_20 = df.loc[(df["source"] == 2020)].copy().reset_index(drop=True)
        # df_20["data_dir"] = "train_images"
        df_20 = df.loc[(df["source"] == 2020)].copy().reset_index(drop=True)
        df_20["data_dir"] = "train_images"
        df_19_0 = df.loc[(df["source"] == 2019) & (df["label"] == 0)].copy().reset_index(drop=True)
        df_19_0["data_dir"] = "train/cbb/"
        df_19_2 = df.loc[(df["source"] == 2019) & (df["label"] == 2)].copy().reset_index(drop=True)
        df_19_2["data_dir"] = "train/cgm/"
        df_19_4 = df.loc[(df["source"] == 2019) & (df["label"] == 4)].copy().reset_index(drop=True)
        df_19_4["data_dir"] = "train/healthy/"
        df = pd.concat([df_20, df_19_0, df_19_2, df_19_4], axis=0).reset_index(drop=True)
    else:
        df = pd.read_csv("./data/train.csv")
        # df = df_[~df_.image_id.isin(invalid_ids)].copy()

    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)
    y = df["label"].values
    skf = StratifiedKFold(n_splits=5)
    for fold_num, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[df.iloc[val_index].index, "kfold"] = fold_num

    train_df = df.loc[df["kfold"] != args.fold_num].reset_index(drop=True).copy()
    valid_df = df.loc[df["kfold"] == args.fold_num].reset_index(drop=True).copy()

    sampler = None
    if config.upsampling:
        # inverse-frequency sample weights for WeightedRandomSampler;
        # use float counts so the 0.7 down-weighting of class 3 is not truncated
        target = train_df.label
        class_sample_count = np.unique(target, return_counts=True)[1].astype(np.float64)
        class_sample_count[0] *= 1
        class_sample_count[1] *= 1
        class_sample_count[2] *= 1
        class_sample_count[3] *= 0.7
        class_sample_count[4] *= 1
        weight = 1. / class_sample_count
        samples_weight = weight[target]
        samples_weight = torch.from_numpy(samples_weight)
        sampler = torch.utils.data.WeightedRandomSampler(samples_weight, len(samples_weight))

    print("finish data setting")
    print(train_df.head())
    print(valid_df.head())

    train_dataset = KaggleDataset(
        df=train_df,
        transforms=config.train_transforms,
        preprocessing=config.preprocessing,
        mode="train",
    )
    validation_dataset = KaggleDataset(
        df=valid_df,
        transforms=config.valid_transforms,
        preprocessing=config.preprocessing,
        mode="val",
    )
    train_loader = DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=config.batch_size,
        pin_memory=True,
        num_workers=4,
    )
    valid_loader = DataLoader(
        validation_dataset,
        batch_size=config.batch_size,
        pin_memory=True,
        num_workers=4,
    )

    print("model setting")
    if config.resume_dir is None:
        if "efficientnet" in config.net_type:
            net = MODEL_LIST["effcientnet"](net_type=config.net_type, pretrained=True, bn=config.bn)
        elif "vit" in config.net_type:
            net = MODEL_LIST["vit"](net_type=config.net_type, pretrained=True)
        elif "res" in config.net_type:
            net = MODEL_LIST["resnet"](net_type=config.net_type, pretrained=True)
        elif "hrnet" in config.net_type:
            net = MODEL_LIST["hrnet"](net_type=config.net_type, pretrained=True)
    else:
        net = MODEL_LIST["pretrained_enet"](
            net_type=config.net_type,
            pretrained_path=f"./result/{config.resume_dir}/{config.resume_dir}_fold_{config.fold_num}/"
                            f"{config.resume_dir}_fold_{config.fold_num}_last-checkpoint.bin"
        )

    # if torch.cuda.device_count() > 1:
    #     net = torch.nn.DataParallel(net, device_ids=[0, 1, 2, 3])
    #     config.lr = config.lr * torch.cuda.device_count()
    net = net.to(device)
    wandb.watch(net, log="all")

    runner = PyTorchTrainer(model=net, device=device, config=config, fold_num=args.fold_num)
    if config.resume:
        print("load model")
        runner.load(
            f"./result/{config.dir}/{config.dir}_fold_{config.fold_num}/"
            f"{config.dir}_fold_{config.fold_num}_last-checkpoint.bin"
        )
    runner.fit(train_loader=train_loader, validation_loader=valid_loader)
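# Worked example of the inverse-frequency weighting used above (standalone sketch with
# a made-up label vector): rarer classes get a larger per-sample weight, so
# WeightedRandomSampler draws them more often.
import numpy as np
import torch

target = np.array([0, 0, 1, 1, 1, 1, 2])                       # hypothetical labels
class_sample_count = np.unique(target, return_counts=True)[1]  # array([2, 4, 1])
weight = 1. / class_sample_count                               # [0.5, 0.25, 1.0]
samples_weight = torch.from_numpy(weight[target])              # one weight per sample
sampler = torch.utils.data.WeightedRandomSampler(samples_weight, len(samples_weight))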
def main(ckpt, tocsv=False, save=False, mask=False, target='test', toiou=False):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # load one or more checkpoints
    models = []
    for fn in ckpt or [None]:
        # load model
        model = load_ckpt(filepath=fn)
        if not model:
            print("Aborted: checkpoint {} not found!".format(fn))
            return
        # Sets the model in evaluation mode.
        model.eval()
        # put model to GPU
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            model = nn.DataParallel(model)
        model = model.to(device)
        # append to model list
        models.append(model)

    resize = not config['valid'].getboolean('pred_orig_size')
    compose = Compose(augment=False, resize=resize)

    # decide which dataset to pick samples from
    data_dir = os.path.join('data', target)
    if target == 'test':
        dataset = KaggleDataset(data_dir, transform=compose)
    elif os.path.exists('data/valid'):
        # advance mode: use valid folder as CV
        dataset = KaggleDataset(data_dir, transform=compose)
    else:
        # auto mode: split part of train dataset as CV
        dataset = KaggleDataset('data/train', transform=compose, use_filter=True)
        if target == 'train':
            dataset, _ = dataset.split()
        elif target == 'valid':
            _, dataset = dataset.split()

    # iterate dataset and run inference on each sample
    ious = []
    writer = csvfile = None
    for data in tqdm(dataset):
        with torch.no_grad():
            uid, y, y_c, y_m = inference(data, models, resize)
            x, gt, gt_s, gt_c, gt_m = unpack_data(data, compose, resize)

        if tocsv:
            if writer is None:
                csvfile = open('result.csv', 'w')
                writer = csv.writer(csvfile)
                writer.writerow(['ImageId', 'EncodedPixels'])
            for rle in prob_to_rles(y, y_c, y_m):
                writer.writerow([uid, ' '.join([str(i) for i in rle])])
        elif toiou:
            assert target != 'test'
            if writer is None:
                csvfile = open('iou.csv', 'w')
                writer = csv.writer(csvfile)
                writer.writerow(['ImageId', 'IoU'])
            iou = get_iou(y, y_c, y_m, gt)
            writer.writerow([uid, iou])
            ious.append(iou)
        elif mask:
            save_mask(uid, y, y_c, y_m)
        elif target == 'test':
            show(uid, x, y, y_c, y_m, save)
        else:  # train or valid
            show_groundtruth(uid, x, y, y_c, y_m, gt, gt_s, gt_c, gt_m, save)
    # end of for-loop

    if csvfile is not None:
        csvfile.close()
    if toiou:
        print('\nIoU Metrics:\n mean: {0:.4f}\t std: {1:.4f}\t max: {2:.4f}\t min: {3:.4f}\t count: {4}\n'
              .format(np.mean(ious), np.std(ious), np.max(ious), np.min(ious), len(ious)))
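# prob_to_rles is defined elsewhere; for context, this is the standard run-length
# encoding expected by DSB-2018 submissions (1-indexed start pixels in column-major
# order followed by run lengths). Generic sketch, not the project's exact code:
import numpy as np

def rle_encode(mask):
    pixels = mask.T.flatten()                       # column-major (Fortran) order
    pixels = np.concatenate([[0], pixels, [0]])     # pad so edges produce transitions
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]                         # convert end positions to lengths
    return runs.tolist()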
                              map_location=lambda storage, loc: storage)
    net2.load_state_dict(state_dict, strict=True)
    moving_average(net, net2, 1. / (i + 2))

## dataset ----------------------------------------
files_train = [f'train_image_data_{fid}.feather' for fid in range(4)]
data = read_data(args.data_dir, files_train)
df = pd.read_csv(args.df_path)
train_split = np.load(args.data_dir + '/train_b_fold1_184855.npy').tolist()
train_df = df[df['image_id'].isin(train_split)]

train_dataset = KaggleDataset(
    df=df,
    data=data,
    idx=train_df.index.values,
    augment=train_augment if args.use_gridmask else valid_augment,
)
train_loader = DataLoader(train_dataset,
                          sampler=RandomSampler(train_dataset),
                          batch_size=args.batch_size,
                          drop_last=True,
                          num_workers=4,
                          pin_memory=True,
                          collate_fn=null_collate)

net.cuda()
bn_update(train_loader, net)
torch.save(net.state_dict(), args.out_dir + f'/{args.model_name}/' + output_name)
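# moving_average and bn_update come from the SWA-style weight-averaging utilities this
# snippet relies on. A typical implementation of the parameter-averaging step (a sketch,
# assuming both networks share the same architecture) looks like this:
def moving_average(net, net2, alpha=1.0):
    # Blend net2's weights into net: p <- (1 - alpha) * p + alpha * p2
    for param, param2 in zip(net.parameters(), net2.parameters()):
        param.data *= (1.0 - alpha)
        param.data += param2.data * alpha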
        # print('id_ ' + id_ + '\t' + f_content[id_])  # debug
        fn.write(f_content[id_])
fn.close()

# ---------------- UPDATE-CSV ----------------

CSV_FILE = re.sub(r'\.csv', '', CSV_OUT_PATH)
dataset_test = KaggleDataset(CSV_FILE, transform=Compose(),
                             img_folder=TEST_IMG_DIR, resize_scale=[128, 128])

confidence_alert = 0
valid_idx = range(len(dataset_test))
valid_loader = DataLoader(dataset_test,
                          sampler=SubsetRandomSampler(valid_idx),
                          batch_size=4, num_workers=2)

# network
net = VGG('VGG16')
print(net)
net.cuda()
net.eval()
net.load_state_dict(torch.load(MODEL_IN_PATH))

invert_majorlabel = {v: k for k, v in dataset_test.majorlabels.items()}

for i, data in enumerate(valid_loader, 0):
def main(resume=True, n_epoch=None, learn_rate=None):
    model_name = config['param']['model']
    cv_ratio = config['param'].getfloat('cv_ratio')
    if learn_rate is None:
        learn_rate = config['param'].getfloat('learn_rate')
    width = config[model_name].getint('width')
    weight_map = config['param'].getboolean('weight_map')
    c = config['train']
    log_name = c.get('log_name')
    n_batch = c.getint('n_batch')
    n_worker = c.getint('n_worker')
    n_ckpt_epoch = c.getint('n_ckpt_epoch')
    if n_epoch is None:
        n_epoch = c.getint('n_epoch')

    # initialize model
    if model_name == 'unet_vgg16':
        model = UNetVgg16(3, 1, fixed_vgg=True)
    elif model_name == 'dcan':
        model = DCAN(3, 1)
    elif model_name == 'caunet':
        model = CAUNet()
    elif model_name == 'camunet':
        model = CAMUNet()
    else:
        model = UNet()
    if torch.cuda.is_available():
        model = model.cuda()
        # model = torch.nn.DataParallel(model).cuda()

    # define optimizer
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=learn_rate,
        weight_decay=1e-6
    )

    # dataloader workers are forked processes, so we need an IPC manager
    # to keep the cache in the same memory space
    manager = Manager()
    cache = manager.dict()

    # prepare dataset and loader
    dataset = KaggleDataset('data/stage1_train', transform=Compose(), cache=cache)
    train_idx, valid_idx = dataset.split()
    train_loader = DataLoader(
        dataset,
        sampler=SubsetRandomSampler(train_idx),
        batch_size=n_batch,
        num_workers=n_worker,
        pin_memory=torch.cuda.is_available())
    valid_loader = DataLoader(
        dataset,
        sampler=SubsetRandomSampler(valid_idx),
        batch_size=n_batch,
        num_workers=n_worker)

    # resume checkpoint
    start_epoch = 0
    if resume:
        start_epoch = load_ckpt(model, optimizer)
    if start_epoch == 0:
        print('Brand new training ...')

    # decide log directory name
    log_dir = os.path.join(
        'logs', log_name, '{}-{}'.format(model_name, width),
        'ep_{},{}-lr_{}'.format(
            start_epoch,
            n_epoch + start_epoch,
            learn_rate,
        )
    )

    with SummaryWriter(log_dir) as writer:
        if start_epoch == 0 and False:
            # dump graph only for very first training, disabled by default
            dump_graph(model, writer, n_batch, width)
        print('Training started...')
        for epoch in range(start_epoch, n_epoch + start_epoch):
            train(train_loader, model, optimizer, epoch, writer)
            if cv_ratio > 0 and epoch % 3 == 2:
                valid(valid_loader, model, epoch, writer, len(train_loader))
            # save checkpoint every n epochs
            if epoch % n_ckpt_epoch == n_ckpt_epoch - 1:
                save_ckpt(model, optimizer, epoch + 1)
        print('Training finished...')
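# dataset.split() is assumed here to return two index lists consumed by
# SubsetRandomSampler. A minimal stand-in based on the cv_ratio config value might look
# like this (hypothetical helper, not the project's actual method):
import random

def split_indices(n_samples, cv_ratio=0.1, seed=0):
    # Hold out a cv_ratio fraction of the samples as the validation split.
    indices = list(range(n_samples))
    random.Random(seed).shuffle(indices)
    n_valid = int(n_samples * cv_ratio)
    return indices[n_valid:], indices[:n_valid]    # (train_idx, valid_idx)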
def run_train(args):
    out_dir = args.out_dir + '/' + args.model_name
    use_gridmask = args.use_gridmask
    initial_checkpoint = args.initial_checkpoint

    if args.scheduler_name == 'null':
        scheduler = NullScheduler(lr=0.001)
    else:
        scheduler = CyclicScheduler0(min_lr=0.00001, max_lr=0.00005, period=750, ratio=1)

    iter_accum = 1
    batch_size = args.batch_size

    # set up directories
    for f in ['checkpoint']:
        os.makedirs(out_dir + '/' + f, exist_ok=True)

    log = Logger()
    log.open(out_dir + '/log.train.txt', mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')
    log.write('\tSEED = %u\n' % SEED)
    log.write('\t__file__ = %s\n' % __file__)
    log.write('\tout_dir = %s\n' % out_dir)
    log.write('\n')

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')

    files_train = [f'train_image_data_{fid}.feather' for fid in range(4)]
    data = read_data(args.data_dir, files_train)
    df = pd.read_csv(args.df_path)
    train_split = np.load(args.data_dir + '/train_b_fold1_184855.npy').tolist()
    valid_split = np.load(args.data_dir + '/valid_b_fold1_15985.npy').tolist()
    train_df = df[df['image_id'].isin(train_split)]
    valid_df = df[df['image_id'].isin(valid_split)]

    train_dataset = KaggleDataset(
        df=df,
        data=data,
        idx=train_df.index.values,
        augment=train_augment if use_gridmask else valid_augment,
    )
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
        drop_last=True,
        num_workers=4,
        pin_memory=True,
        collate_fn=null_collate
    )

    valid_dataset = KaggleDataset(
        df=df,
        data=data,
        idx=valid_df.index.values,
        augment=valid_augment,
    )
    valid_loader = DataLoader(
        valid_dataset,
        sampler=SequentialSampler(valid_dataset),
        batch_size=batch_size,
        drop_last=False,
        num_workers=4,
        pin_memory=True,
        collate_fn=null_collate
    )

    assert len(train_dataset) >= batch_size
    log.write('batch_size = %d\n' % (batch_size))
    log.write('\n')

    ## net ----------------------------------------
    log.write('** net setting **\n')
    if args.model_name == 'serex50':
        net = Serex50_Net().cuda()
    elif args.model_name == 'effnetb3':
        net = EfficientNet_3().cuda()
    else:
        raise NotImplementedError

    log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint)
    if initial_checkpoint is not None:
        state_dict = torch.load(initial_checkpoint, map_location=lambda storage, loc: storage)
        net.load_state_dict(state_dict, strict=True)
    else:
        if args.model_name == 'serex50':
            net.load_pretrain(is_print=False)

    log.write('net=%s\n' % (type(net)))
    log.write('\n')

    if args.optimizer_name == 'AdamW':
        optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, net.parameters()),
                                      lr=scheduler(0), weight_decay=1e-4)
    else:
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                                    lr=scheduler(0), momentum=0.0, weight_decay=1e-4)

    num_iters = 3000 * 1000
    iter_smooth = 50
    iter_log = 250
    iter_valid = 500
    iter_save = [0, num_iters - 1] + list(range(0, num_iters, 1000))  # every 1*1000 iters

    start_iter = 0
    start_epoch = 0
    rate = 0
    if initial_checkpoint is not None:
        initial_optimizer = initial_checkpoint.replace('_model.pth', '_optimizer.pth')
        if os.path.exists(initial_optimizer):
            checkpoint = torch.load(initial_optimizer)
            start_iter = checkpoint['iter']
            start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])

    log.write('optimizer\n  %s\n' % (optimizer))
    log.write('scheduler\n  %s\n' % (scheduler))
    log.write('\n')

    ## start training here! ##############################################
    log.write('** start training here! **\n')
    log.write('   batch_size=%d,  iter_accum=%d\n' % (batch_size, iter_accum))
    log.write('   experiment = %s\n' % str(__file__.split('/')[-2:]))
    log.write('                 |----------------------- VALID ------------------------|------- TRAIN/BATCH -----------\n')
    log.write('rate     iter   epoch | kaggle | loss  acc | loss | time \n')
    log.write('----------------------------------------------------------------------------------------------------------------------\n')

    def message(rate, iter, epoch, kaggle, valid_loss, train_loss, batch_loss, mode='print'):
        if mode == 'print':
            asterisk = ' '
            loss = batch_loss
        if mode == 'log':
            asterisk = '*' if iter in iter_save else ' '
            loss = train_loss

        text = \
            '%0.5f %5.1f%s %4.1f | ' % (rate, iter / 1000, asterisk, epoch,) + \
            '%0.4f : %0.4f %0.4f %0.4f | ' % (kaggle[1], *kaggle[0]) + \
            '%4.4f, %4.4f, %4.4f : %4.4f, %4.4f, %4.4f | ' % (*valid_loss,) + \
            '%4.4f, %4.4f, %4.4f |' % (*loss,) + \
            '%s' % (time_to_str((timer() - start_timer), 'min'))
        return text

    kaggle = (0, 0, 0, 0)
    valid_loss = np.zeros(6, np.float32)
    train_loss = np.zeros(3, np.float32)
    batch_loss = np.zeros_like(train_loss)
    iter = 0
    i = 0

    start_timer = timer()
    while iter < num_iters:
        sum_train_loss = np.zeros_like(train_loss)
        sum_train = np.zeros_like(train_loss)

        optimizer.zero_grad()
        for t, (input, truth, infor) in enumerate(train_loader):
            input, truth, shuffled_truth, lam = cutmix(input, truth, alpha=0.3)
            batch_size = len(infor)
            iter = i + start_iter
            epoch = (iter - start_iter) * batch_size / len(train_dataset) + start_epoch

            if iter % iter_valid == 0:
                valid_loss, kaggle = do_valid(net, valid_loader, out_dir)

            if iter % iter_log == 0:
                print('\r', end='', flush=True)
                log.write(message(rate, iter, epoch, kaggle, valid_loss, train_loss, batch_loss, mode='log'))
                log.write('\n')

            if iter in iter_save:
                torch.save({
                    'optimizer': optimizer.state_dict(),
                    'iter': iter,
                    'epoch': epoch,
                }, out_dir + '/checkpoint/%08d_optimizer.pth' % (iter))
                if iter != start_iter:
                    torch.save(net.state_dict(), out_dir + '/checkpoint/%08d_model.pth' % (iter))

            # learning rate scheduler -------------
            lr = scheduler(iter)
            if lr < 0:
                break
            adjust_learning_rate(optimizer, lr)
            rate = get_learning_rate(optimizer)

            net.train()
            input = input.cuda()
            truth = [t.cuda() for t in truth]
            shuffled_truth = [t.cuda() for t in shuffled_truth]

            logit = net(input)
            probability = logit_to_probability(logit)

            loss = cutmix_criterion(logit, truth, shuffled_truth, lam)
            ((loss[0] + loss[1] + loss[2]) / iter_accum).backward()
            if iter % iter_accum == 0:
                optimizer.step()
                optimizer.zero_grad()

            loss = [l.item() for l in loss]
            l = np.array([*loss]) * batch_size
            n = np.array([1, 1, 1]) * batch_size
            batch_loss = l / (n + 1e-8)
            sum_train_loss += l
            sum_train += n
            if iter % iter_smooth == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train[...] = 0

            print('\r', end='', flush=True)
            print(message(rate, iter, epoch, kaggle, valid_loss, train_loss, batch_loss, mode='print'),
                  end='', flush=True)
            i = i + 1
        # -- end of one data loader --
    # -- end of all iterations --
    log.write('\n')
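# adjust_learning_rate and get_learning_rate are small optimizer helpers used in the
# loop above; a common implementation (sketch, assuming every parameter group is set to
# the same learning rate) is:
def adjust_learning_rate(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def get_learning_rate(optimizer):
    return optimizer.param_groups[0]['lr']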
def main():
    # wandb.init(project="kaggle_cassava_leaf_disease_classification")
    # wandb.run.name = args.config
    # wandb.save(f"configs/{args.config}.py")
    os.makedirs(f"./checkpoint/{config.dir}", exist_ok=True)
    config.fold_num = args.fold_num
    print(config.fold_num)

    invalid_ids = [
        '274726002.jpg', '9224019.jpg', '159654644.jpg', '199112616.jpg',
        '226533928.jpg', '262902341.jpg', '269713568.jpg', '384390206.jpg',
        '390601409.jpg', '421035788.jpg', '457405364.jpg', '600736721.jpg',
        '580111608.jpg', '616718743.jpg', '695438825.jpg', '723564013.jpg',
        '826231979.jpg', '847847826.jpg', '927165736.jpg', '1004389140.jpg',
        '1008244905.jpg', '1338159402.jpg', '1339403533.jpg', '1366430957.jpg',
        '9224019.jpg', '4269208386.jpg', '4239074071.jpg', '3810809174.jpg',
        '3652033201.jpg', '3609350672.jpg', '3609986814.jpg', '3477169212.jpg',
        '3435954655.jpg', '3425850136.jpg', '3251960666.jpg', '3252232501.jpg',
        '3199643560.jpg', '3126296051.jpg', '3040241097.jpg', '2981404650.jpg',
        '2925605732.jpg', '2839068946.jpg', '2698282165.jpg', '2604713994.jpg',
        '2415837573.jpg', '2382642453.jpg', '2321669192.jpg', '2320471703.jpg',
        '2278166989.jpg', '2276509518.jpg', '2262263316.jpg', '2182500020.jpg',
        '2139839273.jpg', '2084868828.jpg', '1848686439.jpg', '1689510013.jpg',
        '1359893940.jpg',
    ]

    if config.use_prev_data:
        df = pd.read_csv("./data/merged.csv")
        df = df[~df.image_id.isin(invalid_ids)]
        # df_20 = df.loc[(df["source"] == 2020)].copy().reset_index(drop=True)
        # df_20["data_dir"] = "train_images"
        df_20 = df.loc[(df["source"] == 2020) & (df["label"] != 3)].copy().reset_index(drop=True)
        df_20["data_dir"] = "train_images"
        df_20_3 = df.loc[(df["source"] == 2020) & (df["label"] == 3)].copy().reset_index(drop=True)
        df_20_3 = df_20_3.sample(frac=0.7)
        df_20_3["data_dir"] = "train_images"
        df_19_0 = df.loc[(df["source"] == 2019) & (df["label"] == 0)].copy().reset_index(drop=True)
        df_19_0["data_dir"] = "train/cbb/"
        df_19_2 = df.loc[(df["source"] == 2019) & (df["label"] == 2)].copy().reset_index(drop=True)
        df_19_2["data_dir"] = "train/cgm/"
        df_19_4 = df.loc[(df["source"] == 2019) & (df["label"] == 4)].copy().reset_index(drop=True)
        df_19_4["data_dir"] = "train/healthy/"
        df = pd.concat([df_20, df_20_3, df_19_0, df_19_2, df_19_4],
                       axis=0).reset_index(drop=True).sample(frac=0.2)
        # df = pd.concat([df_20, df_19_0, df_19_2, df_19_4], axis=0).reset_index(drop=True)
    else:
        df = pd.read_csv("./data/train.csv")

    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)
    y = df["label"].values
    skf = StratifiedKFold(n_splits=5)
    for fold_num, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[df.iloc[val_index].index, "kfold"] = fold_num

    train_df = df.loc[df["kfold"] != args.fold_num].reset_index(drop=True).copy()
    valid_df = df.loc[df["kfold"] == args.fold_num].reset_index(drop=True).copy()

    print("finish data setting")
    print(train_df.head())
    print(valid_df.head())

    train_dataset = KaggleDataset(
        df=train_df,
        transforms=config.train_transforms,
        preprocessing=config.preprocessing,
        mode="train",
        ind=False,
    )
    validation_dataset = KaggleDataset(
        df=valid_df,
        transforms=config.valid_transforms,
        preprocessing=config.preprocessing,
        mode="val",
        ind=False,
    )
    train_loader = DataLoader(
        train_dataset,
        # sampler=BalanceClassSampler(labels=train_dataset.get_labels(), mode="upsampling"),
        batch_size=45,
        pin_memory=True,
        num_workers=4,
    )
    valid_loader = DataLoader(
        validation_dataset,
        batch_size=32,
        pin_memory=True,
        num_workers=4,
    )

    print("model setting")
    # student and teacher networks for knowledge distillation
    s_net = CassavaNet(net_type=config.net_type, pretrained=True, bn=config.bn)
    t_net = MODEL_LIST["effcientnet"](net_type="tf_efficientnet_b4_ns", pretrained=True, bn=False)
    ch = torch.load(chs[args.fold_num])
    t_net.load_state_dict(ch["model_state_dict"], strict=True)
    t_net = t_net.cuda()
    t_net.eval()

    optimizer, scheduler = get_optimizer(s_net, config.optimizer_name,
                                         config.optimizer_params,
                                         config.scheduler_name,
                                         config.scheduler_params,
                                         config.n_epochs)
    criterion = SoftTarget(T=4.0).cuda()
    # wandb.watch(net, log="all")

    logname = (f"checkpoint/{config.dir}/" + s_net.__class__.__name__ +
               '_' + "stage2_" + f'{args.fold_num}.csv')
    if not os.path.exists(logname):
        with open(logname, 'w') as logfile:
            logwriter = csv.writer(logfile, delimiter=',')
            logwriter.writerow(['epoch', 'train loss', 'train acc', 'test loss', 'test acc'])

    start_epoch = 0
    s_net = s_net.to(device)

    for epoch in range(start_epoch, config.n_epochs):
        print("lr: ", optimizer.param_groups[0]['lr'])
        if epoch < config.freeze_bn_epoch:
            print("freeze_batch_norm")
            s_net.freeze_batchnorm_stats()
        train_loss, train_acc = train(epoch, train_loader, t_net, s_net, criterion, optimizer)
        test_loss, test_acc = test(epoch, valid_loader, t_net, s_net, criterion)
        with open(logname, 'a') as logfile:
            logwriter = csv.writer(logfile, delimiter=',')
            logwriter.writerow([epoch, train_loss, train_acc, test_loss, test_acc])
        scheduler.step()
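# SoftTarget(T=4.0) is the soft-label distillation criterion used above. The standard
# formulation (a sketch of the usual definition, not necessarily the exact class used
# here) is temperature-scaled KL divergence between student and teacher logits:
import torch.nn as nn
import torch.nn.functional as F

class SoftTargetSketch(nn.Module):
    def __init__(self, T=4.0):
        super().__init__()
        self.T = T  # softening temperature

    def forward(self, student_logits, teacher_logits):
        # Scale by T^2 so gradient magnitudes stay comparable across temperatures.
        return F.kl_div(F.log_softmax(student_logits / self.T, dim=1),
                        F.softmax(teacher_logits / self.T, dim=1),
                        reduction='batchmean') * (self.T ** 2)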