def setup(args, cfg, train_ann, valid_ann):
    """Build data loaders, model, optimizer, and scheduler from `cfg`.

    Returns (cfg, train_loader, valid_loader, model, optimizer, scheduler).
    """
    logger = logging.getLogger('root')

    train_loader = builder.build_dataloader(cfg, ann=train_ann, mode='train')
    valid_loader = builder.build_dataloader(cfg, ann=valid_ann, mode='valid')

    # Adjust steps per epoch if necessary (i.e., equal to 0).
    # We assume that if gradient accumulation is specified, the user has
    # already adjusted steps_per_epoch accordingly in the config file.
    train_params = cfg['train']['params']
    gradient_accmul = train_params['gradient_accumulation']
    if train_params['steps_per_epoch'] == 0:
        train_params['steps_per_epoch'] = len(train_loader)

    logger.info('Building [{}] architecture ...'.format(cfg['model']['config']))
    model = builder.build_model(cfg, args.gpu)
    model = model.train().cuda()

    optimizer = builder.build_optimizer(
        cfg['optimizer']['name'], model, cfg['optimizer']['params'])
    scheduler = builder.build_scheduler(
        cfg['scheduler']['name'], optimizer, cfg=cfg)

    return cfg, train_loader, valid_loader, model, optimizer, scheduler
def test(args, cfg, test_df):
    """Run inference over the test split and pickle y_true / y_pred.

    `test_df` may be replaced by cfg['test']['csv_filename'], and the
    dataset directory by cfg['test']['data_dir'], when those are set.
    """
    # Idiom fix: `'k' in d.keys()` + truthiness check collapses to dict.get.
    if cfg['test'].get('csv_filename'):
        test_df = pd.read_csv(cfg['test']['csv_filename'])
    logger = logging.getLogger('root')
    logger.info('TESTING : START')
    logger.info('TEST: n={}'.format(len(test_df)))
    if cfg['test'].get('data_dir'):
        cfg['dataset']['data_dir'] = cfg['test']['data_dir']
    # NOTE(review): part 45 is excluded from testing — confirm intent.
    test_df = test_df[test_df['part'] != 45]
    test_images = [
        osp.join(cfg['dataset']['data_dir'], '{}'.format(_))
        for _ in test_df['vidfile']
    ]
    test_labels = np.asarray(test_df['label'])
    test_loader = builder.build_dataloader(
        cfg,
        data_info={'vidfiles': test_images, 'labels': test_labels},
        mode='test')
    # Weights come from the checkpoint below; skip pretrained downloads.
    cfg['model']['params']['pretrained'] = None
    model = builder.build_model(cfg['model']['name'], cfg['model']['params'])
    model.load_state_dict(
        torch.load(cfg['test']['checkpoint'],
                   map_location=lambda storage, loc: storage))
    model = model.eval().cuda()
    # Idiom fix: `type(x) == type(None)` -> `is None` (via dict.get).
    if cfg['test'].get('params') is None:
        cfg['test']['params'] = {}
    predictor = getattr(factory_evaluate, cfg['test']['predictor'])
    predictor = predictor(loader=test_loader, **cfg['test']['params'])
    y_true, y_pred, _ = predictor.predict(model, criterion=None, epoch=None)
    if not osp.exists(cfg['test']['save_preds_dir']):
        os.makedirs(cfg['test']['save_preds_dir'])
    with open(osp.join(cfg['test']['save_preds_dir'], 'predictions.pkl'),
              'wb') as f:
        pickle.dump(
            {
                'y_true': y_true,
                'y_pred': y_pred,
                'imgfiles': [im.split('/')[-1] for im in test_images]
            }, f)
def predict(args, cfg):
    """Run bone-age inference on a CSV-listed image set and pickle preds."""
    df = pd.read_csv(cfg['predict']['csv_filename'])
    logger = logging.getLogger('root')
    logger.info('PREDICT : START')
    logger.info('PREDICT: n={}'.format(len(df)))
    images = [osp.join(cfg['predict']['data_dir'], _) for _ in df['imgfile']]
    male = list(df['male'].astype('float32'))
    # Optional hand-crop bounding boxes.
    if cfg['predict']['coords']:
        coords = {k: np.asarray(df[k]) for k in ['x1', 'y1', 'x2', 'y2']}
    else:
        coords = None
    loader = builder.build_dataloader(
        cfg,
        data_info={'imgfiles': images,
                   'labels': [0] * len(images),
                   'male': male,
                   'coords': coords},
        mode='predict')
    model = builder.build_model(cfg['model']['name'], cfg['model']['params'])
    model.load_state_dict(
        torch.load(cfg['predict']['checkpoint'],
                   map_location=lambda storage, loc: storage))
    model = model.eval().cuda()
    if cfg['predict']['params'] is None:
        cfg['predict']['params'] = {}
    # Propagate patch-based evaluation settings into the predictor params.
    if 'patch' in cfg['evaluation']['params'].keys():
        cfg['predict']['params']['patch'] = cfg['evaluation']['params']['patch']
    predictor = getattr(factory.evaluate, cfg['predict']['predictor'])
    predictor = predictor(loader=loader, **cfg['predict']['params'])
    _, y_pred, _ = predictor.predict(model, criterion=None, epoch=None)
    # BUGFIX: 'patch' is only present in params when the evaluation config
    # defines it, so direct indexing could raise KeyError; use .get().
    if 'percentile' in cfg['predict'].keys() and cfg['predict']['params'].get('patch'):
        # Collapse per-patch predictions to a single percentile per image.
        y_pred = np.percentile(y_pred, cfg['predict']['percentile'], axis=1)
    if not osp.exists(cfg['predict']['save_preds_dir']):
        os.makedirs(cfg['predict']['save_preds_dir'])
    with open(osp.join(cfg['predict']['save_preds_dir'], 'predictions.pkl'),
              'wb') as f:
        pickle.dump({
            'y_pred': y_pred,
            'imgfiles': [im.split('/')[-1] for im in images]
        }, f)
def prepare_input(self, image):
    """Write `image` to a temporary PNG and build a single-sample input.

    Returns (img, img_meta): the dataset's image tensor and a dict mapping
    index 0 to the remaining per-sample metadata.
    """
    # Need to save a temporary image file because the dataset pipeline
    # reads images from disk.
    rand_img = os.path.join("/tmp", "img-{}.png".format(np.random.randint(1e10)))
    # Grayscale (H, W) input is replicated to a 3-channel image.
    if image.ndim == 2:
        X = np.expand_dims(image, axis=-1)
        X = np.repeat(X, 3, axis=-1)
    else:
        X = image
    cv2.imwrite(rand_img, X)
    ann = [{
        "filename": rand_img,
        "height": X.shape[0],
        "width": X.shape[1]
    }]
    self.cfg["dataset"]["data_dir"] = "/tmp"
    loader = builder.build_dataloader(self.cfg, ann, mode="predict")
    data = loader.dataset[0]
    img = data.pop("img")
    img_meta = {0: data}
    # BUGFIX: delete the temp file directly instead of shelling out via
    # `os.system("rm ...")` — portable, no subshell, surfaces errors.
    os.remove(rand_img)
    return img, img_meta
def predict(args, cfg):
    """Ensemble prediction over parquet shards; writes a submission CSV.

    Per-model prediction dicts are combined with normalized
    cfg['model_weights'], shards are stacked row-wise to align with
    image ids, and argmax targets are emitted in the Bengali grapheme
    submission format.
    """
    logger = logging.getLogger('root')
    logger.info('PREDICT : START')
    # Default to equal weighting when no model weights are given.
    if 'model_weights' not in cfg.keys() or cfg['model_weights'] is None:
        cfg['model_weights'] = [1.] * len(cfg['model_configs'])
    assert len(cfg['model_weights']) == len(cfg['model_configs'])
    if 'data_dir' in cfg['predict'].keys():
        if cfg['predict']['data_dir']:
            cfg['dataset']['data_dir'] = cfg['predict']['data_dir']
    # BUGFIX: wrap a scalar path in a one-element list; the previous
    # `list(...)` call would split a string path into single characters.
    if not isinstance(cfg['predict']['path_to_parquet'], list):
        cfg['predict']['path_to_parquet'] = [cfg['predict']['path_to_parquet']]
    assert cfg['dataset']['name'] == 'BengaliParquetDataset'
    predict_parquets = [
        osp.join(cfg['dataset']['data_dir'], path_to_parquet)
        for path_to_parquet in cfg['predict']['path_to_parquet']
    ]
    model_configs = []
    for cfgfile in cfg['model_configs']:
        with open(cfgfile) as f:
            model_cfg = yaml.load(f, Loader=yaml.FullLoader)
        # Weights come from checkpoints below; skip pretrained downloads.
        model_cfg['model']['params']['pretrained'] = None
        if 'grapheme_model_checkpoint' in model_cfg['model']['params'].keys():
            model_cfg['model']['params']['grapheme_model_checkpoint'] = None
        model_configs.append(model_cfg)

    def create_model(cfg):
        # Build one model from its own config and load its test checkpoint.
        model = builder.build_model(cfg['model']['name'],
                                    cfg['model']['params'])
        model.load_state_dict(
            torch.load(cfg['test']['checkpoint'],
                       map_location=lambda storage, loc: storage))
        model = model.eval().cuda()
        return model

    models = [create_model(model_cfg) for model_cfg in model_configs]
    cfg['predict']['labels_available'] = False
    image_ids = []
    y_pred_list = []
    for parquet in predict_parquets:
        loader = builder.build_dataloader(
            cfg, data_info={'path_to_parquet': parquet}, mode='predict')
        image_ids.extend(list(loader.dataset.image_ids))
        predictor = getattr(factory_evaluate, cfg['predict']['predictor'])
        predictor = predictor(loader=loader, **cfg['predict']['params'])
        y_pred = []
        for m in models:
            _, single_y_pred, _ = predictor.predict(m, criterion=None,
                                                    epoch=None)
            y_pred.append(single_y_pred)
        y_pred_list.append(y_pred)
    # BUGFIX: force a float dtype so in-place normalization cannot fail on
    # an integer array (e.g., weights specified as [1, 1] in the config).
    weights = np.asarray(cfg['model_weights'], dtype='float64')
    weights /= weights.sum()
    # Weighted average of each model's prediction dict, per parquet shard.
    averaged_pred_list = []
    for y_pred in y_pred_list:
        averaged = copy.deepcopy(y_pred[0])
        for k, v in averaged.items():
            averaged[k] = v * weights[0]
        for ind, each_y_pred in enumerate(y_pred[1:]):
            for k, v in each_y_pred.items():
                averaged[k] += v * weights[ind + 1]
        averaged_pred_list.append(averaged)
    # Stack shard predictions row-wise so rows align with `image_ids`.
    combined_preds = averaged_pred_list[0]
    for pred in averaged_pred_list[1:]:
        for k in combined_preds.keys():
            combined_preds[k] = np.vstack((combined_preds[k], pred[k]))
    row_id = []
    target = []
    for ind, i in enumerate(image_ids):
        for k in combined_preds.keys():
            row_id.append('{}_{}'.format(i, k))
            target.append(np.argmax(combined_preds[k][ind]))
    submission = pd.DataFrame({'row_id': row_id, 'target': target})
    submission.to_csv(cfg['predict']['submission_csv'], index=False)
def setup(args, cfg, train_df, valid_df):
    """Build loaders, model, loss, optimizer, and scheduler for training.

    Dispatches on dataset type: FaceMaskDataset uses image + mask files;
    all other datasets use video files.
    """
    logger = logging.getLogger('root')
    if cfg['dataset']['name'] == 'FaceMaskDataset':
        train_images = [
            osp.join(cfg['dataset']['data_dir'], '{}'.format(_))
            for _ in train_df['imgfile']
        ]
        valid_images = [
            osp.join(cfg['dataset']['data_dir'], '{}'.format(_))
            for _ in valid_df['imgfile']
        ]
        train_labels = np.asarray(train_df['label'])
        valid_labels = np.asarray(valid_df['label'])
        train_masks = [
            osp.join(cfg['dataset']['data_dir'], '{}'.format(_))
            for _ in train_df['maskfile']
        ]
        valid_masks = [
            osp.join(cfg['dataset']['data_dir'], '{}'.format(_))
            for _ in valid_df['maskfile']
        ]
        train_loader = builder.build_dataloader(
            cfg,
            data_info={'imgfiles': train_images,
                       'maskfiles': train_masks,
                       'labels': train_labels},
            mode='train')
        valid_loader = builder.build_dataloader(
            cfg,
            data_info={'imgfiles': valid_images,
                       'maskfiles': valid_masks,
                       'labels': valid_labels},
            mode='valid')
    else:
        train_images = [
            osp.join(cfg['dataset']['data_dir'], '{}'.format(_))
            for _ in train_df['vidfile']
        ]
        valid_images = [
            osp.join(cfg['dataset']['data_dir'], '{}'.format(_))
            for _ in valid_df['vidfile']
        ]
        train_labels = np.asarray(train_df['label'])
        valid_labels = np.asarray(valid_df['label'])
        train_loader = builder.build_dataloader(
            cfg,
            data_info={'vidfiles': train_images, 'labels': train_labels},
            mode='train')
        valid_loader = builder.build_dataloader(
            cfg,
            data_info={'vidfiles': valid_images, 'labels': valid_labels},
            mode='valid')
    # Adjust steps per epoch if necessary (i.e., equal to 0)
    # We assume if gradient accumulation is specified, then the user
    # has already adjusted the steps_per_epoch accordingly in the
    # config file
    steps_per_epoch = cfg['train']['params']['steps_per_epoch']
    gradient_accmul = cfg['train']['params']['gradient_accumulation']
    if steps_per_epoch == 0:
        cfg['train']['params']['steps_per_epoch'] = len(train_loader)
    # BUGFIX: removed a half-commented gradient-accumulation adjustment;
    # a stray live statement referenced the undefined `new_steps_per_epoch`
    # and would have raised NameError at runtime.
    # Generic build function will work for model/loss
    logger.info('Building [{}] architecture ...'.format(cfg['model']['name']))
    if 'backbone' in cfg['model']['params'].keys():
        logger.info(' Using [{}] backbone ...'.format(
            cfg['model']['params']['backbone']))
    if 'pretrained' in cfg['model']['params'].keys():
        logger.info(' Pretrained weights : {}'.format(
            cfg['model']['params']['pretrained']))
    model = builder.build_model(cfg['model']['name'], cfg['model']['params'])
    model = model.train().cuda()
    if cfg['loss']['params'] is None:
        cfg['loss']['params'] = {}
    # OHEM losses anneal over the whole run; they need the total step count.
    if re.search(r'^OHEM', cfg['loss']['name']):
        cfg['loss']['params']['total_steps'] = \
            cfg['train']['params']['num_epochs'] * \
            cfg['train']['params']['steps_per_epoch']
    criterion = builder.build_loss(cfg['loss']['name'], cfg['loss']['params'])
    optimizer = builder.build_optimizer(cfg['optimizer']['name'],
                                        model.parameters(),
                                        cfg['optimizer']['params'])
    scheduler = builder.build_scheduler(cfg['scheduler']['name'],
                                        optimizer,
                                        cfg=cfg)
    if len(args.gpu) > 1:
        model = nn.DataParallel(model, device_ids=args.gpu)
    return cfg, \
        train_loader, \
        valid_loader, \
        model, \
        optimizer, \
        criterion, \
        scheduler
'consonant_diacritic': train_df['consonant_diacritic'].iloc[rownum] } train_labels.append(rowlabel) valid_labels = [] for rownum in range(len(valid_df)): rowlabel = { 'grapheme_root': valid_df['grapheme_root'].iloc[rownum], 'vowel_diacritic': valid_df['vowel_diacritic'].iloc[rownum], 'consonant_diacritic': valid_df['consonant_diacritic'].iloc[rownum] } valid_labels.append(rowlabel) train_loader = builder.build_dataloader(cfg, data_info={ 'imgfiles': train_images, 'labels': train_labels }, mode='train') valid_loader = builder.build_dataloader(cfg, data_info={ 'imgfiles': valid_images, 'labels': valid_labels }, mode='valid') model = builder.build(models, cfg['model']['name'], cfg['model']['params']) model = model.train().cuda() criterion = builder.build(losses, cfg['loss']['name'], cfg['loss']['params']) predictor = evaluate.Predictor(loader=valid_loader) y_true, y_pred = predictor.predict(model, criterion, 0)
osp.join(cfg['dataset']['data_dir'], _) for _ in valid_df['imgfile'] ] train_labels = list(train_df['boneage']) valid_labels = list(valid_df['boneage']) train_male = list(train_df['male'].astype('float32')) valid_male = list(valid_df['male'].astype('float32')) train_coords = {k: np.asarray(train_df[k]) for k in ['x1', 'y1', 'x2', 'y2']} valid_coords = {k: np.asarray(valid_df[k]) for k in ['x1', 'y1', 'x2', 'y2']} train_loader = builder.build_dataloader(cfg, data_info={ 'imgfiles': train_images, 'labels': train_labels, 'male': train_male, 'coords': train_coords }, mode='train') valid_loader = builder.build_dataloader(cfg, data_info={ 'imgfiles': valid_images, 'labels': valid_labels, 'male': valid_male, 'coords': valid_coords }, mode='valid') train_iter = iter(train_loader) valid_iter = iter(valid_loader)
def predict_kfold(args, cfg):
    """Ensemble inference across k-fold checkpoints of a single config.

    When --checkpoint_dir is given, the best checkpoint per fold is chosen
    by the metric embedded in its filename (``...-<metric>.PTH``), and the
    mean best metric is printed as the k-fold CV estimate.
    """
    if args.save_file:
        cfg['predict']['savefile'] = args.save_file
    if args.model_config:
        cfg['model_configs'] = [args.model_config]
    if args.checkpoint_dir:
        # format will be config/backbone/fold/checkpoint.pth
        checkpoints = glob.glob(osp.join(args.checkpoint_dir, 'fold*', '*.PTH'))
        checkpoint_dict = defaultdict(list)
        for ckpt in checkpoints:
            fold = ckpt.split('/')[-2]
            checkpoint_dict[fold] += [ckpt]
        best_metric_per_fold = []
        for k, v in checkpoint_dict.items():
            # Metric is the last '-'-separated token of the filename.
            metrics = [float(_.split('/')[-1].split('-')[-1].replace('.PTH', ''))
                       for _ in v]
            checkpoint_dict[k] = v[np.argmax(metrics)]
            best_metric_per_fold += [np.max(metrics)]
        print(f'KFOLD CV: {np.mean(best_metric_per_fold):.4f}')
        cfg['model_checkpoints'] = [v for v in checkpoint_dict.values()]
    save_dir = osp.dirname(cfg['predict']['savefile'])
    if not osp.exists(save_dir):
        os.makedirs(save_dir)
    print(f'Saving predictions to {cfg["predict"]["savefile"]} ...')
    # Get data directory
    data_dir = cfg['dataset']['data_dir']
    if 'csv_filename' in cfg['dataset']:
        df = pd.read_csv(cfg['dataset']['csv_filename'])
        images = [osp.join(data_dir, f'{_}.jpg') for _ in df['image_name']]
    else:
        # Idiom fix: removed a redundant nested osp.join call.
        images = glob.glob(osp.join(data_dir, '*'))
    data_info = {
        'imgfiles': images,
        'labels': [0] * len(images),
    }
    if cfg['dataset'].pop('meta', False):
        data_info['meta'] = [
            dict(age=row['age_cat'], sex=row['sex'],
                 ant=row['anatom_site_general_challenge'])
            for rownum, row in df.iterrows()
        ]
    if 'params' not in cfg['predict'] or not isinstance(cfg['predict']['params'], dict):
        cfg['predict']['params'] = {}
    print('PREDICTING : START')
    print('PREDICT (N={})'.format(len(images)))
    assert 'model_checkpoints' in cfg and isinstance(cfg['model_checkpoints'], list)
    assert len(np.unique(cfg['model_configs'])) == 1, \
        'Different model config files have been specified, please use `predict()`'
    with open(cfg['model_configs'][0]) as f:
        model_cfg = yaml.load(f, Loader=yaml.FullLoader)
    if args.backbone:
        model_cfg['model']['params']['backbone'] = args.backbone
    # Inference-only config: no pretrained download, no augmentation.
    model_cfg['model']['params']['pretrained'] = None
    model_cfg['transform']['augment'] = None
    model_cfg['transform']['params'] = None
    model_cfg['transform']['num_workers'] = args.num_workers
    model_cfg['dataset'] = cfg['dataset']
    loader = builder.build_dataloader(model_cfg, data_info=data_info, mode='predict')

    def create_model(cfg, checkpoint):
        # Build one model and load (possibly DataParallel-prefixed) weights.
        model = builder.build_model(cfg['model']['name'], cfg['model']['params'])
        print('Loading <{}> model from <{}> ...'.format(cfg['model']['name'], checkpoint))
        weights = torch.load(checkpoint, map_location=lambda storage, loc: storage)
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.load_state_dict(weights)
        model = model.eval().cuda()
        return model

    models = [create_model(model_cfg, ckpt) for ckpt in cfg['model_checkpoints']]
    print(f'{len(models)} models will be used for inference ...')
    final_preds = []
    for batch, labels in tqdm(loader):
        model_preds = []
        batch, labels = cudaify(batch, labels)
        for m in models:
            with torch.no_grad():
                output = m(batch)
            # Multiclass heads -> softmax; single-logit heads -> sigmoid.
            if m.fc.out_features > 1:
                output = torch.softmax(output, dim=1)
            else:
                output = torch.sigmoid(output)
            model_preds += [output.cpu().numpy()]
        final_preds += [model_preds]
    pickled = {
        'image_id': [_.split('/')[-1] for _ in images],
        'label': final_preds
    }
    with open(cfg['predict']['savefile'], 'wb') as f:
        pickle.dump(pickled, f)
def predict(args, cfg):
    """Run inference with an ensemble of model configs and pickle the
    per-model predictions (intentionally not averaged)."""
    # Get data directory
    data_dir = cfg['dataset']['data_dir']
    if 'csv_filename' in cfg['dataset']:
        df = pd.read_csv(cfg['dataset']['csv_filename'])
        images = [osp.join(data_dir, f'{_}.jpg') for _ in df['image_name']]
    else:
        # Idiom fix: removed a redundant nested osp.join call.
        images = glob.glob(osp.join(data_dir, '*'))
    data_info = {
        'imgfiles': images,
        'labels': [0] * len(images),
    }
    if cfg['dataset'].pop('meta', False):
        data_info['meta'] = [
            dict(age=row['age_cat'], sex=row['sex'],
                 ant=row['anatom_site_general_challenge'])
            for rownum, row in df.iterrows()
        ]
    if 'params' not in cfg['predict'] or not isinstance(cfg['predict']['params'], dict):
        cfg['predict']['params'] = {}
    loader = builder.build_dataloader(cfg, data_info=data_info, mode='predict')
    # Optional ArcFace-style reference loader of known melanoma examples.
    if 'arc' in cfg and cfg['arc']:
        assert 'arc_csvfile' in cfg['dataset']
        assert 'arc_datadir' in cfg['dataset']
        df = pd.read_csv(cfg['dataset']['arc_csvfile'])
        mel_df = df[df['label'] == 1].drop_duplicates()
        arc_info = {
            'imgfiles': [osp.join(cfg['dataset']['arc_datadir'], f'{_}.jpg')
                         for _ in mel_df['image']],
            'labels': mel_df['label'].values
        }
        arc_loader = builder.build_dataloader(cfg, data_info=arc_info, mode='predict')
        cfg['predict']['params']['arc_loader'] = arc_loader
    print('PREDICTING : START')
    print('PREDICT (N={})'.format(len(images)))
    # BUGFIX: default to False. Previously this name was only bound inside
    # the conditional below, raising NameError whenever no checkpoint
    # override was supplied.
    replace_checkpoint_paths = False
    # Replace checkpoints, if necessary
    if 'model_checkpoints' in cfg and cfg['model_checkpoints'] is not None:
        assert len(cfg['model_checkpoints']) == len(cfg['model_configs'])
        assert isinstance(cfg['model_checkpoints'], list)
        replace_checkpoint_paths = True
    model_configs = []
    for cfg_ind, cfgfile in enumerate(cfg['model_configs']):
        with open(cfgfile) as f:
            model_cfg = yaml.load(f, Loader=yaml.FullLoader)
        model_cfg['model']['params']['pretrained'] = None
        if replace_checkpoint_paths:
            model_cfg['test']['checkpoint'] = cfg['model_checkpoints'][cfg_ind]
        model_configs.append(model_cfg)

    def create_model(cfg):
        # Build one model and load (possibly DataParallel-prefixed) weights.
        model = builder.build_model(cfg['model']['name'], cfg['model']['params'])
        print('Loading <{}> model from <{}> ...'.format(
            cfg['model']['name'], cfg['test']['checkpoint']))
        checkpoint = cfg['test']['checkpoint']
        weights = torch.load(checkpoint, map_location=lambda storage, loc: storage)
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.load_state_dict(weights)
        model = model.eval().cuda()
        return model

    models = [create_model(model_cfg) for ind, model_cfg in enumerate(model_configs)]
    predictor = getattr(factory_evaluate, cfg['predict']['predictor'])
    predictor = predictor(loader=loader, **cfg['predict']['params'])
    final_pred = []
    for m in models:
        if 'tta' in cfg and cfg['tta']:
            m.tta = True
        _, y_pred, _ = predictor.predict(m, criterion=None, epoch=None)
        final_pred += [y_pred]
    # NOTE: predictions are saved per-model; averaging is left to the caller.
    save_dir = osp.dirname(cfg['predict']['savefile'])
    if not osp.exists(save_dir):
        os.makedirs(save_dir)
    pickled = {
        'image_id': [_.split('/')[-1] for _ in images],
        'label': final_pred
    }
    with open(cfg['predict']['savefile'], 'wb') as f:
        pickle.dump(pickled, f)
def test(args, cfg, train_df, test_df):
    """Run test-set inference and pickle image ids, y_pred, and y_true.

    For ArcNet/SiameseNet models, a reference loader of melanoma (or
    benign, when cfg['test']['use_benign'] is set) examples is attached
    to the predictor.
    """
    test_df = test_df.drop_duplicates()
    data_info = {
        'imgfiles': [osp.join(cfg['dataset']['data_dir'], f'{_}.jpg')
                     for _ in test_df['image'].values],
        'labels': test_df['label'].values
    }
    loader = builder.build_dataloader(cfg, data_info=data_info, mode='test')

    use_arc = cfg['model']['name'] in ('ArcNet', 'SiameseNet')
    if use_arc:
        # Reference class: benign (0) when requested, melanoma (1) otherwise.
        reference_label = 0 if ('use_benign' in cfg['test']
                                and cfg['test']['use_benign']) else 1
        if 'isic' in train_df.columns:
            reference_df = train_df[(train_df['label'] == reference_label)
                                    & (train_df['isic'] == 2020)]
        else:
            reference_df = train_df[train_df['label'] == reference_label]
        reference_df = reference_df.drop_duplicates()
        arc_data_info = {
            'imgfiles': [osp.join(cfg['dataset']['data_dir'], f'{_}.jpg')
                         for _ in reference_df['image'].values],
            'labels': reference_df['label'].values
        }
        arc_loader = builder.build_dataloader(cfg, data_info=arc_data_info,
                                              mode='predict')
        print(f'{len(arc_loader)} melanoma examples will be used as reference ...')

    print('TESTING : START')
    print('TEST (N={})'.format(len(test_df)))
    print(f'Saving predictions to {cfg["test"]["savefile"]} ...')

    def create_model(cfg):
        # Build the model and load its test checkpoint onto CPU first.
        model = builder.build_model(cfg['model']['name'], cfg['model']['params'])
        print('Loading <{}> model from <{}> ...'.format(
            cfg['model']['name'], cfg['test']['checkpoint']))
        model.load_state_dict(
            torch.load(cfg['test']['checkpoint'],
                       map_location=lambda storage, loc: storage))
        model = model.eval().cuda()
        return model

    model = create_model(cfg)
    if 'params' not in cfg['test'] or not isinstance(cfg['test']['params'], dict):
        cfg['test']['params'] = {}
    predictor = getattr(factory_evaluate, cfg['test']['predictor'])
    predictor = predictor(loader=loader, **cfg['test']['params'])
    if use_arc:
        predictor.arc_loader = arc_loader
    y_true, y_pred, _ = predictor.predict(model, criterion=None, epoch=None)

    save_dir = osp.dirname(cfg['test']['savefile'])
    if not osp.exists(save_dir):
        os.makedirs(save_dir)
    pickled = {
        'image_id': [_.split('/')[-1].split('.')[0]
                     for _ in data_info['imgfiles']],
        'y_pred': y_pred,
        'y_true': y_true
    }
    with open(cfg['test']['savefile'], 'wb') as f:
        pickle.dump(pickled, f)
def setup(args, cfg, train_df, valid_df):
    """Build loaders, model, loss, optimizer, and scheduler for training.

    Supports multi-year ISIC datasets (list of data dirs keyed by year),
    optional metadata features, Arc/Siamese reference loaders, and an
    optional OHEM benign loader. Returns
    (cfg, train_loader, valid_loader, model, optimizer, criterion, scheduler).
    """
    logger = logging.getLogger('root')

    if isinstance(cfg['dataset']['data_dir'], list):
        # One data directory per ISIC year; rows are routed by `isic`.
        year_to_dir = {
            2019: cfg['dataset']['data_dir'][0],
            2020: cfg['dataset']['data_dir'][1]
        }
        if len(cfg['dataset']['data_dir']) == 3:
            year_to_dir[2021] = cfg['dataset']['data_dir'][2]
        train_images = [osp.join(year_to_dir[row.isic], f'{row.image}.jpg')
                        for _, row in train_df.iterrows()]
        valid_images = [osp.join(year_to_dir[row.isic], f'{row.image}.jpg')
                        for _, row in valid_df.iterrows()]
    else:
        train_images = [osp.join(cfg['dataset']['data_dir'], f'{_}.jpg')
                        for _ in train_df['image'].values]
        valid_images = [osp.join(cfg['dataset']['data_dir'], f'{_}.jpg')
                        for _ in valid_df['image'].values]

    train_data_info = {
        'imgfiles': train_images,
        'labels': train_df['label'].values
    }
    valid_data_info = {
        'imgfiles': valid_images,
        'labels': valid_df['label'].values
    }
    if cfg['dataset'].pop('meta', False):
        # Attach per-row metadata features when the dataset requests them.
        train_data_info['meta'] = [
            dict(age=row['age_cat'], sex=row['sex'],
                 ant=row['anatom_site_general_challenge'])
            for _, row in train_df.iterrows()
        ]
        valid_data_info['meta'] = [
            dict(age=row['age_cat'], sex=row['sex'],
                 ant=row['anatom_site_general_challenge'])
            for _, row in valid_df.iterrows()
        ]
    train_loader = builder.build_dataloader(cfg, data_info=train_data_info,
                                            mode='train')
    valid_loader = builder.build_dataloader(cfg, data_info=valid_data_info,
                                            mode='valid')

    use_arc = cfg['model']['name'] in ('ArcNet', 'SiameseNet')
    if use_arc:
        # Reference loader of known melanoma examples.
        if 'isic' in train_df.columns:
            mel_df = train_df[(train_df['label'] == 1)
                              & (train_df['isic'] == 2020)]
        else:
            mel_df = train_df[train_df['label'] == 1]
        mel_df = mel_df.drop_duplicates()
        arc_data_info = {
            'imgfiles': [osp.join(cfg['dataset']['data_dir'], f'{_}.jpg')
                         for _ in mel_df['image'].values],
            'labels': mel_df['label'].values
        }
        arc_loader = builder.build_dataloader(cfg, data_info=arc_data_info,
                                              mode='predict')
        print(f'{len(arc_loader)} melanoma examples will be used as reference ...')

    use_ohem = bool('ohem' in cfg['train']['params']
                    and cfg['train']['params']['ohem'])
    if use_ohem:
        print('Creating benign loader ...')
        benign_df = train_df[train_df['label'] == 0]
        benign_data_info = {
            'imgfiles': [osp.join(cfg['dataset']['data_dir'], f'{_}.jpg')
                         for _ in benign_df['image'].values],
            'labels': benign_df['label'].values
        }
        benign_loader = builder.build_dataloader(cfg,
                                                 data_info=benign_data_info,
                                                 mode='predict')

    # Adjust steps per epoch if necessary (i.e., equal to 0).
    # We assume if gradient accumulation is specified, then the user has
    # already adjusted the steps_per_epoch accordingly in the config file.
    steps_per_epoch = cfg['train']['params']['steps_per_epoch']
    gradient_accmul = cfg['train']['params']['gradient_accumulation']
    if steps_per_epoch == 0:
        cfg['train']['params']['steps_per_epoch'] = len(train_loader)

    # Generic build function will work for model/loss.
    logger.info('Building [{}] architecture ...'.format(cfg['model']['name']))
    if 'backbone' in cfg['model']['params'].keys():
        logger.info(' Using [{}] backbone ...'.format(
            cfg['model']['params']['backbone']))
    if 'pretrained' in cfg['model']['params'].keys():
        logger.info(' Pretrained weights : {}'.format(
            cfg['model']['params']['pretrained']))
    model = builder.build_model(cfg['model']['name'], cfg['model']['params'])
    model = model.train().cuda()

    if cfg['loss']['params'] is None:
        cfg['loss']['params'] = {}
    # OHEM losses anneal over the whole run; they need the total step count.
    if re.search(r'^OHEM', cfg['loss']['name']):
        cfg['loss']['params']['total_steps'] = \
            cfg['train']['params']['num_epochs'] * \
            cfg['train']['params']['steps_per_epoch']
    if cfg['loss']['name'] == 'CrossEntropyLoss':
        weighted = cfg['loss'].pop('weighted', False)
        if weighted:
            # Inverse-frequency class weights, scaled by `weighted`.
            wts = get_invfreq_weights(train_data_info['labels'],
                                      scale=weighted)
            cfg['loss']['params']['weight'] = torch.tensor(wts)
            logger.info('Using the following class weights:')
            for i in range(len(wts)):
                logger.info(f' Class {i} : {wts[i]:.4f}')
    criterion = builder.build_loss(cfg['loss']['name'], cfg['loss']['params'])
    optimizer = builder.build_optimizer(cfg['optimizer']['name'],
                                        model.parameters(),
                                        cfg['optimizer']['params'])
    scheduler = builder.build_scheduler(cfg['scheduler']['name'],
                                        optimizer,
                                        cfg=cfg)

    if len(args.gpu) > 1:
        print(f'DEVICES : {args.gpu}')
        model = nn.DataParallel(model, device_ids=args.gpu)
        if args.gpu[0] != 0:
            model.to(f'cuda:{model.device_ids[0]}')

    if use_arc:
        valid_loader = (valid_loader, arc_loader)
    if use_ohem:
        train_loader = (train_loader, benign_loader)
    return cfg, \
        train_loader, \
        valid_loader, \
        model, \
        optimizer, \
        criterion, \
        scheduler
# Debug/scratch snippet: inspect crop-TTA batches from a predict loader.
df = pd.read_csv('../data/combined_train_cdeotte_meta.csv')
imgfiles = [f'../data/jpeg/train/{i}.jpg' for i in df['image_name']]
# Overridden with a small fixed sample for quick manual inspection.
imgfiles = [
    '../data/jpeg/train/ISIC_0151200.jpg',
    '../data/jpeg/train/ISIC_0227038.jpg',
    '../data/jpeg/train/ISIC_0230209.jpg',
    '../data/jpeg/train/ISIC_0236778.jpg',
    '../data/jpeg/train/ISIC_0280749.jpg',
    '../data/jpeg/train/ISIC_0343061.jpg',
    '../data/jpeg/train/ISIC_0361529.jpg',
    '../data/jpeg/train/ISIC_0384214.jpg',
    '../data/jpeg/train/ISIC_0401250.jpg'
]
labels = [0] * len(imgfiles)
data_info = {'imgfiles': imgfiles, 'labels': labels}
loader = build_dataloader(config, data_info, 'predict')
loader.dataset.preprocessor = None
loader = iter(loader)
data = next(loader)
data[0].shape  # notebook-style inspection; no effect as a plain script
images = data[0].numpy()[0, 1].transpose(1, 2, 0)
images.shape  # notebook-style inspection; no effect as a plain script
cv2.imwrite('/home/ianpan/test_crop_tta.png', images)
# Advance until a batch with more than one sample appears.
while data[0].size(0) == 1:
    data = next(loader)
# Smoke-test snippet: build train/valid loaders for the DFDC video dataset.
from torch.utils.data import DataLoader
from factory.data.datasets import FaceVideoDataset, PartSampler
from factory.builder import build_dataloader

with open('configs/experiments/experiment000.yaml') as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)

cfg['transform']['preprocess'] = None
df = pd.read_csv('../data/dfdc/jph/train_video_with_splits.csv')
# Same file list and labels feed both loaders in this smoke test.
vidfiles = [osp.join(cfg['dataset']['data_dir'], _) for _ in df['vidfile']]
labels = list(df['label'])
train_loader = build_dataloader(
    cfg, data_info={'vidfiles': vidfiles, 'labels': labels}, mode='train')
valid_loader = build_dataloader(
    cfg, data_info={'vidfiles': vidfiles, 'labels': labels}, mode='valid')
train_loader = iter(train_loader)
def predict_ensemble(args, cfg):
    """Average bone-age predictions from several models and pickle them.

    Each model config gets its own dataloader (transforms/batch size may
    differ per model); the final prediction is the unweighted mean.
    """
    df = pd.read_csv(cfg['predict']['csv_filename'])
    # Optional global batch-size override applied to every model.
    BATCH_SIZE = None
    if 'batch_size' in cfg['predict'].keys():
        BATCH_SIZE = cfg['predict']['batch_size']
    model_cfgs = []
    for cfgfile in cfg['model_configs']:
        with open(cfgfile) as f:
            model_cfgs.append(yaml.load(f, Loader=yaml.FullLoader))
    logger = logging.getLogger('root')
    logger.info('PREDICT : START')
    logger.info('PREDICT: n={}'.format(len(df)))
    images = [osp.join(cfg['predict']['data_dir'], _) for _ in df['imgfile']]
    male = list(df['male'].astype('float32'))
    if cfg['predict']['coords']:
        coords = {k: np.asarray(df[k]) for k in ['x1', 'y1', 'x2', 'y2']}
    else:
        coords = None
    loaders = []
    models = []
    for model_cfg in model_cfgs:
        model_cfg = set_inference_batch_size(model_cfg)
        # Fall back to the test section when no predict section exists.
        if 'predict' not in model_cfg.keys():
            model_cfg['predict'] = copy.deepcopy(model_cfg['test'])
        if BATCH_SIZE:
            model_cfg['predict']['batch_size'] = BATCH_SIZE
        loaders.append(
            builder.build_dataloader(model_cfg,
                                     data_info={'imgfiles': images,
                                                'labels': [0] * len(images),
                                                'male': male,
                                                'coords': coords},
                                     mode='predict'))
        model = builder.build_model(model_cfg['model']['name'],
                                    model_cfg['model']['params'])
        model.load_state_dict(
            torch.load(model_cfg['predict']['checkpoint'],
                       map_location=lambda storage, loc: storage))
        model = model.eval().cuda()
        models.append(model)
    for model_cfg in model_cfgs:
        if model_cfg['predict']['params'] is None:
            model_cfg['predict']['params'] = {}
        # Propagate patch-based evaluation settings into predictor params.
        if 'patch' in model_cfg['evaluation']['params'].keys():
            model_cfg['predict']['params']['patch'] = \
                model_cfg['evaluation']['params']['patch']
    predictors = []
    for ind, model_cfg in enumerate(model_cfgs):
        predictor = getattr(factory.evaluate, model_cfg['predict']['predictor'])
        predictor = predictor(loader=loaders[ind],
                              **model_cfg['predict']['params'])
        predictors.append(predictor)
    y_pred_list = []
    for ind, model_cfg in enumerate(model_cfgs):
        _, y_pred, _ = predictors[ind].predict(models[ind], criterion=None,
                                               epoch=None)
        # BUGFIX: 'patch' is only present in params when the evaluation
        # config defines it, so direct indexing could KeyError; use .get().
        if 'percentile' in model_cfg['predict'].keys() and \
                model_cfg['predict']['params'].get('patch'):
            y_pred = np.percentile(y_pred,
                                   model_cfg['predict']['percentile'],
                                   axis=1)
        y_pred_list.append(y_pred)
    # Simple unweighted mean across models.
    y_pred = np.mean(np.asarray(y_pred_list), axis=0)
    if not osp.exists(cfg['predict']['save_preds_dir']):
        os.makedirs(cfg['predict']['save_preds_dir'])
    with open(osp.join(cfg['predict']['save_preds_dir'], 'predictions.pkl'),
              'wb') as f:
        pickle.dump({
            'y_pred': y_pred,
            'imgfiles': [im.split('/')[-1] for im in images]
        }, f)
def setup(args, cfg, train_df, valid_df):
    """Build loaders, model, loss, optimizer, and scheduler for bone-age
    training. Returns
    (cfg, train_loader, valid_loader, model, optimizer, criterion, scheduler).
    """
    logger = logging.getLogger('root')

    data_dir = cfg['dataset']['data_dir']
    train_images = [osp.join(data_dir, _) for _ in train_df['imgfile']]
    valid_images = [osp.join(data_dir, _) for _ in valid_df['imgfile']]
    train_labels = list(train_df['boneage'])
    valid_labels = list(valid_df['boneage'])
    train_male = list(train_df['male'].astype('float32'))
    valid_male = list(valid_df['male'].astype('float32'))
    # Optional hand-crop bounding boxes.
    if cfg['dataset']['coords']:
        train_coords = {k: np.asarray(train_df[k])
                        for k in ['x1', 'y1', 'x2', 'y2']}
        valid_coords = {k: np.asarray(valid_df[k])
                        for k in ['x1', 'y1', 'x2', 'y2']}
    else:
        train_coords = None
        valid_coords = None
    train_loader = builder.build_dataloader(
        cfg,
        data_info={'imgfiles': train_images,
                   'labels': train_labels,
                   'male': train_male,
                   'coords': train_coords},
        mode='train')
    valid_loader = builder.build_dataloader(
        cfg,
        data_info={'imgfiles': valid_images,
                   'labels': valid_labels,
                   'male': valid_male,
                   'coords': valid_coords},
        mode='valid')

    # Adjust steps per epoch if necessary (i.e., equal to 0).
    # We assume if gradient accumulation is specified, then the user has
    # already adjusted the steps_per_epoch accordingly in the config file.
    steps_per_epoch = cfg['train']['params']['steps_per_epoch']
    gradient_accmul = cfg['train']['params']['gradient_accumulation']
    if steps_per_epoch == 0:
        cfg['train']['params']['steps_per_epoch'] = len(train_loader)

    # Generic build function will work for model/loss.
    logger.info('Building [{}] architecture ...'.format(cfg['model']['name']))
    logger.info(' Using [{}] backbone ...'.format(
        cfg['model']['params']['backbone']))
    logger.info(' Pretrained weights : {}'.format(
        cfg['model']['params']['pretrained']))
    model = builder.build_model(cfg['model']['name'], cfg['model']['params'])
    model = model.train().cuda()

    if cfg['loss']['name'] == 'BalancedHybridLoss':
        # Inverse-frequency weights over 16 bone-age strata:
        # [0,24] months, then yearly bins up to 228 months.
        strata_weights = pd.cut(
            train_df['boneage'],
            bins=[0, 24] + list(np.arange(12 * 3, 12 * 17, 12)) + [228],
            labels=range(16))
        strata_weights = pd.DataFrame(strata_weights.value_counts()) \
            .reset_index().sort_values('index', ascending=True)
        strata_weights = strata_weights['boneage'].max() / strata_weights['boneage']
        strata_weights = np.asarray(strata_weights)
        cfg['loss']['params']['strata_weights'] = strata_weights
    criterion = builder.build_loss(cfg['loss']['name'], cfg['loss']['params'])
    optimizer = builder.build_optimizer(cfg['optimizer']['name'],
                                        model.parameters(),
                                        cfg['optimizer']['params'])
    scheduler = builder.build_scheduler(cfg['scheduler']['name'],
                                        optimizer,
                                        cfg=cfg)
    return cfg, \
        train_loader, \
        valid_loader, \
        model, \
        optimizer, \
        criterion, \
        scheduler