def __init__(self, csv_path: Path, split: str, labeled=True, rand_number=543) -> None:
    """Build a CheXpert dataset view over an annotation CSV.

    Args:
        csv_path: path to the annotation CSV; images are resolved relative
            to its parent directory.
        split: dataset split name ("train" enables the labeled/unlabeled
            slicing below; other values use the CSV as-is).
        labeled: for the train split, True keeps the first LABELED_SIZE
            shuffled rows, False keeps the following UNLABELED_SIZE rows.
        rand_number: seed forwarded to set_seed for reproducible shuffling.
    """
    super(ChexpertDataset, self).__init__()
    set_seed(rand_number)
    self.data_path = Path(csv_path).parent
    self.annotations = pd.read_csv(csv_path).fillna(0)
    self.train_annotations = None
    self.split = split
    self.transforms = None
    self.height, self.width = 224, 224
    # Unlabeled data always gets the deterministic "val" transform pipeline.
    transform_split = split if labeled else "val"
    self.transforms = get_transforms(self.height, self.width, split=transform_split)
    if split != "train":
        return
    assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must be smaller than train size."
    # Shuffle once, then carve labeled / unlabeled rows out of the same frame.
    shuffled = self.annotations.sample(frac=1).reset_index(drop=True)
    if labeled:
        self.annotations = shuffled[:cfg.DATA.LABELED_SIZE]
    else:
        start = cfg.DATA.LABELED_SIZE
        stop = start + cfg.DATA.UNLABELED_SIZE
        self.annotations = shuffled[start:stop].reset_index(drop=True)
def __init__(self, csv_path: Path, split: str) -> None:
    """CheXpert dataset with a class-balanced labeled training subset.

    For the train split, samples LABELED_SIZE/2 "normal" rows (all five
    pathology columns equal 0 after fillna) and LABELED_SIZE/2 "abnormal"
    rows (any pathology column non-zero, which includes the -1 "uncertain"
    label) into `train_annotations`; the sampled rows are removed from
    `annotations`, leaving the remainder available as unlabeled data.

    Args:
        csv_path: path to the annotation CSV; images are resolved relative
            to its parent directory.
        split: dataset split name; balancing only applies to "train".
    """
    super(ChexpertDataset, self).__init__()
    self.data_path = Path(csv_path).parent
    self.annotations = pd.read_csv(csv_path).fillna(0)
    self.split = split
    self.transforms = None
    self.height, self.width = 64, 64  # 224, 224
    self.transforms = get_transforms(self.height, self.width, split)
    if split == "train":
        assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must be smaller than train size."
        labeled_size = cfg.DATA.LABELED_SIZE
        pathologies = ['Atelectasis', 'Cardiomegaly', 'Consolidation',
                       'Edema', 'Pleural Effusion']
        # One boolean mask replaces the five chained &/| comparisons;
        # abnormal is exactly the complement of normal.
        is_normal = (self.annotations[pathologies] == 0).all(axis=1)
        self.normal = self.annotations[is_normal]
        self.abnormal = self.annotations[~is_normal]
        half = int(labeled_size / 2)
        self.normal = self.normal.sample(n=half)
        self.abnormal = self.abnormal.sample(n=half)
        # Remove the sampled (labeled) rows from the pool of annotations.
        self.annotations = self.annotations.drop(self.normal.index).drop(
            self.abnormal.index)
        # BUGFIX: DataFrame.append was removed in pandas 2.0; pd.concat is
        # the supported equivalent of append(..., ignore_index=True).
        self.train_annotations = pd.concat([self.normal, self.abnormal],
                                           ignore_index=True)
def __init__(self, config, phase='train'):
    """Load the per-phase annotation CSV and build the transform pipeline.

    Args:
        config: configuration object exposing `dataset[phase].csv_path`
            plus whatever get_transforms / get_augmentations read from it.
        phase: phase name; any phase containing 'train' also gets
            stochastic augmentations, all others get none.
    """
    self.df = pd.read_csv(config.dataset[phase].csv_path)
    self.phase = phase
    self.transforms = get_transforms(config)
    # Stochastic augmentations only make sense during training.
    if 'train' in phase:
        self.augmentations = get_augmentations(config)
    else:
        self.augmentations = None
    self.config = config
def __init__(self, csv_path: Path, shuffled_annotations: pd.DataFrame) -> None:
    """Unlabeled CheXpert split built from leftover shuffled annotations.

    Args:
        csv_path: CSV path whose parent directory holds the image files.
        shuffled_annotations: pre-shuffled rows that were NOT consumed by
            the labeled dataset; the first UNLABELED_SIZE of them become
            this dataset's annotations.
    """
    super(ChexpertDatasetUnlabeled, self).__init__()
    self.data_path = Path(csv_path).parent
    n_unlabeled = cfg.DATA.UNLABELED_SIZE
    # head(n) == [:n] for a non-negative row count; reindex from zero.
    self.annotations = shuffled_annotations.head(n_unlabeled).reset_index(drop=True)
    self.height, self.width = 64, 64  # 224, 224
    # Unlabeled images still use the training-time transform pipeline.
    self.transforms = get_transforms(self.height, self.width, 'train')
def __init__(self, csv_path: Path, shuffled_annotations: pd.DataFrame, model) -> None:
    """Unlabeled CheXpert split that assigns pseudo-structure via a model.

    Args:
        csv_path: CSV path whose parent directory holds the image files.
        shuffled_annotations: pre-shuffled annotation rows; the first
            LABELED_SIZE become the labeled reference set, the next
            UNLABELED_SIZE become this dataset's annotations.
        model: model handed to assign_nearest (defined on the class).
    """
    super(ChexpertDatasetUnlabeled, self).__init__()
    self.data_path = Path(csv_path).parent
    n_labeled = cfg.DATA.LABELED_SIZE
    n_unlabeled = cfg.DATA.UNLABELED_SIZE
    self.labeled = shuffled_annotations[:n_labeled]
    unlabeled_rows = shuffled_annotations[n_labeled:n_labeled + n_unlabeled]
    self.annotations = unlabeled_rows.reset_index(drop=True)
    self.S = []
    self.height, self.width = 224, 224
    # Unlabeled images still use the training-time transform pipeline.
    self.transforms = get_transforms(self.height, self.width, 'train')
    self.model = model
    # NOTE(review): presumably populates self.S by matching unlabeled rows
    # to labeled ones with self.model — confirm in assign_nearest itself.
    self.assign_nearest()
def __init__(self, csv_path: Path, split: str) -> None:
    """CheXpert dataset; for training, keep a labeled subset of shuffled rows.

    Args:
        csv_path: path to the annotation CSV; images are resolved relative
            to its parent directory.
        split: dataset split name; only "train" shuffles and carves out
            `train_annotations` (the first LABELED_SIZE shuffled rows).
    """
    super(ChexpertDataset, self).__init__()
    self.data_path = Path(csv_path).parent
    self.annotations = pd.read_csv(csv_path).fillna(0)
    self.train_annotations = None
    self.split = split
    self.transforms = None
    self.height, self.width = 64, 64  # 224, 224
    self.transforms = get_transforms(self.height, self.width, split)
    if split != "train":
        return
    assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must be smaller than train size."
    # Shuffle the full frame; keep it all, but expose the head as labeled.
    shuffled = self.annotations.sample(frac=1).reset_index(drop=True)
    self.annotations = shuffled
    self.train_annotations = shuffled[:cfg.DATA.LABELED_SIZE]
def get_data_loader(configs):
    """Build train/val/test dataloaders over an ImageFolder directory layout.

    Args:
        configs: object exposing data_path, batch_size and num_workers.

    Returns:
        (train_loader, determin_train_loader, val_loader, test_loaders):
        `determin_train_loader` iterates the training images with the
        deterministic 'val' transform and no shuffling; `test_loaders`
        maps 'test0'..'test9' to loaders over the same test directory,
        each with its own 'testN' transform.
    """
    data_transforms = get_transforms(resize_size=256, crop_size=224)

    def _folder(subdir, transform_key):
        # ImageFolder over <data_path>/<subdir> with the named transform.
        return datasets.ImageFolder(os.path.join(configs.data_path, subdir),
                                    transform=data_transforms[transform_key])

    def _loader(dataset, shuffle):
        # Shared DataLoader settings for the train/val style loaders.
        return DataLoader(dataset,
                          batch_size=configs.batch_size,
                          shuffle=shuffle,
                          num_workers=configs.num_workers,
                          pin_memory=True)

    train_loader = _loader(_folder('train', 'train'), True)
    determin_train_loader = _loader(_folder('train', 'val'), False)
    val_loader = _loader(_folder('val', 'val'), False)
    # Test loaders use a small fixed batch size and no pinned memory.
    test_loaders = {
        f'test{i}': DataLoader(_folder('test', f'test{i}'),
                               batch_size=4,
                               shuffle=False,
                               num_workers=configs.num_workers)
        for i in range(10)
    }
    return train_loader, determin_train_loader, val_loader, test_loaders
def __init__(self, csv_path: Path, unlabeled_pseudo: str, split="train",
             imb_type="exp", imb_factor=0.01, unlabel_imb_factor=1,
             rand_number=543):
    """Semi-supervised, class-imbalanced CheXpert dataset.

    Args:
        csv_path: path to the annotation CSV; images are resolved relative
            to its parent directory.
        unlabeled_pseudo: pseudo-labels produced by a model trained on the
            imbalanced labeled data.
        split: dataset split; the imbalanced labeled/unlabeled construction
            only runs for "train".
        imb_type: imbalance profile passed to get_img_num_per_cls
            (e.g. "exp").
        imb_factor: labeled-set imbalance ratio.
        unlabel_imb_factor: unlabeled-set imbalance ratio.
        rand_number: seed forwarded to set_seed for reproducibility.
    """
    super(SemiSupervisedImbalanceChexpert, self).__init__()
    # unlabeled
    set_seed(rand_number)
    self.data_path = Path(csv_path).parent
    self.annotations = pd.read_csv(csv_path).fillna(0)
    self.split = split
    self.height, self.width = 224, 224
    self.transforms = get_transforms(self.height, self.width, split)
    self.assign_labels()
    if split == "train":
        assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must be smaller than train size."
        # BUGFIX: both subsets must come from the SAME shuffled frame.
        # Previously the labeled rows were drawn from a shuffled copy while
        # the unlabeled slice was taken from the unshuffled annotations, so
        # the two sets could overlap.
        shuffled = self.annotations.sample(frac=1).reset_index(drop=True)
        self.labeled_annotations = shuffled[:cfg.DATA.LABELED_SIZE]
        self.unlabeled_annotations = shuffled[
            cfg.DATA.LABELED_SIZE:cfg.DATA.LABELED_SIZE +
            cfg.DATA.UNLABELED_SIZE].reset_index(drop=True)
        self.cls_num = 32
        self.unlabel_size_factor = 5
        self.unlabeled_pseudo = unlabeled_pseudo  # pseudo-labels using model trained on imbalanced data
        self.imb_factor = imb_factor
        self.unlabel_imb_factor = unlabel_imb_factor
        self.num_per_cls_dict = dict()
        img_num_list = self.get_img_num_per_cls(self.cls_num, imb_type,
                                                imb_factor)
        img_num_list_unlabeled = self.get_img_num_per_cls_unlabeled(
            self.cls_num, img_num_list, unlabel_imb_factor)
        # NOTE(review): this bookkeeping is kept inside the train branch
        # because gen_imbalanced_data consumes the train-only splits built
        # above — confirm against the original (unmangled) indentation.
        self.gen_imbalanced_data(img_num_list, img_num_list_unlabeled)
def __init__(self, csv_path: Path, split: str, imb_type="exp",
             imb_factor=0.01, rand_number=543):
    """Class-imbalanced CheXpert dataset.

    Args:
        csv_path: path to the annotation CSV; images are resolved relative
            to its parent directory.
        split: dataset split; imbalanced construction only runs for "train".
        imb_type: imbalance profile passed to get_img_num_per_cls.
        imb_factor: imbalance ratio for the labeled set.
        rand_number: seed forwarded to set_seed for reproducibility.
    """
    super(ImbalanceChexpert, self).__init__()
    set_seed(rand_number)
    self.data_path = Path(csv_path).parent
    self.annotations = pd.read_csv(csv_path).fillna(0)
    self.split = split
    self.height, self.width = 224, 224
    self.transforms = get_transforms(self.height, self.width, split)
    self.assign_labels()
    if split != "train":
        return
    assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must be smaller than train size."
    # The first LABELED_SIZE shuffled rows act as the "labelled" pool.
    shuffled = self.annotations.sample(frac=1).reset_index(drop=True)
    self.annotations = shuffled[:cfg.DATA.LABELED_SIZE]
    counts_per_class = self.get_img_num_per_cls(32, imb_type, imb_factor)
    self.gen_imbalanced_data(counts_per_class)
# target_column has unique values in set -1, 0, 1 # -1 corresponds to the unlabeled data df = pd.read_csv(args.csv) labeled = df[df[args.target_column] > -1] if args.ssl: print("Semi-supervised learning model is on...") unlabeled = df[df[args.target_column] == -1] # weights to initialize bias of FC layer of classifier weight = labeled.groupby(args.target_column).count()["path"] / labeled.shape[0] weight = torch.Tensor(weight.values).log() train_labeled, test_labeled = train_test_split(labeled, test_size=args.test_size, stratify=labeled[args.target_column], random_state=args.random_state) train_transform, valid_transform = get_transforms(img_size=args.image_size) train_labeled_loader, valid_labeled_loader = get_loaders( train_labeled, test_labeled, train_transform, valid_transform, target_column=args.target_column, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True ) if args.ssl: dataset_unlabeled = ImageDataset(unlabeled, train_transform, target_column=None) loss = LabelSmoothingLoss(num_classes=2, smoothing=0.2, weight=None)
# (fragment) continuation of a parser.add_argument("--check_point", ...) call
# whose opening is outside this chunk.
                    type=str,
                    default=None,
                    help="position of check_point")
parser.add_argument("--dataset",
                    type=str,
                    default="data/custom/test.txt",
                    help="position of dataset")
parser.add_argument(
    "--n_cpu",
    type=int,
    default=8,
    help="number of cpu threads to use during batch generation")
parser.add_argument("--img_size",
                    type=int,
                    default=224,
                    help="size of each image dimension")
opt = parser.parse_args()
print(opt)
# load weights
model = torch.load(opt.check_point)
data_transforms = get_transforms()
# load data
dataset = ListDataSet("test", opt.dataset, data_transforms["test"])
# NOTE(review): opt.batch_size is read here but no --batch_size argument is
# visible in this chunk — presumably defined before the fragment; verify.
testdata = DataLoader(dataset,
                      batch_size=opt.batch_size,
                      shuffle=False,
                      num_workers=opt.n_cpu)
print(dataset.__len__())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
evaluation(model, device, testdata)
def __init__(self, cfg):
    """Wire up datasets, network, transforms and losses from `cfg`.

    Builds train/valid datasets for the configured semantic-segmentation
    dataset, instantiates the network, attaches per-split transform
    pipelines, and prepares the cross-entropy (and optional gated-CRF)
    losses. Relies on cfg.lr_scheduler == 'poly' (asserted below).
    """
    super(ModelTaskLightning, self).__init__()
    assert cfg.lr_scheduler == 'poly', 'This implementation relies on an unconventional usage of _LRScheduler class'
    self.cfg = cfg
    dataset_class = resolve_dataset_class(cfg.dataset)
    self.dataset_train = dataset_class(
        cfg.datasets_dir, SPLIT_TRAIN, download=cfg.dataset_download,
        integrity_check=False
    )
    self.dataset_valid = dataset_class(
        cfg.datasets_dir, SPLIT_VALID, download=cfg.dataset_download,
        integrity_check=False
    )
    print('Number of samples in training split:', len(self.dataset_train))
    print('Number of samples in validation split:', len(self.dataset_valid))
    # Class metadata comes from the training split.
    self.semseg_num_classes = self.dataset_train.num_classes
    self.semseg_ignore_class = self.dataset_train.ignore_label
    self.semseg_class_names = self.dataset_train.semseg_class_names
    model_class = resolve_network_model(cfg.model_name)
    self.net = model_class(cfg, self.semseg_num_classes)
    # Training transforms: geometric augmentation + random crop +
    # RGB normalization using the training split's statistics.
    self.transforms_train = get_transforms(
        semseg_ignore_class=self.semseg_ignore_class,
        geom_scale_min=cfg.aug_geom_scale_min,
        geom_scale_max=cfg.aug_geom_scale_max,
        geom_tilt_max_deg=cfg.aug_geom_tilt_max_deg,
        geom_wiggle_max_ratio=cfg.aug_geom_wiggle_max_ratio,
        geom_reflect=cfg.aug_geom_reflect,
        crop_random=cfg.aug_input_crop_size,
        rgb_zero_mean_status=True,
        rgb_mean=self.dataset_train.rgb_mean,
        rgb_stddev=self.dataset_train.rgb_stddev,
        stroke_width=cfg.aug_semseg_weak_stroke_width,
    )
    # Validation transforms: either a fixed center crop, or a crop that
    # makes the input divisible by the network's bottleneck stride —
    # exactly one of the two is active depending on the config flag.
    self.transforms_valid = get_transforms(
        semseg_ignore_class=self.semseg_ignore_class,
        crop_for_passable=self.net.bottleneck_stride
        if not cfg.aug_geom_validation_center_crop_sts else 0,
        crop_center=cfg.aug_geom_validation_center_crop_size
        if cfg.aug_geom_validation_center_crop_sts else 0,
        rgb_zero_mean_status=True,
        rgb_mean=self.dataset_train.rgb_mean,
        rgb_stddev=self.dataset_train.rgb_stddev,
        stroke_width=cfg.aug_semseg_weak_stroke_width,
    )
    self.dataset_train.set_transforms(self.transforms_train)
    self.dataset_valid.set_transforms(self.transforms_valid)
    # Optional class weighting of the CE loss from the class histogram.
    class_weights = None
    if cfg.loss_cross_entropy_class_weights_sts:
        class_weights = class_weights_from_histogram(
            self.dataset_train.semseg_class_histogram)
    self.loss_ce = torch.nn.CrossEntropyLoss(
        class_weights, ignore_index=self.semseg_ignore_class)
    self.loss_gatedcrf = None
    if cfg.loss_gatedcrf_sts:
        self.loss_gatedcrf = ModelLossSemsegGatedCRF()
    # Populated later; 'poly' LR schedule state (see assert above).
    self.poly_lr_sched = None