Example #1
    def __init__(
        self,
        image_filenames: List[str],
        mask_filenames: Optional[List[str]],
        transform: A.Compose,
        image_loader=read_inria_image,
        mask_loader=read_inria_mask,
        need_weight_mask=False,
        image_ids=None,
        make_mask_target_fn: Callable = mask_to_bce_target,
    ):
        if mask_filenames is not None and len(image_filenames) != len(
                mask_filenames):
            raise ValueError(
                "Number of images does not corresponds to number of targets")

        self.image_ids = [
            fs.id_from_fname(fname) for fname in image_filenames
        ] if image_ids is None else image_ids
        self.need_weight_mask = need_weight_mask

        self.images = image_filenames
        self.masks = mask_filenames
        self.get_image = image_loader
        self.get_mask = mask_loader

        self.transform = transform
        self.make_mask_target_fn = make_mask_target_fn
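Every example in this file relies on fs.id_from_fname from pytorch_toolbelt to turn a file path into a sample id. As a minimal sketch, assuming the helper simply returns the filename stem, it behaves roughly like this:

# Rough sketch of the assumed behavior of fs.id_from_fname (filename stem).
import os

def id_from_fname_sketch(fname: str) -> str:
    # e.g. "data/train/images/austin1.tif" -> "austin1"
    return os.path.splitext(os.path.basename(fname))[0]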
Example #2
    def __init__(self,
                 image_filenames,
                 target_filenames,
                 image_loader,
                 target_loader,
                 transform=None,
                 keep_in_mem=False):
        if len(image_filenames) != len(target_filenames):
            raise ValueError(
                'Number of images does not correspond to the number of targets')

        self.image_ids = [id_from_fname(fname) for fname in image_filenames]

        if keep_in_mem:
            self.images = [image_loader(fname) for fname in image_filenames]
            self.masks = [target_loader(fname) for fname in target_filenames]
            self.get_image = lambda x: x
            self.get_loader = lambda x: x
        else:
            self.images = image_filenames
            self.masks = target_filenames
            self.get_image = image_loader
            self.get_loader = target_loader

        self.transform = transform
Example #3
def test_pseudolabeling_aptos2015_round1(predictions, output_csv):
    print('Saving pseudolabels to ', output_csv)
    num_models = len(predictions)
    ids, x, y_true, y_average = prepare_inference_datasets(
        predictions, use_features=False, use_predictions=True)

    for i in range(num_models):
        print(
            fs.id_from_fname(predictions[i]),
            cohen_kappa_score(y_true,
                              regression_to_class(x[:, i]),
                              weights='quadratic'))

    y_round = to_numpy(regression_to_class(x))
    y_major = majority_voting(y_round, axis=1)

    y_agreement = y_round == np.expand_dims(y_major, -1)

    # y_agreement_all = np.all(y_agreement, axis=1)
    # y_agreement_all = np.sum(y_agreement, axis=1) >= 16
    y_agreement_all = y_major == y_true

    print('Agreement', np.mean(y_agreement_all))
    print('Distribution', np.bincount(y_major[y_agreement_all]))

    y_true[~y_agreement_all] = -100
    print(y_round)
    df = pd.DataFrame.from_dict({'id_code': ids, 'diagnosis': y_true})
    df.to_csv(output_csv, index=None)
Example #4
def extract_and_save_dct_jpegio(fname, output_dir):
    # dct_y, dct_cr, dct_cb = compute_dct_fast(fname)

    image_id = fs.id_from_fname(fname) + ".npz"
    method = os.path.split(os.path.split(fname)[0])[1]
    dct_fname = os.path.join(output_dir, method, image_id)

    jpegStruct = jpio.read(fname)
    dct_matrix = jpegStruct.coef_arrays
    quant_tables = jpegStruct.quant_tables
    # ci0 = jpegStruct.comp_info[0]
    # ci1 = jpegStruct.comp_info[1]
    # ci2 = jpegStruct.comp_info[2]

    qm0 = np.tile(quant_tables[0], (512 // 8, 512 // 8))
    qm1 = np.tile(quant_tables[1], (512 // 8, 512 // 8))
    np.savez_compressed(
        dct_fname,
        dct_y=(dct_matrix[0] * qm0).astype(np.int16),
        dct_cb=(dct_matrix[1] * qm1).astype(np.int16),
        dct_cr=(dct_matrix[2] * qm1).astype(np.int16),
        qm0=quant_tables[0].astype(np.int16),
        qm1=quant_tables[1].astype(np.int16),
    )

    del jpegStruct
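For reference, a hypothetical read-back of the archive written above; the key names mirror the np.savez_compressed call, while the path is only a placeholder:

# Hypothetical read-back of one saved archive (placeholder path).
data = np.load("output_dir/JMiPOD/00001.npz")
dct_y, dct_cb, dct_cr = data["dct_y"], data["dct_cb"], data["dct_cr"]
qm0, qm1 = data["qm0"], data["qm1"]  # 8x8 quantization tables, int16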
Example #5
def cut_dataset_in_patches(data_dir, tile_size, tile_step, image_margin):

    train_data = []
    valid_data = []

    # For validation, we remove the first five images of every location (e.g., austin{1-5}.tif, chicago{1-5}.tif) from the training set.
    # This is the validation strategy suggested by the competition host.
    for loc in TRAIN_LOCATIONS:
        for i in range(1, 6):
            valid_data.append(f"{loc}{i}")
        for i in range(6, 37):
            train_data.append(f"{loc}{i}")

    train_imgs = [os.path.join(data_dir, "train", "images", f"{fname}.tif") for fname in train_data]
    valid_imgs = [os.path.join(data_dir, "train", "images", f"{fname}.tif") for fname in valid_data]

    train_masks = [os.path.join(data_dir, "train", "gt", f"{fname}.tif") for fname in train_data]
    valid_masks = [os.path.join(data_dir, "train", "gt", f"{fname}.tif") for fname in valid_data]

    images_dir = os.path.join(data_dir, "train_tiles", "images")
    masks_dir = os.path.join(data_dir, "train_tiles", "gt")

    df = defaultdict(list)

    for train_img in tqdm(train_imgs, total=len(train_imgs), desc="train_imgs"):
        img_tiles = split_image(train_img, images_dir, tile_size, tile_step, image_margin)
        df["image"].extend(img_tiles)
        df["train"].extend([1] * len(img_tiles))
        df["image_id"].extend([fs.id_from_fname(train_img)] * len(img_tiles))

    for train_msk in tqdm(train_masks, total=len(train_masks), desc="train_masks"):
        msk_tiles = split_image(train_msk, masks_dir, tile_size, tile_step, image_margin)
        df["mask"].extend(msk_tiles)
        df["has_buildings"].extend([read_inria_mask(x).any() for x in msk_tiles])

    for valid_img in tqdm(valid_imgs, total=len(valid_imgs), desc="valid_imgs"):
        # Validation tiles are cut without overlap (tile_step == tile_size)
        img_tiles = split_image(valid_img, images_dir, tile_size, tile_size, image_margin)
        df["image"].extend(img_tiles)
        df["train"].extend([0] * len(img_tiles))
        df["image_id"].extend([fs.id_from_fname(valid_img)] * len(img_tiles))

    for valid_msk in tqdm(valid_masks, total=len(valid_masks), desc="valid_masks"):
        msk_tiles = split_image(valid_msk, masks_dir, tile_size, tile_size, image_margin)
        df["mask"].extend(msk_tiles)
        df["has_buildings"].extend([read_inria_mask(x).any() for x in msk_tiles])

    return pd.DataFrame.from_dict(df)
Example #6
def preprocess(image_fname, output_dir, image_size=768):
    image = cv2.imread(image_fname)
    image = crop_black(image, tolerance=5)
    image = longest_max_size(image, max_size=image_size, interpolation=cv2.INTER_CUBIC)

    image_id = fs.id_from_fname(image_fname)
    dst_fname = os.path.join(output_dir, image_id + '.png')
    cv2.imwrite(dst_fname, image)
    return
Example #7
def sanitize_fname(x):
    x = fs.id_from_fname(x)
    x = (
        x.replace("fp16", "")
        .replace("fold", "f")
        .replace("local_rank_0", "")
        .replace("nr_rgb_tf_efficientnet_b6_ns", "")
        .replace("rgb_tf_efficientnet_b2_ns", "")
        .replace("rgb_tf_efficientnet_b3_ns", "")
        .replace("rgb_tf_efficientnet_b6_ns", "")
        .replace("rgb_tf_efficientnet_b7_ns", "")
    )
    x = re.sub(r"\w{3}\d{2}_\d{2}_\d{2}", "", x).replace("_", "")
    return x
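As a quick sanity check, here is a hypothetical call on one of the experiment names listed further below, assuming fs.id_from_fname returns the bare filename stem:

# Hypothetical check; the experiment name is taken from the lists below.
print(sanitize_fname(
    "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16.csv"))
# -> "Gf0": the date stamp, backbone name and underscores are stripped,
#    and "fold0" is shortened to "f0".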
Example #8
def test_evaluate_model(predictions):
    num_models = len(predictions)
    ids, x, y_true, y_average = prepare_inference_datasets(
        predictions, use_features=False, use_predictions=True)

    for i in range(num_models):
        print(
            fs.id_from_fname(predictions[i]),
            cohen_kappa_score(y_true,
                              regression_to_class(x[:, i]),
                              weights='quadratic'))
Example #9
    def __getitem__(self, i):
        # read data
        image = fs.read_rgb_image(self.images_fps[i])
        mask = fs.read_image_as_is(self.masks_fps[i])
        assert mask.max() < len(CLASSES)

        # apply augmentations
        sample = self.transform(image=image, mask=mask)
        image, mask = sample['image'], sample['mask']

        return {
            "image_id": id_from_fname(self.images_fps[i]),
            "features": tensor_from_rgb_image(image),
            "targets": torch.from_numpy(mask).long()
        }
Example #10
def split_image(image_fname, output_dir, tile_size, tile_step, image_margin):
    os.makedirs(output_dir, exist_ok=True)
    image = read_image_as_is(image_fname)
    image_id = id_from_fname(image_fname)

    slicer = ImageSlicer(image.shape, tile_size, tile_step, image_margin)
    tiles = slicer.split(image)

    fnames = []
    for i, tile in enumerate(tiles):
        output_fname = os.path.join(output_dir, f"{image_id}_tile_{i}.png")
        cv2.imwrite(output_fname, tile)
        fnames.append(output_fname)

    return fnames
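A hypothetical invocation of split_image with placeholder Inria-style paths; the returned tile filenames keep the source image id, so id_from_fname can still map every tile back to its parent image:

# Hypothetical usage with placeholder paths and tile settings.
tiles = split_image(
    image_fname="data/train/images/austin1.tif",
    output_dir="data/train_tiles/images",
    tile_size=512,
    tile_step=256,
    image_margin=0,
)
# tiles[0] ends with "austin1_tile_0.png", tiles[1] with "austin1_tile_1.png", ...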
Example #11
def cut_test_dataset_in_patches(data_dir, tile_size, tile_step, image_margin):
    train_imgs = fs.find_images_in_dir(os.path.join(data_dir, "test",
                                                    "images"))

    images_dir = os.path.join(data_dir, "test_tiles", "images")

    df = defaultdict(list)

    for train_img in tqdm(train_imgs, total=len(train_imgs), desc="test_imgs"):
        img_tiles = split_image(train_img, images_dir, tile_size, tile_step,
                                image_margin)
        df["image"].extend(img_tiles)
        df["image_id"].extend([fs.id_from_fname(train_img)] * len(img_tiles))

    return pd.DataFrame.from_dict(df)
Example #12
    def __init__(
        self,
        image_fname: str,
        mask_fname: str,
        image_loader: Callable,
        target_loader: Callable,
        tile_size,
        tile_step,
        image_margin=0,
        transform=None,
        target_shape=None,
        keep_in_mem=False,
    ):
        self.image_fname = image_fname
        self.mask_fname = mask_fname
        self.image_loader = image_loader
        self.mask_loader = target_loader
        self.image = None
        self.mask = None

        if target_shape is None or keep_in_mem:
            image = image_loader(image_fname)
            mask = target_loader(mask_fname)
            if image.shape[0] != mask.shape[0] or image.shape[1] != mask.shape[1]:
                raise ValueError(
                    f"Image size {image.shape} and mask shape {mask.shape} must have equal width and height"
                )

            target_shape = image.shape

        self.slicer = ImageSlicer(target_shape, tile_size, tile_step,
                                  image_margin)

        if keep_in_mem:
            self.images = self.slicer.split(image)
            self.masks = self.slicer.split(mask)
        else:
            self.images = None
            self.masks = None

        self.transform = transform
        self.image_ids = [
            id_from_fname(image_fname) +
            f" [{crop[0]};{crop[1]};{crop[2]};{crop[3]};]"
            for crop in self.slicer.crops
        ]
Example #13
    def __init__(
        self,
        image_fname: str,
        mask_fname: str,
        image_loader: Callable,
        target_loader: Callable,
        tile_size,
        tile_step,
        image_margin=0,
        transform=None,
        target_shape=None,
        need_weight_mask=False,
        keep_in_mem=False,
        make_mask_target_fn: Callable = mask_to_bce_target,
    ):
        self.image_fname = image_fname
        self.mask_fname = mask_fname
        self.image_loader = image_loader
        self.mask_loader = target_loader
        self.image = None
        self.mask = None
        self.need_weight_mask = need_weight_mask

        if target_shape is None or keep_in_mem:
            image = image_loader(image_fname)
            mask = target_loader(mask_fname)
            if image.shape[0] != mask.shape[0] or image.shape[1] != mask.shape[1]:
                raise ValueError(
                    f"Image size {image.shape} and mask shape {mask.shape} must have equal width and height"
                )

            target_shape = image.shape

        self.slicer = ImageSlicer(target_shape, tile_size, tile_step,
                                  image_margin)

        self.transform = transform
        self.image_ids = [fs.id_from_fname(image_fname)] * len(
            self.slicer.crops)
        self.crop_coords_str = [
            f"[{crop[0]};{crop[1]};{crop[2]};{crop[3]};]"
            for crop in self.slicer.crops
        ]
        self.make_mask_target_fn = make_mask_target_fn
Example #14
def get_pseudolabeling_dataset(data_dir: str,
                               include_masks: bool,
                               image_size=(224, 224),
                               augmentation=None,
                               need_weight_mask=False):
    images = fs.find_images_in_dir(
        os.path.join(data_dir, "test_tiles", "images"))

    masks_dir = os.path.join(data_dir, "test_tiles", "masks")
    os.makedirs(masks_dir, exist_ok=True)

    masks = [
        os.path.join(masks_dir,
                     fs.id_from_fname(image_fname) + ".png")
        for image_fname in images
    ]

    if augmentation == "hard":
        transfrom = A.Compose(
            [crop_transform(image_size, input_size=768),
             hard_augmentations()])
    elif augmentation == "medium":
        transfrom = A.Compose([
            crop_transform(image_size, input_size=768),
            medium_augmentations()
        ])
    elif augmentation == "light":
        transfrom = A.Compose([
            crop_transform(image_size, input_size=768),
            light_augmentations()
        ])
    else:
        transfrom = A.Normalize()

    return InriaImageMaskDataset(
        images,
        masks if include_masks else None,
        transform=transform,
        image_loader=read_inria_image,
        mask_loader=read_inria_mask_with_pseudolabel,
        need_weight_mask=need_weight_mask,
    )
Example #15
    def __getitem__(self, item):
        image = cv2.imread(self.images[item])  # Read with OpenCV instead of PIL; it's faster
        if image is None:
            raise FileNotFoundError(self.images[item])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        height, width = image.shape[:2]
        diagnosis = UNLABELED_CLASS
        if self.targets is not None:
            diagnosis = self.targets[item]

        data = self.transform(image=image, diagnosis=diagnosis)
        diagnosis = data['diagnosis']
        data = {'image': tensor_from_rgb_image(data['image']),
                'image_id': id_from_fname(self.images[item])}

        if self.meta_features:
            log_height = math.log(height)
            log_width = math.log(width)
            aspect_ratio = log_height / log_width
            mean = np.mean(image, axis=(0, 1))

            meta_features = np.array([
                log_height,
                log_width,
                aspect_ratio,
                mean[0],
                mean[1],
                mean[2]
            ])
            data['meta_features'] = meta_features

        diagnosis = self.dtype(diagnosis)
        if self.target_as_array:
            data['targets'] = np.array([diagnosis])
        else:
            data['targets'] = diagnosis

        return data
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))

    args = parser.parse_args()
    data_dir = args.data_dir

    cover = os.path.join(data_dir, "Cover")
    JMiPOD = os.path.join(data_dir, "JMiPOD")
    JUNIWARD = os.path.join(data_dir, "JUNIWARD")
    UERD = os.path.join(data_dir, "UERD")

    dataset = (
        fs.find_images_in_dir(cover)
        + fs.find_images_in_dir(JMiPOD)
        + fs.find_images_in_dir(JUNIWARD)
        + fs.find_images_in_dir(UERD)
    )
    # dataset = dataset[:500]
    df = defaultdict(list)
    for image_fname in tqdm(dataset):
        target = target_from_fname(image_fname)
        dct_fname = fs.change_extension(image_fname, ".npz")
        dct_data = np.load(dct_fname)
        qm0 = dct_data["qm0"]
        qm1 = dct_data["qm1"]
        qf = quality_factror_from_qm(qm0)
        fsize = os.stat(image_fname).st_size

        df["image_id"].append(fs.id_from_fname(image_fname))
        df["target"].append(target)
        df["quality"].append(qf)
        df["qm0"].append(qm0.flatten().tolist())
        df["qm1"].append(qm1.flatten().tolist())
        df["file_size"].append(fsize)

    df = pd.DataFrame.from_dict(df)
    df.to_csv("dataset_qf_qt.csv", index=False)
Example #17
    def __getitem__(self, item):
        image = cv2.imread(self.images[item])  # Read with OpenCV instead of PIL; it's faster
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        height, width = image.shape[:2]

        original = self.normalize(image=image)['image']
        transformed = self.transform(image=image)['image']

        data = {'image': tensor_from_rgb_image(transformed),
                'original': tensor_from_rgb_image(original),
                'image_id': id_from_fname(self.images[item])}

        if self.meta_features:
            log_height = math.log(height)
            log_width = math.log(width)
            aspect_ratio = log_height / log_width
            mean = np.mean(image, axis=(0, 1))

            meta_features = np.array([
                log_height,
                log_width,
                aspect_ratio,
                mean[0],
                mean[1],
                mean[2]
            ])

            data['meta_features'] = meta_features

        if self.targets is not None:
            target = self.dtype(self.targets[item])
            if self.target_as_array:
                data['targets'] = np.array([target])
            else:
                data['targets'] = target

        return data
Example #18
def test_evaluate_model_v2(train, validation):
    num_models = len(train)
    ids, train_x, train_y_true, train_y_average = prepare_inference_datasets(
        train, use_features=False, use_predictions=True)

    ids, valid_x, valid_y_true, valid_y_average = prepare_inference_datasets(
        validation, use_features=False, use_predictions=True)

    for i in range(num_models):
        print(
            fs.id_from_fname(train[i]),
            cohen_kappa_score(train_y_true,
                              regression_to_class(train_x[:, i]),
                              weights='quadratic'),
            cohen_kappa_score(valid_y_true,
                              regression_to_class(valid_x[:, i]),
                              weights='quadratic'),
        )

    print(
        'Averaged',
        cohen_kappa_score(train_y_true,
                          regression_to_class(train_y_average),
                          weights='quadratic'),
        cohen_kappa_score(valid_y_true,
                          regression_to_class(valid_y_average),
                          weights='quadratic'))

    print(
        'Median  ',
        cohen_kappa_score(train_y_true,
                          regression_to_class(np.median(train_x, axis=1)),
                          weights='quadratic'),
        cohen_kappa_score(valid_y_true,
                          regression_to_class(np.median(valid_x, axis=1)),
                          weights='quadratic'))

    print(
        'TrimMean',
        cohen_kappa_score(train_y_true,
                          regression_to_class(
                              trim_mean(train_x, proportiontocut=0.1, axis=1)),
                          weights='quadratic'),
        cohen_kappa_score(valid_y_true,
                          regression_to_class(
                              trim_mean(valid_x, proportiontocut=0.1, axis=1)),
                          weights='quadratic'))

    rounder = OptimizedRounder()
    rounder.fit(train_y_average, train_y_true)

    print(rounder.coefficients())
    print(
        'Optimized',
        cohen_kappa_score(train_y_true,
                          rounder.predict(train_y_average,
                                          rounder.coefficients()),
                          weights='quadratic'),
        cohen_kappa_score(valid_y_true,
                          rounder.predict(valid_y_average,
                                          rounder.coefficients()),
                          weights='quadratic'))
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoints", nargs="+")
    parser.add_argument("-w", "--workers", type=int, default=1, help="")
    parser.add_argument("-dd", "--data-dir", type=str, default="data", help="Data directory")
    parser.add_argument("-a", "--activation", type=str, default="pre", help="")
    args = parser.parse_args()

    targets = fs.find_in_dir(os.path.join(args.data_dir, "tier3", "masks")) + fs.find_in_dir(
        os.path.join(args.data_dir, "train", "masks")
    )
    targets_post = dict((fs.id_from_fname(fname), fname) for fname in targets if "_post_" in fname)

    df = defaultdict(list)

    current_time = datetime.now().strftime("%b%d_%H_%M")

    print("Checkpoints ", args.checkpoints)
    print("Activation  ", args.activation)

    for model_checkpoint in args.checkpoints:
        model_checkpoint = fs.auto_file(model_checkpoint)
        predictions_dir = os.path.join(
            os.path.dirname(model_checkpoint), fs.id_from_fname(model_checkpoint) + "_oof_predictions"
        )

        prediction_files = fs.find_in_dir(predictions_dir)
        prediction_files_post = dict(
            (fs.id_from_fname(fname), fname) for fname in prediction_files if "_post_" in fname
        )

        y_true_filenames = [targets_post[image_id_post] for image_id_post in prediction_files_post.keys()]
        y_pred_filenames = [prediction_files_post[image_id_post] for image_id_post in prediction_files_post.keys()]

        rounder = OptimizedRounder(workers=args.workers, apply_softmax=args.activation)

        raw_score, raw_localization_f1, raw_damage_f1, raw_damage_f1s = rounder.predict(
            y_pred_filenames, y_true_filenames, np.array([1, 1, 1, 1, 1], dtype=np.float32)
        )

        rounder.fit(y_pred_filenames, y_true_filenames)

        score, localization_f1, damage_f1, damage_f1s = rounder.predict(
            y_pred_filenames, y_true_filenames, rounder.coefficients()
        )

        print(rounder.coefficients())

        df["checkpoint"].append(fs.id_from_fname(model_checkpoint))
        df["coefficients"].append(rounder.coefficients())
        df["samples"].append(len(y_true_filenames))

        df["raw_score"].append(raw_score)
        df["raw_localization"].append(raw_localization_f1)
        df["raw_damage"].append(raw_damage_f1)

        df["opt_score"].append(score)
        df["opt_localization"].append(localization_f1)
        df["opt_damage"].append(damage_f1)

        dataframe = pd.DataFrame.from_dict(df)
        dataframe.to_csv(f"optimized_weights_{current_time}.csv", index=None)
        print(df)
Example #20
def convert_dir(df: pd.DataFrame, dir) -> pd.DataFrame:
    crops_dir = os.path.join(dir, "crops")
    os.makedirs(crops_dir, exist_ok=True)

    building_crops = []

    global_crop_index = 0

    for i, row in tqdm(df.iterrows(), total=len(df)):
        image_fname_pre = read_image(os.path.join(dir, row["image_fname_pre"]))
        image_fname_post = read_image(
            os.path.join(dir, row["image_fname_post"]))

        mask_fname_post = row["mask_fname_post"]
        json_fname_post = fs.change_extension(
            mask_fname_post.replace("masks", "labels"), ".json")
        inference_data = open_json(os.path.join(dir, json_fname_post))
        instance_image, labels = create_instance_image(inference_data)

        for label_index, damage_label in zip(
                range(1, instance_image.max() + 1), labels):
            try:
                instance_mask = instance_image == label_index
                rmin, rmax, cmin, cmax = bbox1(instance_mask)

                max_size = max(rmax - rmin, cmax - cmin)
                if max_size < 16:
                    print("Skipping crop since it's too small",
                          fs.id_from_fname(mask_fname_post), "label_index",
                          label_index, "min_size", max_size)
                    continue

                rpadding = (rmax - rmin) // 4
                cpadding = (cmax - cmin) // 4

                pre_crop = image_fname_pre[
                    max(0, rmin - rpadding):rmax + rpadding,
                    max(0, cmin - cpadding):cmax + cpadding]
                post_crop = image_fname_post[
                    max(0, rmin - rpadding):rmax + rpadding,
                    max(0, cmin - cpadding):cmax + cpadding]

                image_id_pre = row["image_id_pre"]
                image_id_post = row["image_id_post"]

                pre_crop_fname = f"{global_crop_index:06}_{image_id_pre}.png"
                post_crop_fname = f"{global_crop_index:06}_{image_id_post}.png"
                global_crop_index += 1

                cv2.imwrite(os.path.join(crops_dir, pre_crop_fname), pre_crop)
                cv2.imwrite(os.path.join(crops_dir, post_crop_fname),
                            post_crop)

                building_crops.append({
                    "pre_crop_fname": pre_crop_fname,
                    "post_crop": post_crop_fname,
                    "label": damage_label,
                    "event_name": row["event_name_post"],
                    "fold": row["fold_post"],
                    "rmin": rmin,
                    "rmax": rmax,
                    "cmin": cmin,
                    "cmax": cmax,
                    "max_size": max_size,
                    "rpadding": rpadding,
                    "cpadding": cpadding
                })
            except Exception as e:
                print(e)
                print(mask_fname_post)

    df = pd.DataFrame.from_records(building_crops)
    return df
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("experiments", nargs="+", type=str)
    parser.add_argument("-o", "--output", type=str, required=False)
    parser.add_argument("-dd",
                        "--data-dir",
                        type=str,
                        default=os.environ.get("KAGGLE_2020_ALASKA2"))
    args = parser.parse_args()

    output_dir = os.path.dirname(__file__)
    data_dir = args.data_dir
    experiments = args.experiments
    output_file = args.output

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout",
                                              "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids_h = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(),
                          3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(),
                          3).numpy().astype(np.float32)

    with_logits = True
    x, y = get_x_y_for_stacking(holdout_predictions,
                                with_logits=with_logits,
                                tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions,
                                     with_logits=with_logits,
                                     tta_logits=with_logits)
    print(x_test.shape)

    if False:
        image_fnames_h = [
            os.path.join(data_dir, INDEX_TO_METHOD[method], f"{image_id}.jpg")
            for (image_id, method) in zip(image_ids_h, y)
        ]
        test_image_ids = pd.read_csv(test_predictions[0]).image_id.tolist()
        image_fnames_t = [
            os.path.join(data_dir, "Test", image_id)
            for image_id in test_image_ids
        ]

        entropy_t = compute_image_features(image_fnames_t)
        x_test = np.column_stack([x_test, entropy_t])

        # entropy_h = entropy_t.copy()
        # x = x_test.copy()

        entropy_h = compute_image_features(image_fnames_h)
        x = np.column_stack([x, entropy_h])
        print("Added image features", entropy_h.shape, entropy_t.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    params = {
        "min_child_weight": [1, 5, 10],
        "gamma": [1e-3, 1e-2, 1e-2, 0.5, 2],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "max_depth": [2, 3, 4, 5, 6],
        "n_estimators": [16, 32, 64, 128, 256, 1000],
        "learning_rate": [0.001, 0.01, 0.05, 0.2, 1],
    }

    xgb = XGBClassifier(objective="binary:logistic", nthread=1)

    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=params,
        scoring=make_scorer(alaska_weighted_auc,
                            greater_is_better=True,
                            needs_proba=True),
        n_jobs=4,
        n_iter=25,
        cv=group_kfold.split(x, y, groups=image_ids_h),
        verbose=3,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)
    results = pd.DataFrame(random_search.cv_results_)
    results.to_csv("xgb-random-grid-search-results-01.csv", index=False)

    test_pred = random_search.predict_proba(x_test)[:, 1]

    if output_file is None:
        with_logits_sfx = "_with_logits" if with_logits else ""
        submit_fname = os.path.join(
            output_dir,
            f"xgb_cls_gs_{random_search.best_score_:.4f}_{checksum}{with_logits_sfx}.csv"
        )
    else:
        submit_fname = output_file

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)

    import json

    with open(fs.change_extension(submit_fname, ".json"), "w") as f:
        json.dump(random_search.best_params_, f, indent=2)
Example #22
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout",
                                              "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    fnames_for_checksum = [x + "cauc" for x in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    print("Unique image ids", len(np.unique(image_ids)))
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(),
                          3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(),
                          3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    test_dmatrix = xgb.DMatrix(x_test)

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    params = {
        "base_score": 0.5,
        "booster": "gblinear",
        # "booster": "gbtree",
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 1,
        # "gamma": 1.0,
        "learning_rate": 0.01,
        "max_delta_step": 0,
        "objective": "binary:logistic",
        "eta": 0.1,
        "reg_lambda": 0,
        "subsample": 0.8,
        "scale_pos_weight": 1,
        "min_child_weight": 2,
        "max_depth": 5,
        "tree_method": "exact",
        "seed": 42,
        "alpha": 0.01,
        "lambda": 0.01,
        "n_estimators": 256,
        "gamma": 0.01,
        "disable_default_eval_metric": 1,
        # "eval_metric": "wauc",
    }

    for fold_index, (train_index, valid_index) in enumerate(
            group_kfold.split(x, y, groups=image_ids)):
        x_train, x_valid, y_train, y_valid = (x[train_index], x[valid_index],
                                              y[train_index], y[valid_index])

        train_dmatrix = xgb.DMatrix(x_train.copy(), y_train.copy())
        valid_dmatrix = xgb.DMatrix(x_valid.copy(), y_valid.copy())

        xgb_model = xgb.train(
            params,
            train_dmatrix,
            num_boost_round=5000,
            verbose_eval=True,
            feval=xgb_weighted_auc,
            maximize=True,
            evals=[(valid_dmatrix, "validation")],
        )

        y_valid_pred = xgb_model.predict(valid_dmatrix)
        score = alaska_weighted_auc(y_valid, y_valid_pred)

        cv_scores.append(score)

        if test_pred is not None:
            test_pred += xgb_model.predict(test_dmatrix) * one_over_n
        else:
            test_pred = xgb_model.predict(test_dmatrix) * one_over_n

    for s in cv_scores:
        print(s)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(
        output_dir, f"xgb_{np.mean(cv_scores):.4f}_{checksum}_.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
Example #23
def evaluate_generalization(checkpoints, num_folds=4):
    num_datasets = len(checkpoints)
    # kappa_matrix = np.zeros((num_datasets, num_datasets), dtype=np.float32)
    class_names = list(checkpoints.keys())

    # results = {}

    for dataset_trained_on, checkpoints_per_fold in checkpoints.items():
        # For each dataset trained on

        for fold_trained_on, checkpoint_file in enumerate(
                checkpoints_per_fold):
            # For each checkpoint
            if checkpoint_file is None:
                continue

            # Load model
            checkpoint = torch.load(checkpoint_file)
            model_name = checkpoint['checkpoint_data']['cmd_args']['model']
            batch_size = 16  # checkpoint['checkpoint_data']['cmd_args']['batch_size']
            num_classes = len(get_class_names())
            model = get_model(model_name,
                              pretrained=False,
                              num_classes=num_classes)
            model.load_state_dict(checkpoint['model_state_dict'])
            model = model.eval().cuda()
            if torch.cuda.device_count() > 1:
                model = nn.DataParallel(
                    model,
                    device_ids=list(range(torch.cuda.device_count())))

            for dataset_index, dataset_validate_on in enumerate(class_names):
                # For each available dataset

                for fold_validate_on in range(num_folds):
                    _, valid_ds, _ = get_datasets(
                        use_aptos2015=dataset_validate_on == 'aptos2015',
                        use_aptos2019=dataset_validate_on == 'aptos2019',
                        use_messidor=dataset_validate_on == 'messidor',
                        use_idrid=dataset_validate_on == 'idrid',
                        fold=fold_validate_on,
                        folds=num_folds)

                    data_loader = DataLoader(valid_ds,
                                             batch_size *
                                             torch.cuda.device_count(),
                                             pin_memory=True,
                                             num_workers=8)

                    predictions = defaultdict(list)
                    for batch in tqdm(
                            data_loader,
                            desc=f'Evaluating {dataset_validate_on} fold {fold_validate_on} on {checkpoint_file}'):
                        input = batch['image'].cuda(non_blocking=True)
                        outputs = model(input)
                        logits = to_numpy(outputs['logits'].softmax(dim=1))
                        regression = to_numpy(outputs['regression'])
                        features = to_numpy(outputs['features'])

                        predictions['image_id'].extend(batch['image_id'])
                        predictions['diagnosis_true'].extend(
                            to_numpy(batch['targets']))
                        predictions['logits'].extend(logits)
                        predictions['regression'].extend(regression)
                        predictions['features'].extend(features)

                    pickle_name = id_from_fname(
                        checkpoint_file
                    ) + f'_on_{dataset_validate_on}_fold{fold_validate_on}.pkl'

                    df = pd.DataFrame.from_dict(predictions)
                    df.to_pickle(pickle_name)
Example #24
def model_from_checkpoint(model_checkpoint: str,
                          tta: Optional[str] = None,
                          activation_after="model",
                          model=None,
                          report=True,
                          classifiers=True) -> Tuple[nn.Module, Dict]:
    checkpoint = torch.load(model_checkpoint, map_location="cpu")
    model_name = model or checkpoint["checkpoint_data"]["cmd_args"]["model"]

    score = float(checkpoint["epoch_metrics"]["valid"]["weighted_f1"])
    loc = float(
        checkpoint["epoch_metrics"]["valid"]["weighted_f1/localization_f1"])
    dmg = float(checkpoint["epoch_metrics"]["valid"]["weighted_f1/damage_f1"])
    fold = int(checkpoint["checkpoint_data"]["cmd_args"]["fold"])

    if report:
        print(model_checkpoint, model_name)
        report_checkpoint(checkpoint)

    model = get_model(model_name, pretrained=False, classifiers=classifiers)

    model.load_state_dict(checkpoint["model_state_dict"], strict=False)
    del checkpoint

    if activation_after == "model":
        model = ApplySoftmaxTo(model, OUTPUT_MASK_KEY)

    if tta == "multiscale":
        print(f"Using {tta}")
        model = MultiscaleTTA(model,
                              outputs=[OUTPUT_MASK_KEY],
                              size_offsets=[-256, -128, +128, +256],
                              average=True)

    if tta == "flip":
        print(f"Using {tta}")
        model = HFlipTTA(model, outputs=[OUTPUT_MASK_KEY], average=True)

    if tta == "flipscale":
        print(f"Using {tta}")
        model = HFlipTTA(model, outputs=[OUTPUT_MASK_KEY], average=True)
        model = MultiscaleTTA(model,
                              outputs=[OUTPUT_MASK_KEY],
                              size_offsets=[-256, -128, +128, +256],
                              average=True)

    if tta == "multiscale_d4":
        print(f"Using {tta}")
        model = D4TTA(model, outputs=[OUTPUT_MASK_KEY], average=True)
        model = MultiscaleTTA(model,
                              outputs=[OUTPUT_MASK_KEY],
                              size_offsets=[-256, -128, +128, +256],
                              average=True)

    if activation_after == "tta":
        model = ApplySoftmaxTo(model, OUTPUT_MASK_KEY)

    info = {
        "model": fs.id_from_fname(model_checkpoint),
        "model_name": model_name,
        "fold": fold,
        "score": score,
        "localization": loc,
        "damage": dmg,
    }
    return model, info
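A hypothetical call with a placeholder checkpoint path, showing how the returned (model, info) pair is typically consumed:

# Hypothetical usage; the checkpoint path is a placeholder.
model, info = model_from_checkpoint(
    "runs/fold0/checkpoints/best.pth",
    tta="flipscale",
    activation_after="tta",
)
print(info["model"], info["fold"], info["score"])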
Example #25
import os
import numpy as np
from pytorch_toolbelt.utils import fs
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
import pandas as pd

submissions = [x for x in fs.find_in_dir(".") if x.endswith(".csv")]
names = list(map(lambda x: fs.id_from_fname(x)[:32], submissions))
submissions = [pd.read_csv(x).sort_values(by="Id").reset_index() for x in submissions]

cm = np.zeros((len(submissions), len(submissions)))
for i in range(len(submissions)):
    for j in range(len(submissions)):
        cm[i, j] = spearmanr(submissions[i].Label, submissions[j].Label).correlation

print(cm)

plt.figure(figsize=(10 + len(submissions), 10 + len(submissions)))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=names)
disp.plot(include_values=True, cmap="Blues", ax=plt.gca(), xticks_rotation="45")
plt.tight_layout()
plt.show()
Example #26
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout",
                                              "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]

    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(),
                          3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(),
                          3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions,
                                with_logits=True,
                                tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions,
                                     with_logits=True,
                                     tta_logits=True)
    print(x_test.shape)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    for fold_index, (train_index, valid_index) in enumerate(
            group_kfold.split(x, y, groups=image_ids)):
        x_train, x_valid, y_train, y_valid = (x[train_index], x[valid_index],
                                              y[train_index], y[valid_index])

        clf = LazyClassifier(verbose=True,
                             ignore_warnings=False,
                             custom_metric=alaska_weighted_auc,
                             predictions=True)
        models, predictions = clf.fit(x_train, x_valid, y_train, y_valid)
        print(models)

        models.to_csv(
            os.path.join(output_dir,
                         f"lazypredict_models_{fold_index}_{checksum}.csv"))
        predictions.to_csv(
            os.path.join(output_dir,
                         f"lazypredict_preds_{fold_index}_{checksum}.csv"))
Example #27
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout",
                                              "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]

    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(),
                          3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(),
                          3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions,
                                with_logits=True,
                                tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions,
                                     with_logits=True,
                                     tta_logits=True)
    print(x_test.shape)

    if False:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, x_valid, y_train, y_valid = (x[train_index], x[valid_index],
                                              y[train_index], y[valid_index])
        print(np.bincount(y_train), np.bincount(y_valid))

        # cls = LinearDiscriminantAnalysis()
        cls = LinearDiscriminantAnalysis(solver="lsqr",
                                         shrinkage="auto",
                                         priors=[0.5, 0.5])
        cls.fit(x_train, y_train)

        y_valid_pred = cls.predict_proba(x_valid)[:, 1]
        score = alaska_weighted_auc(y_valid, y_valid_pred)
        cv_scores.append(score)

        if test_pred is not None:
            test_pred += cls.predict_proba(x_test)[:, 1] * one_over_n
        else:
            test_pred = cls.predict_proba(x_test)[:, 1] * one_over_n

    for s in cv_scores:
        print(s)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(
        output_dir, f"lda_{np.mean(cv_scores):.4f}_{checksum}.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
Example #28
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        #
        "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16"
        #
        #
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout",
                                              "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]

    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(),
                          3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(),
                          3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions,
                                with_logits=with_logits,
                                tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions,
                                     with_logits=with_logits,
                                     tta_logits=with_logits)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, x_valid, y_train, y_valid = (x[train_index], x[valid_index],
                                              y[train_index], y[valid_index])
        print(np.bincount(y_train), np.bincount(y_valid))

        cls = XGBClassifier(
            base_score=0.5,
            booster="gbtree",
            colsample_bylevel=1,
            colsample_bynode=1,
            colsample_bytree=0.6,
            gamma=0.5,
            gpu_id=-1,
            importance_type="gain",
            interaction_constraints="",
            learning_rate=0.01,
            max_delta_step=0,
            max_depth=3,
            min_child_weight=10,
            # missing=nan,
            monotone_constraints="()",
            n_estimators=1000,
            n_jobs=8,
            nthread=1,
            num_parallel_tree=1,
            objective="binary:logistic",
            random_state=0,
            reg_alpha=0,
            reg_lambda=1,
            scale_pos_weight=1,
            silent=True,
            subsample=0.8,
            tree_method="exact",
            validate_parameters=1,
            verbosity=2,
        )

        cls.fit(x_train, y_train)

        y_valid_pred = cls.predict_proba(x_valid)[:, 1]
        score = alaska_weighted_auc(y_valid, y_valid_pred)
        cv_scores.append(score)

        if test_pred is not None:
            test_pred += cls.predict_proba(x_test)[:, 1] * one_over_n
        else:
            test_pred = cls.predict_proba(x_test)[:, 1] * one_over_n

    for s in cv_scores:
        print(s)
    print(np.mean(cv_scores), np.std(cv_scores))

    with_logits_sfx = "_with_logits" if with_logits else ""

    submit_fname = os.path.join(
        output_dir,
        f"xgb_cls_{np.mean(cv_scores):.4f}_{checksum}{with_logits_sfx}.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
Example #29
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("predictions", nargs="+")
    parser.add_argument("-w", "--workers", type=int, default=0, help="")
    parser.add_argument("-dd",
                        "--data-dir",
                        type=str,
                        default="data",
                        help="Data directory")
    args = parser.parse_args()

    targets = fs.find_in_dir(os.path.join(
        args.data_dir, "tier3", "masks")) + fs.find_in_dir(
            os.path.join(args.data_dir, "train", "masks"))
    targets_post = dict((fs.id_from_fname(fname), fname) for fname in targets
                        if "_post_" in fname)

    df = defaultdict(list)

    postprocessings = {
        "naive": make_predictions_naive,
        "dominant": make_predictions_dominant,
        "floodfill": make_predictions_floodfill,
    }

    for predictions_dir in args.predictions:
        try:
            prediction_files = fs.find_in_dir(predictions_dir)
            prediction_files_post = dict((fs.id_from_fname(fname), fname)
                                         for fname in prediction_files
                                         if "_post_" in fname)

            y_true_filenames = [
                targets_post[image_id_post]
                for image_id_post in prediction_files_post.keys()
            ]
            y_pred_filenames = [
                prediction_files_post[image_id_post]
                for image_id_post in prediction_files_post.keys()
            ]

            for name, fn in postprocessings.items():
                score, localization_f1, damage_f1, damage_f1s = optimize_postprocessing(
                    y_pred_filenames,
                    y_true_filenames,
                    postprocessing_fn=fn,
                    workers=args.workers)

                print(name, score)

                df["samples"].append(len(y_pred_filenames))
                df["predictions_dir"].append(predictions_dir)
                df["postprocessing"].append(name)
                df["score"].append(score)
                df["localization_f1"].append(localization_f1)
                df["damage_f1"].append(damage_f1)
        except Exception as e:
            print("Failed to process", predictions_dir, e)

    df = pd.DataFrame.from_dict(df)
    print(df)

    current_time = datetime.now().strftime("%b%d_%H_%M")

    df.to_csv(f"postprocessing_eval_{current_time}.csv", index=None)
Example #30
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        #
        "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16"
        #
        #
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout",
                                              "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]

    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(),
                          3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(),
                          3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions,
                                with_logits=with_logits,
                                tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions,
                                     with_logits=with_logits,
                                     tta_logits=with_logits)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    params = {
        "boosting_type": ["gbdt", "dart", "rf", "goss"],
        "num_leaves": [16, 32, 64, 128],
        "reg_alpha": [0, 0.01, 0.1, 0.5],
        "reg_lambda": [0, 0.01, 0.1, 0.5],
        "learning_rate": [0.001, 0.01, 0.1, 0.5],
        "n_estimators": [32, 64, 126, 512],
        "max_depth": [2, 4, 8],
        "min_child_samples": [20, 40, 80, 100],
    }

    lgb_estimator = lgb.LGBMClassifier(objective="binary", silent=True)

    random_search = RandomizedSearchCV(
        lgb_estimator,
        param_distributions=params,
        scoring=make_scorer(alaska_weighted_auc,
                            greater_is_better=True,
                            needs_proba=True),
        n_jobs=3,
        n_iter=50,
        cv=group_kfold.split(x, y, groups=image_ids),
        verbose=2,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    test_pred = random_search.predict_proba(x_test)[:, 1]
    print(test_pred)

    submit_fname = os.path.join(
        output_dir, f"lgbm_gs_{random_search.best_score_:.4f}_{checksum}.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)
    results = pd.DataFrame(random_search.cv_results_)
    results.to_csv("lgbm-random-grid-search-results-01.csv", index=False)