Example #1
def prepare_csv():
    paths = get_paths()
    dataset_path = paths["train_data.path"]
    print(paths)

    df_map = pd.read_csv(
        osp.join(dataset_path, "annotation/train_metadata.csv"))
    df_label = pd.read_csv(
        osp.join(dataset_path, "annotation/train_labels.csv"))
    labels = list(df_label.columns)[1:]
    print(labels)

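    # Collect (image_path, label) pairs, resolving each frame to whichever
    # 512_{season}_{part} shard actually contains it.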
    pairs = []
    for label in tqdm(labels, total=len(labels)):
        tdf_label = df_label[df_label[label] == 1]
        tdf_map = df_map[df_map["seq_id"].isin(tdf_label["seq_id"])]

        for stem in tdf_map["file_name"]:
            season = stem.split("/")[0]
            for part in range(1, 6):
                stem_fn = stem.replace(f"{season}/", f"512_{season}_{part}/")

                in_file = osp.join(dataset_path, stem_fn)
                if osp.exists(in_file):
                    pairs.append((in_file, label))

    file_paths, file_labels = zip(*pairs)
    df = pd.DataFrame()
    df["path"], df["label"] = file_paths, file_labels
    df.to_csv("../tables/data_tmp.csv", index=False)
Example #2
def filter_filelist():
    paths = get_paths()
    dataset_path = paths["train_data.path"]

    df_meta = pd.read_csv(osp.join(dataset_path, "annotation/file_list.csv"))
    print(df_meta.head())

    df_loss = pd.read_csv("tmp/train_loss.csv")
    print(df_loss.shape)
    print(df_loss.head())

    thr_loss = df_loss.loc[int(0.8 * df_loss.shape[0]), "loss"]  # observed value: 0.0012901991140097382
    print(thr_loss)

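    # Keep every hard (high-loss) sample but only a 10% subsample of the easy ones.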
    high = df_loss[df_loss["loss"] > thr_loss]
    low = df_loss[df_loss["loss"] < thr_loss]

    low = low.sample(frac=0.1)

    df_loss = pd.concat([low, high])
    print(df_loss.shape)

    df_filter = df_meta[df_meta["file_name"].isin(df_loss["ids"])]
    print(df_filter.shape)
    df_filter.to_csv(osp.join(dataset_path, "annotation/file_list_filter.csv"), index=False)
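
Note: the positional lookup int(0.8 * df.shape[0]) only yields the 80th-percentile loss if tmp/train_loss.csv is sorted ascending by loss. A minimal sketch of the same keep-the-hard-samples split on synthetic data (column names ids/loss mirror the snippet; everything else is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"ids": range(100), "loss": np.sort(np.random.rand(100))})
thr = df.loc[int(0.8 * df.shape[0]), "loss"]  # 80th-percentile loss
easy, hard = df[df["loss"] < thr], df[df["loss"] > thr]
kept = pd.concat([easy.sample(frac=0.1), hard])  # 10% of easy + all hard samples
print(kept.shape)  # (27, 2)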
Example #3
def prepare_paths(df_path):
    paths = get_paths()
    dataset_path = paths["data.path"]

    df_map = pd.read_csv(osp.join(dataset_path, "annotation/train_metadata.csv"))
    df_labels = pd.read_csv(osp.join(dataset_path, "annotation/train_labels.csv"))
    print(df_labels.head())

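    # Resolve each frame to an existing 512_{season}_{i} shard; the break keeps
    # only the first shard that contains the file.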
    out = []
    for file_name, seq_id in tqdm(zip(df_map["file_name"], df_map["seq_id"]), total=df_map.shape[0]):
        for i in range(1, 5):
            season = file_name.split("/")[0]
            full_path = file_name.replace(f"{season}/", f"512_{season}_{i}/")
            full_path = osp.join(dataset_path, full_path)
            if osp.exists(full_path):
                out.append((full_path.replace(f"{dataset_path}/", ""), seq_id))
                break

    filenames, seqs = zip(*out)
    df = pd.DataFrame()
    df["file_name"] = filenames
    df["seq_id"] = seqs
    df["label"] = [0] * df.shape[0]

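    # Label scheme: 0 = common species, 1 = empty frame, 2 = rare species (RARE).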
    tdf = df_labels[df_labels["empty"] == 1]
    df.loc[df["seq_id"].isin(tdf["seq_id"]), "label"] = 1

    for label in RARE:
        tdf = df_labels[df_labels[label] == 1]
        df.loc[df["seq_id"].isin(tdf["seq_id"]), "label"] = 2

    print(df.head(100))
    print(df.shape)
    print(np.sum(df["label"].values))
    df.to_csv(df_path, index=False)
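
RARE is a module-level constant that this snippet does not define; from the usage it is a list of rare species column names in train_labels.csv. A hypothetical stand-in:

RARE = ["aardwolf", "bat", "caracal"]  # hypothetical placeholder; the real list is defined elsewhere in the module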
Example #4
def check_filelist():
    paths = get_paths()
    dataset_path = paths["train_data.path"]

    df_meta = pd.read_csv(osp.join(dataset_path, "annotation/file_list.csv"))
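    # "labels" stores space-separated class indices (built in Example #8), so a
    # space marks a multi-label sequence.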
    df_meta = df_meta[df_meta["labels"].str.contains(" ")]
    print(df_meta.head())
    print(df_meta.shape)
Example #5
def check_iter():
    paths = get_paths()
    vloader = InferLoader(mode="val",
                          path=paths["train_data.path"],
                          batch_size=1)
    print(len(vloader))
    for idx, batch in enumerate(vloader):
        print(batch["seq_id"])
        print(batch["images"][0].shape)
        print(batch["label"])

        break
Example #6
def make_tst():
    path = get_paths()["train_data.path"]
    df_path = osp.join(path, "annotation/all_paths.csv")
    df_meta = pd.read_csv(df_path)
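    # Season S10 serves as the held-out split (same convention as Examples #9 and #10).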
    valid = df_meta[df_meta["file_name"].str.contains("_S10_")]

    valid = valid.head(1000)

    valid.to_csv(osp.join(path, "test_metadata.csv"), index=False)
    valid = valid.drop_duplicates("seq_id")

    sample = pd.DataFrame()
    sample["seq_id"] = valid["seq_id"]
    for label in LABELS:
        sample[label] = [0] * sample.shape[0]

    sample.to_csv(osp.join(path, "submission_format.csv"), index=False)
Example #7
def main():
    path = get_paths()
    val_dataloader = HakunaPrefetchedLoader(mode="val", path=path["train_data.path"], long_side=256, batch_size=8)
    print(len(val_dataloader))

    for idx, batch in enumerate(val_dataloader):
        images, targets = batch
        images = images.cpu().numpy()
        targets = targets.cpu().numpy()
        plt.figure()
        for i in range(images.shape[0]):
            plt.subplot(2, 4, i + 1)  # 2x4 grid matches batch_size=8 above
            # CHW -> HWC, then undo the ImageNet normalization; clip before the
            # uint8 cast so out-of-range values don't wrap around.
            image = np.transpose(images[i], (1, 2, 0))
            image_show = np.uint8(255 * np.clip(IMAGENET_STD * image + IMAGENET_MEAN, 0, 1))
            plt.title(np.argmax(targets[i]))
            plt.imshow(image_show)

        plt.show()
Example #8
def prepare_filelist():
    paths = get_paths()
    dataset_path = paths["train_data.path"]

    df_meta = pd.read_csv(osp.join(dataset_path, "annotation/all_paths.csv"))
    df_labels = pd.read_csv(osp.join(dataset_path, "annotation/train_labels.csv"))
    df_labels = df_labels[df_labels["seq_id"].isin(df_meta["seq_id"])]

    # Map seq_id -> row index into the label matrix below.
    seq2index = {seq: n for n, seq in enumerate(df_labels["seq_id"])}
    labels_arr = df_labels[LABELS].values

    index_arr = np.nonzero(labels_arr)  # (row_indices, col_indices) of positive labels
    print(index_arr[0][:10], index_arr[1][:10])
    print([seq2index.get(seq) for seq in df_meta["seq_id"][:10]])
    # Tuple of positive class indices per sequence; joined into the "labels" column below.
    tmp = [np.nonzero(labels_arr[seq2index[seq], :]) for seq in df_meta["seq_id"]]

    df_meta["labels"] = [" ".join(list(el[0].astype(str))) for el in tmp]
    print(df_meta.head())
    df_meta.to_csv(osp.join(dataset_path, "annotation/file_list.csv"), index=False)
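
A worked miniature of the label-string construction on a toy 2x3 label matrix:

import numpy as np

labels_arr = np.array([[0, 1, 0],
                       [1, 0, 1]])
rows = [np.nonzero(labels_arr[i, :]) for i in range(labels_arr.shape[0])]
print([" ".join(list(el[0].astype(str))) for el in rows])  # ['1', '0 2']

This also explains Example #4's str.contains(" ") filter: multi-label sequences are exactly the ones whose joined label string contains a space.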
Example #9
def make_val():
    path = get_paths()["train_data.path"]
    df_path = osp.join(path, "annotation/all_paths.csv")
    df_meta = pd.read_csv(df_path)
    valid = df_meta[df_meta["file_name"].str.contains("_S10_")]

    df_path = osp.join(path, "annotation/valid.csv")

    valid.to_csv(df_path, index=False)

    df_meta = pd.read_csv(df_path)  # read the split back as a sanity check
    print(df_meta)

    test_metadata = df_meta.groupby("seq_id").first().reset_index()

    print(test_metadata.head())

    groups = df_meta.groupby("seq_id")
    print(len(groups))
    for sample_id, group in groups:
        print(sample_id)
        print(group)
Example #10
def main():
    paths = get_paths()
    dataset_path = paths["data.path"]

    val_season = "S10"

    # df_map = pd.read_csv(osp.join(dataset_path, "annotation/train_metadata.csv"))
    # df_label = pd.read_csv(osp.join(dataset_path, "annotation/train_labels.csv"))

    # df_map = df_map[df_map["file_name"].str.contains(val_season)]
    # os.makedirs('tmp', exist_ok=True)
    # df_map.to_csv('tmp/val.csv', index=False)
    df_map = pd.read_csv("tmp/val.csv")
    print(df_map.head())

    out = []
    for file_name, seq_id in tqdm(zip(df_map["file_name"], df_map["seq_id"]),
                                  total=df_map.shape[0]):
        for i in range(1, 5):
            full_path = file_name.replace(f"{val_season}/",
                                          f"512_{val_season}_{i}/")
            full_path = osp.join(dataset_path, full_path)
            if osp.exists(full_path):
                out.append((full_path.replace(f"{dataset_path}/",
                                              "../"), seq_id))
                break

    filenames, seqs = zip(*out)
    df = pd.DataFrame()
    df["file_name"] = filenames
    df["seq_id"] = seqs
    print(df.head())
    print(df.shape)
    df = df.head(100000)
    df.to_csv(osp.join(dataset_path, "annotation/valid_metadata_100k.csv"),
              index=False)
Example #11
def check_paths():
    paths = get_paths()
    dataset_path = paths["train_data.path"]

    df_map = pd.read_csv(osp.join(dataset_path, "annotation/train_metadata.csv"))
    print(df_map.head())

    out = []
    for file_name, seq_id in tqdm(zip(df_map["file_name"], df_map["seq_id"]), total=df_map.shape[0]):
        for i in range(1, 5):
            season = file_name.split("/")[0]
            full_path = file_name.replace(f"{season}/", f"512_{season}_{i}/")
            full_path = osp.join(dataset_path, full_path)
            if osp.exists(full_path):
                out.append((full_path.replace(f"{dataset_path}/", ""), seq_id))
                break

    filenames, seqs = zip(*out)
    df = pd.DataFrame()
    df["file_name"] = filenames
    df["seq_id"] = seqs
    print(df.head())
    print(df.shape)
    df.to_csv(osp.join(dataset_path, "annotation/all_paths.csv"), index=False)
Example #12
import os.path as osp

import cv2
import matplotlib.pyplot as plt
import pandas as pd
from thunder_hammer.utils import get_paths

path = get_paths()
data_path = path["train_data.path"]

labels = [
    "aardvark",
    "aardwolf",
    "baboon",
    "bat",
    "batearedfox",
    "buffalo",
    "bushbuck",
    "caracal",
    "cattle",
    "cheetah",
    "civet",
    "dikdik",
    "duiker",
    "eland",
    "elephant",
    "empty",
    "gazellegrants",
    "gazellethomsons",
    "genet",
    "giraffe",
Example #13
import matplotlib.pyplot as plt
import numpy as np
import nvidia.dali.ops as ops
import pandas as pd
import torch
from nvidia import dali
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
from thunder_hammer.utils import get_paths

PATHS = get_paths()
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406])
IMAGENET_STD = np.array([0.229, 0.224, 0.225])


class ExternalInputIterator(object):
    def __init__(self, mode, data_path, batch_size):
        assert mode in ["train", "val", "test"], f"unknown mode {mode}"
        self.path = data_path
        self.batch_size = batch_size

        if mode == "train":
            ann_filename = "annotation/file_list_filter.csv"
        else:
            ann_filename = "annotation/file_list.csv"