def prepare_csv():
    """Build ../tables/data_tmp.csv mapping resized image paths to labels.

    For every label column in train_labels.csv, collects the image files of
    the sequences carrying that label, resolving each relative path against
    the 512_<season>_<part> directories (parts 1-5) on disk.  A file found
    in several part directories is recorded once per part, matching the
    original behavior (no break after the first hit).
    """
    paths = get_paths()
    dataset_path = paths["train_data.path"]
    print(paths)
    df_map = pd.read_csv(osp.join(dataset_path, "annotation/train_metadata.csv"))
    df_label = pd.read_csv(osp.join(dataset_path, "annotation/train_labels.csv"))
    labels = list(df_label.columns)[1:]  # first column is seq_id
    print(labels)
    pairs = []
    for label in tqdm(labels):  # total is inferred automatically for a list
        tdf_label = df_label[df_label[label] == 1]
        tdf_map = df_map[df_map["seq_id"].isin(tdf_label["seq_id"])]
        for stem in tdf_map["file_name"]:
            season = stem.split("/")[0]
            for part in range(1, 6):
                stem_fn = stem.replace(f"{season}/", f"512_{season}_{part}/")
                in_file = osp.join(dataset_path, stem_fn)
                if osp.exists(in_file):
                    pairs.append((in_file, label))
    # BUGFIX: zip(*[]) would raise an opaque unpack error; fail loudly instead.
    if not pairs:
        raise RuntimeError("no image files found on disk for any label")
    # Fresh names here: the original re-bound `paths`, shadowing the config
    # dict read at the top of the function.
    file_paths, file_labels = zip(*pairs)
    df = pd.DataFrame()
    df["path"], df["label"] = file_paths, file_labels
    df.to_csv("../tables/data_tmp.csv", index=False)
def filter_filelist():
    """Down-sample easy examples for hard-example mining.

    Keeps every image whose per-image loss (tmp/train_loss.csv) is above the
    80th-percentile loss, plus a random 10% sample of the easier ones, then
    writes the filtered file list to annotation/file_list_filter.csv.
    """
    paths = get_paths()
    dataset_path = paths["train_data.path"]
    df_meta = pd.read_csv(osp.join(dataset_path, "annotation/file_list.csv"))
    print(df_meta.head())
    df_loss = pd.read_csv("tmp/train_loss.csv")
    print(df_loss.shape)
    print(df_loss.head())
    # NOTE(review): .loc here assumes a default RangeIndex and rows sorted by
    # loss ascending — confirm tmp/train_loss.csv is written that way.
    thr_loss = df_loss.loc[int(0.8 * df_loss.shape[0]), "loss"]
    high = df_loss[df_loss["loss"] > thr_loss]
    # BUGFIX: the original used strict `<` here with `>` above, so rows with
    # loss exactly equal to the threshold were dropped from both partitions.
    low = df_loss[df_loss["loss"] <= thr_loss]
    low = low.sample(frac=0.1)  # unseeded: the kept subset differs per run
    df_loss = pd.concat([low, high])
    print(df_loss.shape)
    df_filter = df_meta[df_meta["file_name"].isin(df_loss["ids"])]
    print(df_filter.shape)
    df_filter.to_csv(osp.join(dataset_path, "annotation/file_list_filter.csv"), index=False)
def prepare_paths(df_path):
    """Build a 3-class file list and save it to *df_path*.

    Labels: 0 = everything else, 1 = sequences marked `empty`,
    2 = sequences carrying any label in RARE.  Each metadata row is
    resolved against the 512_<season>_<i> part directories (i in 1..4);
    only files actually present on disk are kept.
    """
    cfg = get_paths()
    dataset_path = cfg["data.path"]
    df_map = pd.read_csv(osp.join(dataset_path, "annotation/train_metadata.csv"))
    df_labels = pd.read_csv(osp.join(dataset_path, "annotation/train_labels.csv"))
    print(df_labels.head())
    resolved = []
    rows = zip(df_map["file_name"], df_map["seq_id"])
    for file_name, seq_id in tqdm(rows, total=df_map.shape[0]):
        season = file_name.split("/")[0]
        for part in range(1, 5):
            candidate = file_name.replace(f"{season}/", f"512_{season}_{part}/")
            candidate = osp.join(dataset_path, candidate)
            if osp.exists(candidate):
                resolved.append((candidate.replace(f"{dataset_path}/", ""), seq_id))
                break
    filenames, seqs = zip(*resolved)
    df = pd.DataFrame()
    df["file_name"] = filenames
    df["seq_id"] = seqs
    df["label"] = [0] * df.shape[0]
    empty_rows = df_labels[df_labels["empty"] == 1]
    df.loc[df["seq_id"].isin(empty_rows["seq_id"]), "label"] = 1
    for rare_label in RARE:
        rare_rows = df_labels[df_labels[rare_label] == 1]
        df.loc[df["seq_id"].isin(rare_rows["seq_id"]), "label"] = 2
    print(df.head(100))
    print(df.shape)
    print(np.sum(df["label"].values))
    df.to_csv(df_path, index=False)
def check_filelist():
    """Show the head and shape of file_list.csv rows whose `labels`
    field contains a space, i.e. more than one class id."""
    dataset_path = get_paths()["train_data.path"]
    file_list = pd.read_csv(osp.join(dataset_path, "annotation/file_list.csv"))
    multi_label = file_list[file_list["labels"].str.contains(" ")]
    print(multi_label.head())
    print(multi_label.shape)
def check_iter():
    """Smoke-test InferLoader: print the length and the seq_id, first-image
    shape and label of a single validation batch."""
    cfg = get_paths()
    loader = InferLoader(mode="val", path=cfg["train_data.path"], batch_size=1)
    print(len(loader))
    for sample in loader:
        print(sample["seq_id"])
        print(sample["images"][0].shape)
        print(sample["label"])
        break
def make_tst():
    """Carve a 1000-row mock test split (season S10) out of all_paths.csv and
    write a zero-filled submission_format.csv over its unique sequences."""
    root = get_paths()["train_data.path"]
    all_paths = pd.read_csv(osp.join(root, "annotation/all_paths.csv"))
    s10 = all_paths[all_paths["file_name"].str.contains("_S10_")]
    s10 = s10.head(1000)
    s10.to_csv(osp.join(root, "test_metadata.csv"), index=False)
    unique_seqs = s10.drop_duplicates("seq_id")
    submission = pd.DataFrame()
    submission["seq_id"] = unique_seqs["seq_id"]
    for label in LABELS:
        submission[label] = [0] * submission.shape[0]
    submission.to_csv(osp.join(root, "submission_format.csv"), index=False)
def main():
    """Visualize de-normalized validation batches in a 2x4 grid per batch.

    NOTE(review): a second `main()` defined later in this file shadows this
    one at import time.
    """
    cfg = get_paths()
    loader = HakunaPrefetchedLoader(mode="val", path=cfg["train_data.path"], long_side=256, batch_size=8)
    print(len(loader))
    for images, targets in loader:
        images = images.cpu().numpy()
        targets = targets.cpu().numpy()
        plt.figure()
        for i in range(images.shape[0]):
            plt.subplot(2, 4, i + 1)
            hwc = np.transpose(images[i], (1, 2, 0))  # CHW -> HWC for imshow
            shown = np.uint8(255 * (IMAGENET_STD * hwc + IMAGENET_MEAN))
            plt.title(np.argmax(targets[i]))
            plt.imshow(shown)
        plt.show()
def prepare_filelist():
    """Attach a space-separated string of label indices to every row of
    all_paths.csv and save the result as annotation/file_list.csv."""
    cfg = get_paths()
    root = cfg["train_data.path"]
    df_meta = pd.read_csv(osp.join(root, "annotation/all_paths.csv"))
    df_labels = pd.read_csv(osp.join(root, "annotation/train_labels.csv"))
    # keep only label rows whose sequence actually has resolved files
    df_labels = df_labels[df_labels["seq_id"].isin(df_meta["seq_id"])]
    seq2index = {seq: n for n, seq in enumerate(df_labels["seq_id"])}
    labels_arr = df_labels[LABELS].values
    index_arr = np.nonzero(labels_arr)
    print(index_arr[:10])
    print([seq2index.get(seq) for seq in df_meta["seq_id"][:10]])
    per_row = [np.nonzero(labels_arr[seq2index.get(seq), :]) for seq in df_meta["seq_id"]]
    df_meta["labels"] = [" ".join(list(hit[0].astype(str))) for hit in per_row]
    print(df_meta.head())
    df_meta.to_csv(osp.join(root, "annotation/file_list.csv"), index=False)
def make_val():
    """Write the S10-season rows of all_paths.csv to annotation/valid.csv,
    then re-read the file and inspect its per-sequence grouping."""
    root = get_paths()["train_data.path"]
    src_path = osp.join(root, "annotation/all_paths.csv")
    all_rows = pd.read_csv(src_path)
    valid = all_rows[all_rows["file_name"].str.contains("_S10_")]
    dst_path = osp.join(root, "annotation/valid.csv")
    valid.to_csv(dst_path, index=False)
    df_meta = pd.read_csv(dst_path)
    print(df_meta)
    test_metadata = df_meta.groupby("seq_id").first().reset_index()
    print(test_metadata.head())
    groups = df_meta.groupby("seq_id")
    print(len(groups))
    for sample_id, group in groups:
        print(sample_id)
        print(group)
def main():
    """Resolve S10 validation frames against the 512_S10_<i> part folders
    and dump the first 100k rows as annotation/valid_metadata_100k.csv."""
    cfg = get_paths()
    dataset_path = cfg["data.path"]
    val_season = "S10"
    # tmp/val.csv is the S10 slice of annotation/train_metadata.csv; to regenerate:
    #   df_map = pd.read_csv(osp.join(dataset_path, "annotation/train_metadata.csv"))
    #   df_map = df_map[df_map["file_name"].str.contains(val_season)]
    #   os.makedirs('tmp', exist_ok=True)
    #   df_map.to_csv('tmp/val.csv', index=False)
    df_map = pd.read_csv("tmp/val.csv")
    print(df_map.head())
    resolved = []
    rows = zip(df_map["file_name"], df_map["seq_id"])
    for file_name, seq_id in tqdm(rows, total=df_map.shape[0]):
        for part in range(1, 5):
            candidate = file_name.replace(f"{val_season}/", f"512_{val_season}_{part}/")
            candidate = osp.join(dataset_path, candidate)
            if osp.exists(candidate):
                # store relative to the dataset root, prefixed "../"
                resolved.append((candidate.replace(f"{dataset_path}/", "../"), seq_id))
                break
    filenames, seqs = zip(*resolved)
    df = pd.DataFrame()
    df["file_name"] = filenames
    df["seq_id"] = seqs
    print(df.head())
    print(df.shape)
    df = df.head(100000)
    df.to_csv(osp.join(dataset_path, "annotation/valid_metadata_100k.csv"), index=False)
def check_paths():
    """Resolve every train_metadata.csv frame to an existing
    512_<season>_<i> file (i in 1..4) and store the dataset-relative
    mapping as annotation/all_paths.csv."""
    cfg = get_paths()
    root = cfg["train_data.path"]
    df_map = pd.read_csv(osp.join(root, "annotation/train_metadata.csv"))
    print(df_map.head())
    found = []
    rows = zip(df_map["file_name"], df_map["seq_id"])
    for file_name, seq_id in tqdm(rows, total=df_map.shape[0]):
        season = file_name.split("/")[0]
        for part in range(1, 5):
            candidate = file_name.replace(f"{season}/", f"512_{season}_{part}/")
            candidate = osp.join(root, candidate)
            if osp.exists(candidate):
                found.append((candidate.replace(f"{root}/", ""), seq_id))
                break
    filenames, seqs = zip(*found)
    df = pd.DataFrame()
    df["file_name"] = filenames
    df["seq_id"] = seqs
    print(df.head())
    print(df.shape)
    df.to_csv(osp.join(root, "annotation/all_paths.csv"), index=False)
import os.path as osp import cv2 import matplotlib.pyplot as plt import pandas as pd from thunder_hammer.utils import get_paths path = get_paths() data_path = get_paths()["train_data.path"] labels = [ "aardvark", "aardwolf", "baboon", "bat", "batearedfox", "buffalo", "bushbuck", "caracal", "cattle", "cheetah", "civet", "dikdik", "duiker", "eland", "elephant", "empty", "gazellegrants", "gazellethomsons", "genet", "giraffe",
import matplotlib.pyplot as plt from nvidia.dali.plugin.pytorch import DALIClassificationIterator import torch # import types import numpy as np import nvidia.dali.ops as ops import pandas as pd from nvidia import dali # batch_size = 16 from thunder_hammer.utils import get_paths PATHS = get_paths() IMAGENET_MEAN = np.array([0.485, 0.456, 0.406]) IMAGENET_STD = np.array([0.229, 0.224, 0.225]) class ExternalInputIterator(object): def __init__(self, mode, data_path, batch_size): assert mode in ["train", "val", "test"], f"unknown mode {mode}" self.path = data_path self.batch_size = batch_size if mode == "train": ann_filename = "annotation/file_list_filter.csv" else: ann_filename = "annotation/file_list.csv"