def guess_truncate(data_dir, split, max_obj) -> int:
    """Return the smallest on-disk truncation size that still holds *max_obj* objects.

    Reading a large mmap could be slow, so we look for truncated object files
    named ``<object_file>.<N>`` and pick the minimum ``N`` with ``N >= max_obj``.

    Args:
        data_dir: directory holding the preprocessed feature files.
        split: dataset split name (e.g. train/valid/test).
        max_obj: number of objects needed per item.

    Returns:
        The minimum suitable truncation size, or 0 when no truncated file
        exists (caller presumably falls back to the full file).
    """
    prefix = object_file(data_dir, split)
    minimum = math.inf
    for file in glob.glob(prefix + "*"):
        # re.escape guards against regex metacharacters in the path (e.g. '.'),
        # and the raw string avoids the invalid-escape warning of "\.".
        match = re.search(re.escape(prefix) + r"\.(\d+)", file)
        if match:
            trunc = int(match.group(1))
            if trunc < max_obj:
                continue  # too small to serve max_obj objects per item
            minimum = min(minimum, trunc)
    if minimum == math.inf:
        minimum = 0
    logger.info(f"find minimum truncate of {data_dir}-{split}: {minimum}")
    return minimum
def __init__(self, data_dir, split="train", max_obj=20):
    """Load sentence bookkeeping arrays and memory-map object features for *split*.

    Args:
        data_dir: directory holding the preprocessed feature files.
        split: dataset split, one of train/valid/test.
        max_obj: number of objects returned per item when indexing.
    """
    self.data_dir = data_dir
    # per-dialogue sentence counts and the starting offset of each dialogue
    self.sent_num = np.load(sent_num_file(data_dir, split))
    self.offsets = np.load(offsets_file(data_dir, split))
    self.total_sent_num = self.offsets[-1] + self.sent_num[-1]
    self.dim = 2048  # todo add x,y,w,h
    self.max_obj = max_obj  # max-obj when getting item
    warmup_mmap_file(object_file(data_dir, split, 0))
    print(self.total_sent_num, self.MAX_OBJ, self.dim)
    # NOTE(review): self.MAX_OBJ is presumably a class-level constant giving the
    # on-disk object count per sentence, distinct from self.max_obj — confirm;
    # if it does not exist elsewhere this line raises AttributeError.
    self.objects = np.memmap(object_file(data_dir, split, 0),
                             dtype=np.float32,
                             mode='r',
                             shape=(self.total_sent_num, self.MAX_OBJ, self.dim))
    warmup_mmap_file(object_mask_file(data_dir, split, 0))
    # np.bool was removed in NumPy 1.24; plain `bool` is the supported spelling.
    self.objects_mask = np.memmap(object_mask_file(data_dir, split, 0),
                                  dtype=bool,
                                  mode='r',
                                  shape=(self.total_sent_num, self.MAX_OBJ))
def main():
    """Gather per-image Faster-RCNN feature .npy files into mmap arrays for one split."""
    parser = argparse.ArgumentParser(description='video-data pre-processing.')
    parser.add_argument('--origin-dir', required=True, help='origin data directory.')
    parser.add_argument('--output-dir', required=True, help='output directory.')
    parser.add_argument('--split', type=str, default="train",
                        help='split of dataset, train/valid/test')
    args = parser.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    image_files = iterate_img_dir(
        os.path.join(args.origin_dir, f"{args.split}_images"))
    total_num = len(image_files)
    print(
        f"Computing {RCNN_FEATURE_DIM}-dim features for a maximum of {MAX_OBJECTS} object in {total_num} pictures"
    )
    objects = np.memmap(object_file(args.output_dir, args.split),
                        dtype=np.float32,
                        mode='w+',
                        shape=(total_num, MAX_OBJECTS, RCNN_FEATURE_DIM))
    # np.bool was removed in NumPy 1.24; plain `bool` is the supported spelling.
    objects_mask = np.memmap(object_mask_file(args.output_dir, args.split),
                             dtype=bool,
                             mode='w+',
                             shape=(total_num, MAX_OBJECTS))

    success = [False] * total_num
    # Keep sweeping until every image's .npy feature file has been consumed;
    # NOTE(review): this loops forever if some .npy never appears — presumably
    # an upstream extraction job is expected to produce them concurrently.
    while not all(success):
        for img_idx, img_file in tqdm(enumerate(
                iterate_img_dir(
                    os.path.join(args.origin_dir, f"{args.split}_images"))),
                                      desc="Gathering Faster-RCNN feature"):
            if success[img_idx]:
                continue
            try:
                npy_file = img_file + ".npy"
                # [()] unwraps the 0-d object array that np.save produces for a dict
                rcnn_features = np.load(npy_file, allow_pickle=True)[()]
                objects_features = rcnn_features["features"]
                num_object = objects_features.shape[0]
                objects[img_idx][:num_object] = objects_features
                objects_mask[img_idx][:num_object] = True
                objects_mask[img_idx][num_object:] = False
                success[img_idx] = True
            except Exception as e:
                # best-effort: report and retry this image on the next sweep
                print(e)
                continue
@file: truncate_data.py @time: 2020/12/25 11:04 @desc: Truncate mmap file to smaller ones for training acceleration. """ import numpy as np from video_dialogue_model.data.utils import object_file, object_mask_file from tqdm import tqdm MAX_OBJECTS, RCNN_FEATURE_DIM = 100, 2048 TRUNCATE_OBJECTS = 20 output_dir = "./preprocessed_data" for split, total_num in zip(["train", "valid", "test"], [865694, 109109, 111346]): objects = np.memmap(object_file(output_dir, split), dtype=np.float32, mode='r', shape=(total_num, MAX_OBJECTS, RCNN_FEATURE_DIM)) objects_mask = np.memmap(object_mask_file(output_dir, split), dtype=np.bool, mode='r', shape=(total_num, MAX_OBJECTS)) truncate_objects = np.memmap( object_file(output_dir, split) + f".{TRUNCATE_OBJECTS}", dtype=np.float32, mode='w+', shape=(total_num, TRUNCATE_OBJECTS, RCNN_FEATURE_DIM)) truncate_objects_mask = np.memmap(object_mask_file(output_dir, split) + f".{TRUNCATE_OBJECTS}",