def get_loaders(transform):
    """Assemble an ordered train/valid loader pair over the module-level datasets.

    Args:
        transform: dict-transform applied to every sample of both splits.

    Returns:
        ``collections.OrderedDict`` with "train" and "valid" DataLoaders.
    """
    # Catalyst's get_loader consumes dict samples; adapt (features, targets) tuples.
    def open_fn(sample):
        return {"features": sample[0], "targets": sample[1]}

    loaders = collections.OrderedDict()
    # Training split is reshuffled every epoch; validation order stays fixed.
    loaders["train"] = utils.get_loader(
        train_data,
        open_fn=open_fn,
        dict_transform=transform,
        batch_size=bs,
        num_workers=num_workers,
        shuffle=True,
    )
    loaders["valid"] = utils.get_loader(
        valid_data,
        open_fn=open_fn,
        dict_transform=transform,
        batch_size=bs,
        num_workers=num_workers,
        shuffle=False,
    )
    return loaders
def get_loaders(open_fn: Callable, train_transforms_fn, valid_transforms_fn, batch_size: int = 8, num_workers: int = 20, sampler=None) -> OrderedDict:
    """Create train/valid DataLoaders over the module-level datasets.

    Args:
        open_fn: callable turning a raw sample into a dict of tensors.
        train_transforms_fn: per-sample transform for the training split.
        valid_transforms_fn: per-sample transform for the validation split.
        batch_size: samples per batch for both splits.
        num_workers: DataLoader worker processes for both splits.
        sampler: optional sampler for the training split; when given,
            shuffling is disabled (PyTorch forbids shuffle + sampler).

    Returns:
        OrderedDict with "train" and "valid" loaders, in that order.
    """
    # Settings shared by both loaders.
    shared = dict(open_fn=open_fn, batch_size=batch_size, num_workers=num_workers)
    train_loader = utils.get_loader(
        train_data,
        dict_transform=train_transforms_fn,
        # shuffle data only if Sampler is not specified (PyTorch requirement)
        shuffle=sampler is None,
        sampler=sampler,
        drop_last=True,
        **shared,
    )
    valid_loader = utils.get_loader(
        valid_data,
        dict_transform=valid_transforms_fn,
        shuffle=False,
        sampler=None,
        drop_last=True,
        **shared,
    )
    return OrderedDict([("train", train_loader), ("valid", valid_loader)])
def get_loaders(train_transforms_fn, valid_transforms_fn, config, batch_size: int = 8, num_workers: int = 20, sampler=None) -> OrderedDict:
    """Build train/valid loaders for the disease-classification datasets in *config*.

    Args:
        train_transforms_fn: per-sample transform for the training split.
        valid_transforms_fn: per-sample transform for the validation split.
        config: object providing ``root_images``, ``num_classes`` and whatever
            ``get_datasets`` needs to materialize the splits.
        batch_size: samples per batch for both splits.
        num_workers: DataLoader worker processes for both splits.
        sampler: optional sampler for the training split; disables shuffling
            when given (PyTorch forbids shuffle + sampler).

    Returns:
        OrderedDict with "train" and "valid" DataLoaders.
    """
    train_data, valid_data = get_datasets(config)

    # Readers turn one dataset row into the dict of model inputs/targets:
    # image features plus integer and one-hot disease labels.
    readers = [
        ImageReader(
            input_key="filepath",
            output_key="features",
            rootpath=config.root_images,
        ),
        ScalarReader(
            input_key="disease_type",
            output_key="targets",
            default_value=-1,
            dtype=np.int64,
        ),
        ScalarReader(
            input_key="disease_type",
            output_key="targets_one_hot",
            default_value=-1,
            dtype=np.int64,
            one_hot_classes=config.num_classes,
        ),
    ]
    open_fn = ReaderCompose(readers)

    train_loader = utils.get_loader(
        train_data,
        open_fn=open_fn,
        dict_transform=train_transforms_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        # shuffle data only if Sampler is not specified (PyTorch requirement)
        shuffle=sampler is None,
        sampler=sampler,
        drop_last=True,
    )
    valid_loader = utils.get_loader(
        valid_data,
        open_fn=open_fn,
        dict_transform=valid_transforms_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        sampler=None,
        drop_last=True,
    )
    return OrderedDict([("train", train_loader), ("valid", valid_loader)])
def validate(model, dataset, val_dataset, config):
    """Run inference on the validation split and report SNR/SDR metrics.

    Args:
        model: trained network taking audio + video inputs.
        dataset: training dataset — kept for signature compatibility; unused
            (only the validation loader is ever registered with the runner).
        val_dataset: dataset to evaluate.
        config: object providing ``batch_size`` and ``workers``.
    """
    # Fix: the original also built a train loader here but never added it to
    # ``loaders``, so constructing it was pure dead work — dropped.
    loaders = collections.OrderedDict()
    loaders["valid"] = utils.get_loader(
        val_dataset,
        # Samples are tuples: targets first, video second, audio last.
        open_fn=lambda x: {"input_audio": x[-1], "input_video": x[1], "targets": x[0]},
        batch_size=config.batch_size,
        num_workers=config.workers,
        shuffle=False,
    )
    # input_key entries match the parameter names of model.forward(...).
    runner = SupervisedRunner(input_key=["input_audio", "input_video"])
    runner.infer(
        model,
        loaders,
        callbacks=collections.OrderedDict(
            {"snr_callback": SNRCallback(), "sdr_callback": SDRCallback()}
        ),
        verbose=True,
    )
def main(args, _=None):
    """Extract image embeddings with a ResNet encoder and save them as ``.npy``.

    Reads image records from ``args.in_csv``, runs them through a pooled
    ResNet encoder, and writes the stacked feature matrix to ``args.out_npy``.
    """
    global IMG_SIZE
    IMG_SIZE = (args.img_size, args.img_size)

    encoder = ResnetEncoder(arch=args.arch, pooling=args.pooling)
    encoder = encoder.eval()
    # process_components places the model on the proper device.
    encoder, _, _, _, device = utils.process_components(model=encoder)

    frame = pd.read_csv(args.in_csv)
    frame = frame.reset_index().drop("index", axis=1)
    records = list(frame.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col, output_key="image", datapath=args.datapath
    )
    loader = utils.get_loader(
        records,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )
    if args.verbose:
        loader = tqdm(loader)

    chunks = []
    with torch.no_grad():
        for batch in loader:
            out = encoder(batch["image"].to(device))
            chunks.append(out.cpu().detach().numpy())
    np.save(args.out_npy, np.concatenate(chunks, axis=0))
def get_loader(phase, dataset, open_fn, batch_size, num_workers, img_size):
    """Create a DataLoader for one phase of training.

    Args:
        phase: either 'train' or 'valid'; controls transforms, shuffling
            and incomplete-batch dropping.
        dataset: the data to wrap.
        open_fn: callable turning one raw sample into a dict.
        batch_size: samples per batch.
        num_workers: DataLoader worker processes.
        img_size: target image size passed to ``get_transform``.

    Returns:
        A Catalyst DataLoader for the requested phase.

    Raises:
        ValueError: if ``phase`` is not 'train' or 'valid'.
    """
    # Fix: was an ``assert``, which is silently stripped under ``python -O``;
    # input validation must raise explicitly.
    if phase not in {'train', 'valid'}:
        raise ValueError(f'invalid phase: {phase}')
    transforms_fn = get_transform(phase=phase, img_size=img_size)
    is_train = phase == 'train'
    return dutil.get_loader(
        dataset,
        open_fn=open_fn,
        dict_transform=transforms_fn,
        shuffle=is_train,     # only the training split is reshuffled
        drop_last=is_train,   # keep every sample when validating
        batch_size=batch_size,
        num_workers=num_workers,
        sampler=None,
    )
def get_loaders(train_data: 'pd.DataFrame', valid_data: 'pd.DataFrame', open_fn: 'Callable', train_transforms_fn, valid_transforms_fn, batch_size: int = 64, num_workers: int = 4, sampler=None) -> collections.OrderedDict:
    """Wrap the train/valid dataframes into an ordered pair of DataLoaders.

    Args:
        train_data: rows of the training split.
        valid_data: rows of the validation split.
        open_fn: callable turning one row into a dict of tensors.
        train_transforms_fn: per-sample transform for training.
        valid_transforms_fn: per-sample transform for validation.
        batch_size: samples per batch for both splits.
        num_workers: DataLoader worker processes for both splits.
        sampler: optional training sampler; when present, shuffling is
            disabled (PyTorch forbids shuffle + sampler).

    Returns:
        ``collections.OrderedDict`` with "train" then "valid" loaders.
    """
    # Options common to both splits; both drop the last incomplete batch.
    shared = dict(
        open_fn=open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=True,
    )
    loaders = collections.OrderedDict()
    loaders["train"] = get_loader(
        train_data,
        dict_transform=train_transforms_fn,
        # shuffle data only if Sampler is not specified (PyTorch requirement)
        shuffle=sampler is None,
        sampler=sampler,
        **shared,
    )
    loaders["valid"] = get_loader(
        valid_data,
        dict_transform=valid_transforms_fn,
        shuffle=False,
        sampler=None,
        **shared,
    )
    return loaders
def main(args, _=None):
    """Run the ``catalyst-data image2embeddings`` script.

    Loads either a traced TorchScript model or a fresh ResNet encoder,
    embeds every image listed in ``args.in_csv`` and saves the stacked
    features to ``args.out_npy``.
    """
    global IMG_SIZE

    # Reproducibility: fix RNG seeds and cuDNN determinism flags first.
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)
    IMG_SIZE = (args.img_size, args.img_size)  # noqa: WPS442

    # Either restore a pre-traced TorchScript model or build a fresh encoder.
    if args.traced_model is not None:
        device = utils.get_device()
        model = torch.jit.load(str(args.traced_model), map_location=device)
    else:
        model = ResnetEncoder(arch=args.arch, pooling=args.pooling)
        model = model.eval()
        model, _, _, _, device = utils.process_components(model=model)

    rows = pd.read_csv(args.in_csv)
    rows = rows.reset_index().drop("index", axis=1)
    samples = list(rows.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col, output_key="image", rootpath=args.rootpath
    )
    loader = utils.get_loader(
        samples,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )
    if args.verbose:
        loader = tqdm(loader)

    embeddings = []
    with torch.no_grad():
        for batch in loader:
            out = model(batch["image"].to(device))
            embeddings.append(out.cpu().detach().numpy())
    np.save(args.out_npy, np.concatenate(embeddings, axis=0))
def main(args, _=None):
    """Run the ``catalyst-data text2embeddings`` script."""
    # Frequently used CLI options, hoisted into locals.
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")
    # Reproducibility: fix RNG seeds and cuDNN determinism flags.
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)
    if hasattr(args, "in_huggingface"):
        # Pull config, weights and tokenizer from a HuggingFace hub name.
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(args.in_huggingface, config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        # Build the model from a local config and vocab file instead.
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if hasattr(args, "in_model"):
        # Optionally restore weights from a Catalyst checkpoint.
        checkpoint = utils.load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        utils.unpack_checkpoint(checkpoint=checkpoint, model=model)
    model = model.eval()
    # process_components places the model on the proper device
    # (and may DDP-wrap it — see the check_ddp_wrapped branch below).
    model, _, _, _, device = utils.process_components(model=model)
    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    # Persist the filtered dataframe so its rows align with the saved features.
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)
    # Reader that tokenizes the text column of each row into model inputs.
    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )
    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )
    # name -> on-disk memmap array, created lazily on the first batch.
    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            bert_output = model(**batch)
            # Padding mask for pooling; None disables masking.
            mask = (batch["attention_mask"].unsqueeze(-1) if args.mask_for_max_length else None)
            if utils.check_ddp_wrapped(model):
                # using several gpu
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states
            else:
                # using cpu or one gpu
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states
            features_ = process_bert_output(
                bert_output=bert_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )
            # create storage based on network output
            if idx == 0:
                for key, value in features_.items():
                    # Keys may be strings or layer indices; normalize to a name.
                    name_ = key if isinstance(key, str) else f"{key:02d}"
                    _, embedding_size = value.shape
                    # memmap keeps the (num_samples, embedding_size) matrix on disk.
                    features[name_] = np.memmap(
                        f"{args.out_prefix}.{name_}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )
            # Rows of the output arrays covered by this batch
            # (the final batch may be short, hence the min()).
            indices = np.arange(idx * batch_size, min((idx + 1) * batch_size, num_samples))
            for key, value in features_.items():
                name_ = key if isinstance(key, str) else f"{key:02d}"
                features[name_][indices] = _detach(value)
def main(args, _=None):
    """Extract BERT text embeddings and store them as on-disk memmap arrays.

    Builds a BERT model from a local config, restores weights from a Catalyst
    checkpoint, tokenizes the ``args.txt_col`` column of ``args.in_csv`` and
    writes one ``.npy`` memmap per extracted representation under
    ``args.out_prefix`` (the pooled "class" output, plus either every hidden
    layer or only the last one, pooled by ``LamaPooling``).
    """
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")
    # Reproducibility: fix RNG seeds and cuDNN determinism flags.
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)
    # Restore trained weights from a Catalyst checkpoint.
    checkpoint = utils.load_checkpoint(args.in_model)
    checkpoint = {"model_state_dict": checkpoint}
    utils.unpack_checkpoint(checkpoint=checkpoint, model=model)
    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)
    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    # Persist the filtered dataframe so its rows align with the saved features.
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=get_features,
        tokenizer=tokenizer,
        max_length=max_length,
    )
    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )
    features = {}
    poolings = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            features_ = model(**batch)
            # create storage based on network output
            if idx == 0:
                # class: pooled output at index 1 of the model's return tuple
                _, embedding_size = features_[1].shape
                features["class"] = np.memmap(
                    f"{args.out_prefix}.class.npy",
                    dtype=np.float32,
                    mode="w+",
                    shape=(num_samples, embedding_size),
                )
                if args.output_hidden_states:
                    # all embeddings: one pooled memmap per hidden layer
                    for i, feature_ in enumerate(features_[2]):
                        name_ = f"embeddings_{i + 1:02d}"
                        _, _, embedding_size = feature_.shape
                        poolings[name_] = LamaPooling(
                            features_in=embedding_size,
                            groups=pooling_groups,
                        )
                        features[name_] = np.memmap(
                            f"{args.out_prefix}.{name_}.npy",
                            dtype=np.float32,
                            mode="w+",
                            shape=(num_samples, embedding_size),
                        )
                else:
                    # last: only the final hidden state at index 0
                    _, _, embedding_size = features_[0].shape
                    poolings["last"] = LamaPooling(
                        features_in=embedding_size,
                        groups=pooling_groups,
                    )
                    features["last"] = np.memmap(
                        f"{args.out_prefix}.last.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )
            # Rows covered by this batch (final batch may be short).
            indices = np.arange(idx * batch_size, min((idx + 1) * batch_size, num_samples))
            features["class"][indices] = _detach(features_[1])
            if args.output_hidden_states:
                # all embeddings
                for i, feature_ in enumerate(features_[2]):
                    name_ = f"embeddings_{i + 1:02d}"
                    feature_ = poolings[name_](feature_)
                    features[name_][indices] = _detach(feature_)
            else:
                # Fix: the original indexed ``poolings[name_]`` here, but
                # ``name_`` is never bound on this branch, raising NameError
                # on the first batch; the intended key is "last".
                feature_ = poolings["last"](features_[0])
                features["last"][indices] = _detach(feature_)
augment_fn=transforms.Normalize( (0.5,), (0.5,))), Augmentor( dict_key="targets", augment_fn=lambda x: \ torch.from_numpy(x.copy().astype(np.float32) / 255.).unsqueeze_(0)) ]) open_fn = lambda x: {"features": x[0], "targets": x[1]} loaders = collections.OrderedDict() train_loader = utils.get_loader(train_data, open_fn=open_fn, dict_transform=data_transform, batch_size=bs, num_workers=num_workers, shuffle=True) valid_loader = utils.get_loader(valid_data, open_fn=open_fn, dict_transform=data_transform, batch_size=bs, num_workers=num_workers, shuffle=False) loaders["train"] = train_loader loaders["valid"] = valid_loader # # Model
def train(
        model: torch.nn.Module,
        dataset: torch.utils.data.Dataset,
        optimizer: torch.optim.Optimizer,
        criterion: torch.nn.Module,
        config: ParamConfig,
        val_dataset: torch.utils.data.Dataset = None,
        logdir: str = "./logdir",
        resume: Union[str, None] = "logdir/checkpoints/best_full.pth") -> None:
    """
    Train the model with the specified parameters.

    Args:
        model: neural network model
        dataset: training dataset
        optimizer: optimizer
        criterion: loss function
        config: hyper-parameters (batch_size, workers, learning_rate, epochs)
        val_dataset: validation dataset
        logdir: logdir location to save checkpoints
        resume: path where the partially trained model is stored
    """
    # Both loaders present samples as dicts keyed by the model's forward args:
    # audio is the last tuple element, video the second, targets the first.
    def open_fn(x):
        return {"input_audio": x[-1], "input_video": x[1], "targets": x[0]}

    train_loader = utils.get_loader(dataset,
                                    open_fn=open_fn,
                                    batch_size=config.batch_size,
                                    num_workers=config.workers,
                                    shuffle=True)
    # Fix: the validation loader used shuffle=True; evaluation must iterate in
    # a deterministic order (matching validate(), which uses shuffle=False).
    val_loader = utils.get_loader(val_dataset,
                                  open_fn=open_fn,
                                  batch_size=config.batch_size,
                                  num_workers=config.workers,
                                  shuffle=False)
    # Build the loaders dict once (the original created a throwaway
    # OrderedDict and then overwrote it with a plain dict).
    loaders = collections.OrderedDict(
        [("train", train_loader), ("valid", val_loader)])

    # Triangular cyclic LR: one up-phase spans 4 epochs' worth of batches.
    scheduler = torch.optim.lr_scheduler.CyclicLR(
        optimizer,
        base_lr=config.learning_rate,
        max_lr=config.learning_rate * 10,
        step_size_up=4 * len(train_loader),
        mode="triangular",
        cycle_momentum=False)

    # input_key entries match the parameter names of model.forward(...).
    runner = SupervisedRunner(input_key=["input_audio", "input_video"])
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=logdir,
                 verbose=True,
                 num_epochs=config.epochs,
                 resume=resume,
                 callbacks=collections.OrderedDict({
                     "snr_callback": SNRCallback(),
                     "sched_callback": SchedulerCallback(mode="batch")
                 }))
def create_dataloders(
    train_file: str,
    valid_file: str,
    root_folder: str,
    meta_info_file: str,
    num_classes: int,
    one_hot_encoding: bool,
    bs: int,
    num_workers: int,
    augmenters: Dict = None,
):
    """Create train/valid audio DataLoaders.

    Args:
        train_file: file listing the training split.
        valid_file: file listing the validation split.
        root_folder: root directory the file lists are resolved against.
        meta_info_file: kept for interface compatibility; unused here.
        num_classes: number of classes for the optional one-hot targets.
        one_hot_encoding: whether to also emit one-hot encoded targets.
        bs: batch size for both loaders.
        num_workers: worker count for the training loader.
        augmenters: dict with 'train' and 'valid' waveform-augmentation
            callables (audiomentations-style: ``fn(samples=..., sample_rate=...)``).

    Returns:
        OrderedDict with "train" and "valid" DataLoaders.

    Raises:
        ValueError: if ``augmenters`` is missing or lacks 'train'/'valid' keys.
    """
    # Fix: the declared default (None) was subscripted unconditionally and
    # crashed with an opaque ``TypeError: 'NoneType' object is not
    # subscriptable``; fail fast with a clear message instead.
    if not augmenters or 'train' not in augmenters or 'valid' not in augmenters:
        raise ValueError(
            "augmenters must be a dict with 'train' and 'valid' callables"
        )

    train_data = _prepare(train_file, root_folder)
    valid_data = _prepare(valid_file, root_folder)
    train_augmenter = augmenters['train']
    valid_augmenter = augmenters['valid']

    # Waveform-level augmentation; the sample rate is fixed at 16 kHz here.
    train_transforms_fn = transforms.Compose([
        Augmentor(
            dict_key="features",
            augment_fn=lambda x: train_augmenter(samples=x, sample_rate=16000))
    ])
    # Similarly for the validation part of the dataset.
    # we only perform squaring, normalization and ToTensor
    valid_transforms_fn = transforms.Compose([
        Augmentor(
            dict_key="features",
            augment_fn=lambda x: valid_augmenter(samples=x, sample_rate=16000))
    ])

    # Readers that turn one dataset row into the dict of model inputs/targets.
    compose = [
        AudioReader(
            input_key="filepath",
            output_key="features",
        ),
        ScalarReader(input_key="label",
                     output_key="targets",
                     default_value=-1,
                     dtype=np.int64),
    ]
    if one_hot_encoding:
        compose.append(
            ScalarReader(
                input_key="label",
                output_key="targets_one_hot",
                default_value=-1,
                dtype=np.int64,
                one_hot_classes=num_classes,
            ))
    open_fn = ReaderCompose(compose)

    train_loader = catalyst_utils.get_loader(
        train_data,
        open_fn=open_fn,
        dict_transform=train_transforms_fn,
        batch_size=bs,
        num_workers=num_workers,
        shuffle=True,  # reshuffle the training split every epoch
    )
    valid_loader = catalyst_utils.get_loader(
        valid_data,
        open_fn=open_fn,
        dict_transform=valid_transforms_fn,
        batch_size=bs,
        # NOTE(review): a single worker here looks deliberate (cheap
        # validation pass) but silently ignores ``num_workers`` — confirm.
        num_workers=1,
        shuffle=False,
    )
    loaders = OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader
    return loaders