def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Compute test-set embeddings for every checkpoint listed in an experiment's
    inference config, and return the embeddings of the first checkpoint.

    NOTE(review): this definition is shadowed by the identically-named function
    defined later in this module; at import time only the later one is callable.

    Args:
        exp: Experiment name. The models to use are described in the
            ``inference_config`` section of ``expxxx.yaml``.
        test_df: ``test.csv`` loaded via ``pd.read_csv``.
        num_workers: Forwarded to the test dataloader; ``None`` keeps the
            config default.
        batch_size: Forwarded to the test dataloader; ``None`` keeps the
            config default.
        image_dir: If given, overrides ``image_path`` in the test frame as
            ``{image_dir}/{image}``.

    Returns:
        ``(features, img_features, txt_features)`` for the FIRST checkpoint
        only — each an ndarray of shape ``(len(test_df), linear_out)`` built
        by concatenating per-batch outputs.
        (The original annotation claimed a 4-tuple containing a ``str``;
        the function has always returned these three arrays.)
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)

    # Load data, substitute the provided test frame, then run the standard
    # preprocessing pipeline (paths, label encoding, fold assignment).
    data = DataFactory.load_data(config)
    data.test = test_df
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)
    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    # Accumulate per-checkpoint lists of per-batch feature arrays.
    features = []
    img_features = []
    txt_features = []
    for epoch_config in config.inference_config.epoch_configs:
        # Each epoch config may use a different dataloader experiment.
        _config = ConfigFactory.get_config_from_yaml_file(
            epoch_config.dataloader_exp, args.env, False)
        test_dataloader = DataLoaderFactory.get_test_dataloader(
            data, _config, num_workers=num_workers, batch_size=batch_size)
        _features, _img_features, _txt_features = InferenceFactory.epoch(
            args.env, epoch_config, test_dataloader, data)
        features += _features
        img_features += _img_features
        txt_features += _txt_features
        # Drop batch-level references promptly to keep peak memory down.
        del _features
        del _img_features
        del _txt_features
        gc.collect()

    # Concatenate per-batch arrays into one (n_rows, dim) array per checkpoint.
    for i in range(len(features)):
        features[i] = np.concatenate(features[i])
        img_features[i] = np.concatenate(img_features[i])
        txt_features[i] = np.concatenate(txt_features[i])
        print(f"features[{i}].shape:", features[i].shape)
        print(f"img_features[{i}].shape:", img_features[i].shape)
        print(f"txt_features[{i}].shape:", txt_features[i].shape)

    # Sanity check: one embedding set per configured checkpoint.
    exps: List[str] = []
    for epoch_config in config.inference_config.epoch_configs:
        for model_checkpoint in epoch_config.model_checkpoints:
            _exp = model_checkpoint.split("_")[0]
            exps.append(_exp)
    TestUtil.assert_any(len(exps), len(features))

    return features[0], img_features[0], txt_features[0]
def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
    device: str = "cuda",
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Compute test-set embeddings with the first checkpoint of an experiment.

    Loads the first model checkpoint of the experiment's inference config,
    runs one forward pass over the test dataloader, and returns float16
    embedding matrices.

    Args:
        exp: Experiment name. The model to use is described in the
            ``inference_config`` section of ``expxxx.yaml``.
        test_df: ``test.csv`` loaded via ``pd.read_csv`` (copied, not mutated).
        num_workers: Forwarded to the test dataloader; ``None`` keeps the
            config default.
        batch_size: Forwarded to the test dataloader; ``None`` keeps the
            config default.
        image_dir: If given, overrides ``image_path`` in the test frame as
            ``{image_dir}/{image}``.
        device: Torch device to run inference on. Defaults to ``"cuda"``
            (previously hard-coded).

    Returns:
        ``(all_embeddings, img_embeddings, text_embeddings)`` — float16
        ndarrays with one row per test sample.
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env,
                                                     verbose=False)

    # Load data, substitute a copy of the provided test frame, then run the
    # standard preprocessing pipeline (paths, label encoding, folds).
    data = DataFactory.load_data(config)
    data.test = test_df.copy()
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)
    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    # Only the first checkpoint of the first epoch config is used here.
    model_checkpoint = config.inference_config.epoch_configs[
        0].model_checkpoints[0]
    if args.env == EnvEnum.KAGGLE:
        # Kaggle datasets cannot contain "=" in file names, so the uploaded
        # checkpoint files have it stripped.
        model_checkpoint = model_checkpoint.replace("=", "")
    print("load model:", model_checkpoint)
    model = lit_models.ShopeeLitModel.load_from_checkpoint(
        str(config.dir_config.checkpoint_dir / model_checkpoint),
        data=data,
        config=config,
        fold=-1,
        with_mlflow=False,
        bert_path=str(config.dir_config.dataset_dir /
                      config.model_config.bert_model_arch),
        is_test=True,
    ).model.to(device)
    model.eval()

    test_dataloader = DataLoaderFactory.get_test_dataloader(
        data, config, num_workers=num_workers, batch_size=batch_size)

    # Collect per-row embeddings; float16 halves memory for downstream kNN.
    img_features = []
    text_features = []
    all_features = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            img = batch["img"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            all_feature, img_feature, text_feature = model(
                img, input_ids, attention_mask)
            all_features.extend(all_feature.detach().cpu().numpy().astype(
                np.float16))
            img_features.extend(img_feature.detach().cpu().numpy().astype(
                np.float16))
            text_features.extend(text_feature.detach().cpu().numpy().astype(
                np.float16))

    img_features = np.array(img_features, dtype=np.float16)
    text_features = np.array(text_features, dtype=np.float16)
    all_features = np.array(all_features, dtype=np.float16)

    # Free the model and dataloader before returning to reclaim GPU/host memory.
    del data
    del model
    del test_dataloader.dataset
    del test_dataloader
    gc.collect()
    torch.cuda.empty_cache()

    return all_features, img_features, text_features