def prepare_bert(env: EnvEnum, exp: str):
    """Cache the BERT backbone configured for *exp* under the dataset dir.

    Does nothing when the snapshot directory already exists; otherwise
    downloads the model + tokenizer, saves them, and smoke-tests that the
    saved snapshot loads again.
    """
    cfg = ConfigFactory.get_config_from_yaml_file(exp, env, False)
    arch = cfg.model_config.bert_model_arch
    target_dir = cfg.dir_config.dataset_dir / arch
    if target_dir.exists():
        return
    print(target_dir)
    model = AutoModel.from_pretrained(arch)
    tokenizer = AutoTokenizer.from_pretrained(arch)
    for pretrained in (model, tokenizer):
        pretrained.save_pretrained(target_dir)
    # Smoke-test: the snapshot we just wrote must be loadable.
    AutoModel.from_pretrained(target_dir)
    AutoTokenizer.from_pretrained(target_dir)
def load_model(checkpoint: str, env: EnvEnum, data: Data, fold: int):
    """Load the best cross-validation checkpoint for the given fold.

    The experiment id is the prefix of *checkpoint* before the first "_";
    the concrete checkpoint file is resolved via FileUtil.
    """
    exp_name = checkpoint.split("_")[0]
    best_ckpt = FileUtil.get_best_cv_checkpoint(env, exp_name, fold)
    print("load model:", best_ckpt)
    cfg = ConfigFactory.get_config_from_yaml_file(exp_name, env, False)
    # Weights come from the checkpoint, so skip pretrained init / normalize.
    cfg.model_config.pretrained = False
    cfg.model_config.normalize = False
    ckpt_path = os.path.join(str(cfg.dir_config.checkpoint_dir), best_ckpt)
    lit_model: ShopeeLitModel = ShopeeLitModel.load_from_checkpoint(
        ckpt_path,
        data=data,
        config=cfg,
        fold=fold,
        with_mlflow=False,
    )
    return lit_model.model
def load_model(model_checkpoint: str):
    """Load a trained model from *model_checkpoint*.

    Reads the script-level globals ``args``, ``config`` and ``data``.
    On Kaggle, "=" is stripped from the file name first — presumably
    because "=" is not kept in Kaggle dataset file names (TODO confirm).
    """
    if args.env == EnvEnum.KAGGLE:
        model_checkpoint = model_checkpoint.replace("=", "")
    ckpt_exp = model_checkpoint.split("_")[0]
    ckpt_config = ConfigFactory.get_config_from_yaml_file(ckpt_exp, args.env, False)
    # Weights come from the checkpoint; don't download pretrained ones.
    ckpt_config.model_config.pretrained = False
    # NOTE(review): paths deliberately use the script-level `config`, while the
    # model itself is built from the checkpoint's own config.
    ckpt_path = str(config.dir_config.checkpoint_dir / model_checkpoint)
    dataset_path = str(config.dir_config.input_dir / "kaggle-shopee-dataset")
    lit_model: lit_models.ShopeeLitModel = (
        lit_models.ShopeeLitModel.load_from_checkpoint(
            ckpt_path,
            data=data,
            config=ckpt_config,
            fold=-1,
            with_mlflow=False,
            bert_path=dataset_path,
        ))
    return lit_model.model
def get_best_cv_checkpoint(env: EnvEnum, exp: str, fold: int) -> Optional[str]:
    """Return the checkpoint file name with the highest epoch for (exp, fold).

    Candidates are files matching ``{exp}_{fold}_*`` in the configured
    checkpoint directory.  Prints the glob pattern and returns None when
    nothing matches.

    On Kaggle the "=" of "epoch=NN" has been stripped from file names,
    hence the two regex variants.
    """
    config = ConfigFactory.get_config_from_yaml_file(exp, env, False)
    checkpoint_dir = Path(config.dir_config.checkpoint_dir)
    # glob returns direct children, so .name is the dir-prefix-free file name.
    checkpoints = [path.name for path in checkpoint_dir.glob(f"{exp}_{fold}_*")]
    if len(checkpoints) == 0:
        print(f"{exp}_{fold}_*")
        return None
    epoch_re = (r"epoch(\d{1,2})"
                if env == EnvEnum.KAGGLE else r"epoch=(\d{1,2})")
    # BUG FIX: cast epochs to int before argmax.  The original compared the
    # raw regex matches as strings, so e.g. "9" ranked above "10".
    best_epoch_idx = np.argmax(
        [int(re.findall(epoch_re, checkpoint)[0]) for checkpoint in checkpoints])
    return checkpoints[best_epoch_idx]
def load_model(model_checkpoint: str):
    """Load a trained model plus its model-type tag from *model_checkpoint*.

    Reads the script-level globals ``args``, ``config`` and ``data``.
    Returns ``(model, model_type)``; the wrapping LightningModule is dropped
    and garbage-collected to free memory.
    """
    if args.env == EnvEnum.KAGGLE:
        # Kaggle copies of checkpoints have "=" stripped from the name.
        model_checkpoint = model_checkpoint.replace("=", "")
    ckpt_exp = model_checkpoint.split("_")[0]
    ckpt_config = ConfigFactory.get_config_from_yaml_file(ckpt_exp, args.env, False)
    ckpt_config.model_config.pretrained = False  # weights come from the checkpoint
    ckpt_path = str(config.dir_config.checkpoint_dir / model_checkpoint)
    print("checkpoint:", ckpt_path)
    lit_model: lit_models.ShopeeLitModel = (
        lit_models.ShopeeLitModel.load_from_checkpoint(
            ckpt_path,
            data=data,
            config=ckpt_config,
            fold=-1,
            with_mlflow=False,
            bert_path=str(config.dir_config.input_dir / "kaggle-shopee-dataset"),
        ))
    model_type = StringUtil.get_model_type(ckpt_config.model_config.model_name)
    model = lit_model.model
    # Drop the Lightning wrapper before returning to release its references.
    del lit_model
    gc.collect()
    return model, model_type
def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Compute combined / image / text embeddings for *test_df*.

    input
        exp: experiment name; the models to use are listed in the
            inference_config section of expxxx.yaml
        test_df: test.csv loaded with pd.read_csv
        image_dir: optional override directory for the test images
    output
        (features, img_features, txt_features) of the FIRST configured
        checkpoint — each of shape (len(test_df), linear_out).
    """
    # BUG FIX: the return annotation declared a 4-tuple with a stray ``str``
    # and the docstring described a Dict; the function returns three arrays.
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)
    data = DataFactory.load_data(config)
    data.test = test_df
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)
    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")
    features = []
    img_features = []
    txt_features = []
    for epoch_config in config.inference_config.epoch_configs:
        _config = ConfigFactory.get_config_from_yaml_file(
            epoch_config.dataloader_exp, args.env, False)
        test_dataloader = DataLoaderFactory.get_test_dataloader(
            data, _config, num_workers=num_workers, batch_size=batch_size)
        _features, _img_features, _txt_features = InferenceFactory.epoch(
            args.env, epoch_config, test_dataloader, data)
        features += _features
        img_features += _img_features
        txt_features += _txt_features
        # Free the per-epoch batch lists before the next iteration.
        del _features
        del _img_features
        del _txt_features
        gc.collect()
    # Each entry is a list of per-batch arrays; stack them per checkpoint.
    for i in range(len(features)):
        features[i] = np.concatenate(features[i])
        img_features[i] = np.concatenate(img_features[i])
        txt_features[i] = np.concatenate(txt_features[i])
        print(f"features[{i}].shape:", features[i].shape)
        print(f"img_features[{i}].shape:", img_features[i].shape)
        print(f"txt_features[{i}].shape:", txt_features[i].shape)
    # Sanity check: one feature set per configured checkpoint.
    exps: List[str] = []
    for epoch_config in config.inference_config.epoch_configs:
        for model_checkpoint in epoch_config.model_checkpoints:
            _exp = model_checkpoint.split("_")[0]
            exps.append(_exp)
    TestUtil.assert_any(len(exps), len(features))
    return features[0], img_features[0], txt_features[0]
# %% import sys from pathlib import Path sys.path.append(str(Path(__file__).parents[2])) from kaggle_shopee.factories.config_factory import ConfigFactory, EnvEnum from kaggle_shopee.factories.data_factory import Data, DataFactory from kaggle_shopee.factories.preprocessing import Pp from kaggle_shopee.utils.args_util import ArgsUtil args = ArgsUtil.get_args(EnvEnum.LOCAL, "exp003", []) config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, False) data = DataFactory.load_data(config) data, config = Pp.main(data, config) # %% import re import pandas as pd pd.set_option("display.max_colwidth", None) unit = [ "GR", "GM", "KG", "KILO", "MG", "LITRE", "ML",
# Per-fold inference script: loads the experiment config, preprocesses the
# data, logs to MLflow, then runs inference fold by fold.
import torch
import torch.cuda

from kaggle_shopee.factories.config_factory import Config, ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.dataloader_factory import DataLoaderFactory
from kaggle_shopee.factories.inference_factory import InferenceFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil
from kaggle_shopee.utils.file_util import FileUtil
from kaggle_shopee.utils.global_util import GlobalUtil
from kaggle_shopee.utils.mlflow_util import MlflowUtil

args = ArgsUtil.get_args()
print(args)
config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)
print(config.inference_config)
GlobalUtil.seed_everything(config.seed)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)
MlflowUtil.start_run(config.mlflow_config, config.exp, config.name, True)
MlflowUtil.log_params_config(config)
for fold in range(config.cv_config.n_splits):
    # Only the folds requested on the command line are evaluated.
    if fold not in args.folds:
        continue
    print(f"======================= fold {fold} =======================")
    features = []
    # Validation rows for this fold, by the configured fold column.
    valid_df = data.train[data.train[config.cv_config.fold_col] == fold].copy()
    posting_ids = valid_df["posting_id"].values
    for i in range(len(config.inference_config.epoch_configs)):
def __init__(
    self,
    config: Config,
    data: Data,
    out_features: int,
    train_df: pd.DataFrame = pd.DataFrame(),  # only used by ArcAdaptiveMarginProduct
    bert_path: Optional[str] = None,
):
    """Build a combined image+text net from two pre-trained checkpoints.

    Loads the image and text models named in ``config.model_config``
    (``img_checkpoint`` / ``txt_checkpoint``), each with its own experiment
    config, then stacks BN -> dropout -> linear -> BN on the concatenated
    features, followed by a metric-learning margin head.
    """
    super(ShopeeImgTextNet2, self).__init__()
    self.config = config
    print("img_checkpoint:", config.model_config.img_checkpoint)
    print("txt_checkpoint:", config.model_config.txt_checkpoint)
    # Each checkpoint's experiment id is the prefix before the first "_".
    img_exp = config.model_config.img_checkpoint.split("_")[0]
    img_config = ConfigFactory.get_config_from_yaml_file(img_exp, config.env, False)
    # Weights come from the checkpoints, so disable pretrained download
    # and output normalization on both sub-models.
    img_config.model_config.pretrained = False
    img_config.model_config.normalize = False
    txt_exp = config.model_config.txt_checkpoint.split("_")[0]
    txt_config = ConfigFactory.get_config_from_yaml_file(txt_exp, config.env, False)
    txt_config.model_config.pretrained = False
    txt_config.model_config.normalize = False
    img_lit_model: ShopeeLitModel = ShopeeLitModel.load_from_checkpoint(
        os.path.join(
            str(config.dir_config.checkpoint_dir),
            config.model_config.img_checkpoint,
        ),
        data=data,
        config=img_config,
        fold=-1,
        with_mlflow=False,
    )
    txt_lit_model: ShopeeLitModel = ShopeeLitModel.load_from_checkpoint(
        os.path.join(
            str(config.dir_config.checkpoint_dir),
            config.model_config.txt_checkpoint,
        ),
        data=data,
        config=txt_config,
        fold=-1,
        with_mlflow=False,
        bert_path=bert_path,
    )
    self.img_model = img_lit_model.model
    self.txt_model = txt_lit_model.model
    # Feature width is taken from the second-to-last child module of each
    # sub-model — assumes that child exposes .num_features (TODO confirm
    # this holds for every supported architecture).
    img_out_features = list(self.img_model.children())[-2].num_features
    txt_out_features = list(self.txt_model.children())[-2].num_features
    print("img_out_features:", img_out_features)
    print("txt_out_features:", txt_out_features)
    concat_features = img_out_features + txt_out_features
    self.bn1 = nn.BatchNorm1d(concat_features)
    self.dropout = nn.Dropout(config.model_config.dropout)
    self.fc1 = nn.Linear(concat_features, config.model_config.channel_size)
    self.bn2 = nn.BatchNorm1d(config.model_config.channel_size)
    self._init_params()
    if config.met_config.name == "ArcAdaptiveMarginProduct":
        # Adaptive margin needs the training labels to derive per-class margins.
        self.margin = MetricLearningFactory.get_metric_learning_product(
            config.met_config,
            in_features=config.model_config.channel_size,
            out_features=out_features,
            train_df=train_df,
        )
    else:
        self.margin = MetricLearningFactory.get_metric_learning_product(
            config.met_config,
            in_features=config.model_config.channel_size,
            out_features=out_features,
        )
def __init__(
    self,
    config: Config,
    data: Data,
    out_features: int,
    train_df: pd.DataFrame = pd.DataFrame(),  # only used by ArcAdaptiveMarginProduct
    bert_path: Optional[str] = None,
    is_test: bool = False,
):
    """Build a combined image+text net from freshly-constructed sub-models.

    Unlike ShopeeImgTextNet2, the image and text nets are instantiated
    directly (not loaded from Lightning checkpoints); each uses the config
    of the experiment named by its checkpoint prefix.  MLP-Mixer backbones
    have hard-coded feature widths (768 for mixer_b, 1024 for mixer_l).
    """
    super(ShopeeImgTextNet6, self).__init__()
    self.config = config
    print("img_checkpoint:", config.model_config.img_checkpoint)
    print("txt_checkpoint:", config.model_config.txt_checkpoint)
    img_exp = config.model_config.img_checkpoint.split("_")[0]
    img_config = ConfigFactory.get_config_from_yaml_file(img_exp, config.env, False)
    if is_test or config.model_config.with_pretrain:
        # Weights will come from elsewhere; skip pretrained init / normalize.
        img_config.model_config.pretrained = False
        img_config.model_config.normalize = False
    self.img_model = img_models.ShopeeImgNet2(
        out_features,
        img_config.model_config,
        img_config.met_config,
        img_config.pooling_config,
    )
    self.img_margin = self.img_model.margin
    if "mixer_b" in img_config.model_config.model_arch:
        self.img_bn = nn.BatchNorm1d(768)
    elif "mixer_l" in img_config.model_config.model_arch:
        self.img_bn = nn.BatchNorm1d(1024)
    else:
        self.img_bn = nn.BatchNorm1d(self.img_model.backbone.num_features)
    txt_exp = config.model_config.txt_checkpoint.split("_")[0]
    txt_config = ConfigFactory.get_config_from_yaml_file(txt_exp, config.env, False)
    if is_test or config.model_config.with_pretrain:
        txt_config.model_config.pretrained = False
        txt_config.model_config.normalize = False
    self.txt_model = txt_models.ShopeeTextNet(
        out_features,
        txt_config.model_config,
        txt_config.met_config,
        txt_config.bert_pooling_config,
        bert_path,
    )
    self.txt_margin = self.txt_model.margin
    self.txt_bn = nn.BatchNorm1d(self.txt_model.bert_model.config.hidden_size)
    # Same mixer special-casing as above for the concatenated width.
    if "mixer_b" in img_config.model_config.model_arch:
        img_out_features = 768
    elif "mixer_l" in img_config.model_config.model_arch:
        img_out_features = 1024
    else:
        img_out_features = self.img_model.backbone.num_features
    txt_out_features = self.txt_model.bert_model.config.hidden_size
    print("img_out_features:", img_out_features)
    print("txt_out_features:", txt_out_features)
    concat_features = img_out_features + txt_out_features
    self.bn1 = nn.BatchNorm1d(concat_features)
    self.dropout = nn.Dropout(config.model_config.dropout)
    self.fc1 = nn.Linear(concat_features, config.model_config.channel_size)
    self.bn2 = nn.BatchNorm1d(config.model_config.channel_size)
    self._init_params()
    if config.met_config.name == "ArcAdaptiveMarginProduct":
        # Adaptive margin needs the training labels to derive per-class margins.
        self.margin = MetricLearningFactory.get_metric_learning_product(
            config.met_config,
            in_features=config.model_config.channel_size,
            out_features=out_features,
            train_df=train_df,
        )
    else:
        self.margin = MetricLearningFactory.get_metric_learning_product(
            config.met_config,
            in_features=config.model_config.channel_size,
            out_features=out_features,
        )
    # (Tail of a triplet-mining function whose signature is above this chunk.)
    posting_ids = train_df["posting_id"].values
    batch_idxs = get_batch_idxs(len(posting_ids), 20)
    positives, positive_dict = get_positives(train_df)
    negatives = get_negatives(posting_ids, features, batch_idxs, positive_dict,
                              num_negatives)
    positive_df = pd.DataFrame(positives, columns=["posting_id", "p_posting_id"])
    negative_df = pd.DataFrame(negatives, columns=["posting_id", "n_posting_id"])
    # Map each posting_id to the unique positive / negative posting_ids
    # mined for it.
    positive_dict = (positive_df.groupby("posting_id")
                     ["p_posting_id"].unique().to_dict())
    negative_dict = (negative_df.groupby("posting_id")
                     ["n_posting_id"].unique().to_dict())
    del positive_df
    del negative_df
    return positive_dict, negative_dict


if __name__ == "__main__":
    # Mine triplets for exp383 using features from exp373 (epoch 9, fold 0).
    offline_mining_exp = "exp373"
    epoch = 9
    fold = 0
    num_negatives = 3
    args = ArgsUtil.get_args(EnvEnum.COLAB, "exp383", [0])
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, env=args.env)
    positive_dict, negative_dict = MiningFactory.get_triplets(
        config.dir_config, offline_mining_exp, epoch, fold, num_negatives)
def load_config(env: EnvEnum, model_checkpoint: str) -> Config:
    """Build the config for the experiment *model_checkpoint* belongs to.

    The experiment id is the part of the file name before the first "_".
    Pretrained-weight download is disabled because the weights will be
    loaded from the checkpoint itself.
    """
    exp_name = model_checkpoint.partition("_")[0]
    cfg = ConfigFactory.get_config_from_yaml_file(exp_name, env, False)
    cfg.model_config.pretrained = False
    return cfg
def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    input
        exp: experiment name; the model to use is listed in the
            inference_config section of expxxx.yaml
        test_df: test.csv loaded with pd.read_csv
    output
        all_embeddings, img_embeddings, text_embeddings
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])
    device = "cuda"
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp,
                                                     args.env,
                                                     verbose=False)
    data = DataFactory.load_data(config)
    data.test = test_df.copy()
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)
    if image_dir is not None:
        # Override the default image location with the supplied directory.
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")
    # Only the first checkpoint of the first epoch config is used.
    model_checkpoint = config.inference_config.epoch_configs[
        0].model_checkpoints[0]
    if args.env == EnvEnum.KAGGLE:
        # Kaggle copies of checkpoints have "=" stripped from the name.
        model_checkpoint = model_checkpoint.replace("=", "")
    print("load model:", model_checkpoint)
    model = lit_models.ShopeeLitModel.load_from_checkpoint(
        str(config.dir_config.checkpoint_dir / model_checkpoint),
        data=data,
        config=config,
        fold=-1,
        with_mlflow=False,
        bert_path=str(config.dir_config.dataset_dir /
                      config.model_config.bert_model_arch),
        is_test=True,
    ).model.to(device)
    model.eval()
    test_dataloader = DataLoaderFactory.get_test_dataloader(
        data, config, num_workers=num_workers, batch_size=batch_size)
    img_features = []
    text_features = []
    all_features = []
    # Inference only: collect per-row embeddings as float16 to save memory.
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            img = batch["img"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            all_feature, img_feature, text_feature = model(
                img, input_ids, attention_mask)
            all_features.extend(all_feature.detach().cpu().numpy().astype(
                np.float16))
            img_features.extend(img_feature.detach().cpu().numpy().astype(
                np.float16))
            text_features.extend(text_feature.detach().cpu().numpy().astype(
                np.float16))
    img_features = np.array(img_features, dtype=np.float16)
    text_features = np.array(text_features, dtype=np.float16)
    all_features = np.array(all_features, dtype=np.float16)
    # Release everything big (incl. GPU memory) before returning.
    del data
    del model
    del test_dataloader.dataset
    del test_dataloader
    gc.collect()
    torch.cuda.empty_cache()
    return all_features, img_features, text_features
def main(data: Data, config: Config) -> Tuple[Data, Config]:
    """Run only the reduced-TFIDF feature-engineering step."""
    data, config = FE.title_tfidf_reduced(data, config)
    return data, config


# Notebook cells: build an inline exp001 config, preprocess, engineer
# features, then fit a nearest-neighbour index on the TFIDF components.
args = ArgsUtil.get_args(env=EnvEnum.LOCAL, exp="exp001")
yaml_str = """
exp: exp001
seed: 77
title_tfidf_n_components: 50
title_tfidf_reducer: PCA
"""
config = ConfigFactory.get_config_from_yaml_str(yaml_str)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)

# %%
ImgUtil.show_img(data.train["image_path"].iloc[2])

# %%
data, config = FE.main(data, config)

# %%
nbrs = NearestNeighbors(n_neighbors=5, algorithm="ball_tree")
x_train = data.train[[
    f"title_tfidf_{i}" for i in range(config.title_tfidf_n_components)
]].values
nbrs.fit(x_train)
# Ensemble-evaluation script ("eexp" experiments): reads per-experiment
# prediction CSVs and scores them fold by fold, logging to MLflow.
import numpy as np
import pandas as pd

from kaggle_shopee.factories.config_factory import ConfigFactory
from kaggle_shopee.utils.args_util import ArgsUtil
from kaggle_shopee.utils.global_util import GlobalUtil
from kaggle_shopee.utils.metric_util import MetricUtil
from kaggle_shopee.utils.mlflow_util import MlflowUtil
from kaggle_shopee.utils.test_util import TestUtil

args = ArgsUtil.get_args()
print(args)
# This script only handles ensemble experiments, which are named eexpNNN.
assert args.exp.startswith("eexp"), "{} does not start with eexp".format(
    args.exp)
config = ConfigFactory.get_config_from_yaml_file(args.exp,
                                                 env=args.env,
                                                 verbose=False)
print(config.dir_config)
GlobalUtil.seed_everything(config.seed)
# Member-experiment ids are the checkpoint-name prefixes before the first "_".
exps = [m.split("_")[0] for m in config.inference_config.model_checkpoints]
score_means = []
MlflowUtil.start_run(config.mlflow_config, config.exp, config.name, True)
MlflowUtil.log_params_e_config(config)
for fold in range(5):
    if fold not in args.folds:
        continue
    y_preds: List[List[List[str]]] = []
    dfs: List[pd.DataFrame] = [
        pd.read_csv(config.dir_config.output_dir.parent / exp /
def load_config(model_checkpoint: str) -> Config:
    """Resolve the training config for *model_checkpoint*.

    Uses the script-level ``args`` global for the environment.  Pretrained
    weight download is turned off since the checkpoint supplies the weights.
    """
    checkpoint_exp = model_checkpoint.partition("_")[0]
    cfg = ConfigFactory.get_config_from_yaml_file(checkpoint_exp, args.env, False)
    cfg.model_config.pretrained = False
    return cfg