Пример #1
0
    def prepare_bert(env: EnvEnum, exp: str):
        """Download and cache the BERT model/tokenizer for an experiment.

        When the local cache directory already exists this is a no-op;
        otherwise the pretrained weights and tokenizer are fetched, saved
        under the dataset directory, and reloaded once as a smoke test.
        """
        cfg = ConfigFactory.get_config_from_yaml_file(exp, env, False)
        arch = cfg.model_config.bert_model_arch
        cache_dir = cfg.dir_config.dataset_dir / arch
        if cache_dir.exists():
            return
        print(cache_dir)
        model = AutoModel.from_pretrained(arch)
        tokenizer = AutoTokenizer.from_pretrained(arch)
        model.save_pretrained(cache_dir)
        tokenizer.save_pretrained(cache_dir)

        # Smoke test: the saved artifacts must be loadable from disk.
        AutoModel.from_pretrained(cache_dir)
        AutoTokenizer.from_pretrained(cache_dir)
Пример #2
0
def load_model(checkpoint: str, env: EnvEnum, data: Data, fold: int):
    """Load the inner model from the best CV checkpoint of an experiment.

    The experiment name is the first "_"-separated token of `checkpoint`;
    the concrete checkpoint file for `fold` is resolved via FileUtil.
    """
    exp = checkpoint.split("_")[0]
    best_ckpt = FileUtil.get_best_cv_checkpoint(env, exp, fold)
    print("load model:", best_ckpt)
    cfg = ConfigFactory.get_config_from_yaml_file(exp, env, False)
    cfg.model_config.pretrained = False
    cfg.model_config.normalize = False
    ckpt_path = os.path.join(str(cfg.dir_config.checkpoint_dir), best_ckpt)
    lit_model: ShopeeLitModel = ShopeeLitModel.load_from_checkpoint(
        ckpt_path,
        data=data,
        config=cfg,
        fold=fold,
        with_mlflow=False,
    )
    return lit_model.model
def load_model(model_checkpoint: str):
    """Load the inner model for a checkpoint file (inference helper).

    Relies on the module-level `args`, `data`, `config` and `lit_models`.
    """
    # Kaggle dataset filenames cannot contain "=", so it is stripped there.
    if args.env == EnvEnum.KAGGLE:
        model_checkpoint = model_checkpoint.replace("=", "")
    # The experiment id is the filename's first "_"-separated token.
    _exp = model_checkpoint.split("_")[0]
    _config = ConfigFactory.get_config_from_yaml_file(_exp, args.env, False)
    _config.model_config.pretrained = False
    lit_model: lit_models.ShopeeLitModel = (
        lit_models.ShopeeLitModel.load_from_checkpoint(
            # NOTE(review): paths come from the module-level `config` while
            # the model itself is built from the per-experiment `_config` —
            # presumably intentional (shared dirs, per-exp model settings);
            # verify against the caller.
            str(config.dir_config.checkpoint_dir / model_checkpoint),
            data=data,
            config=_config,
            fold=-1,
            with_mlflow=False,
            bert_path=str(config.dir_config.input_dir /
                          "kaggle-shopee-dataset"),
        ))
    return lit_model.model
Пример #4
0
 def get_best_cv_checkpoint(env: EnvEnum, exp: str,
                            fold: int) -> Optional[str]:
     """Return the checkpoint filename with the highest epoch for a fold.

     Scans the experiment's checkpoint directory for files matching
     "{exp}_{fold}_*" and returns the one whose "epoch" number is largest,
     or None when no matching checkpoint exists.
     """
     config = ConfigFactory.get_config_from_yaml_file(exp, env, False)
     checkpoints = [
         str(path).replace(str(config.dir_config.checkpoint_dir) + "/", "")
         for path in Path(config.dir_config.checkpoint_dir).glob(
             f"{exp}_{fold}_*")
     ]
     if len(checkpoints) == 0:
         print(f"{exp}_{fold}_*")
         return None
     # Kaggle filenames have "=" stripped, so the pattern differs per env.
     pattern = (r"epoch(\d{1,2})"
                if env == EnvEnum.KAGGLE else r"epoch=(\d{1,2})")
     # BUG FIX: compare epochs numerically. Previously np.argmax ran over
     # the raw regex matches (strings), which compare lexicographically —
     # "9" would beat "10".
     max_epoch_idx = np.argmax([
         int(re.findall(pattern, checkpoint)[0]) for checkpoint in checkpoints
     ])
     return checkpoints[max_epoch_idx]
def load_model(model_checkpoint: str):
    """Load a trained model and its model-type tag from a checkpoint file.

    Relies on the module-level `args`, `data` and `config`; the
    per-experiment config is derived from the checkpoint filename prefix.
    """
    ckpt = model_checkpoint
    if args.env == EnvEnum.KAGGLE:
        # Kaggle dataset filenames cannot contain "=".
        ckpt = ckpt.replace("=", "")
    exp_name = ckpt.split("_")[0]
    exp_config = ConfigFactory.get_config_from_yaml_file(exp_name, args.env, False)
    exp_config.model_config.pretrained = False
    ckpt_path = str(config.dir_config.checkpoint_dir / ckpt)
    print("checkpoint:", ckpt_path)
    bert_dir = str(config.dir_config.input_dir / "kaggle-shopee-dataset")
    lit_model: lit_models.ShopeeLitModel = (
        lit_models.ShopeeLitModel.load_from_checkpoint(
            ckpt_path,
            data=data,
            config=exp_config,
            fold=-1,
            with_mlflow=False,
            bert_path=bert_dir,
        ))
    model_type = StringUtil.get_model_type(exp_config.model_config.model_name)
    model = lit_model.model
    # Drop the Lightning wrapper promptly to free its references.
    del lit_model
    gc.collect()
    return model, model_type
def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Compute test-set embeddings for one experiment.

    input
        exp: experiment name; the models to use are listed in the
            inference_config section of expxxx.yaml
        test_df: test.csv read with pd.read_csv
        num_workers, batch_size: optional dataloader overrides
        image_dir: when given, test image paths are rewritten to this dir

    output
        (combined, image, text) embeddings of the first epoch-config,
        each shaped (len(test_df), linear_out)

    Note: the return annotation previously declared four values although
    the function returns three; fixed to match the return statement, and
    the stale "Dict[str, np.ndarray]" docstring was corrected.
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])

    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)
    data = DataFactory.load_data(config)

    data.test = test_df
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)

    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    features = []
    img_features = []
    txt_features = []
    for epoch_config in config.inference_config.epoch_configs:
        _config = ConfigFactory.get_config_from_yaml_file(
            epoch_config.dataloader_exp, args.env, False)
        test_dataloader = DataLoaderFactory.get_test_dataloader(
            data, _config, num_workers=num_workers, batch_size=batch_size)
        _features, _img_features, _txt_features = InferenceFactory.epoch(
            args.env, epoch_config, test_dataloader, data)
        features += _features
        img_features += _img_features
        txt_features += _txt_features
        # Free per-epoch buffers before the next (potentially large) pass.
        del _features
        del _img_features
        del _txt_features
        gc.collect()
    # Each entry is a list of per-batch arrays; stack them per model.
    for i in range(len(features)):
        features[i] = np.concatenate(features[i])
        img_features[i] = np.concatenate(img_features[i])
        txt_features[i] = np.concatenate(txt_features[i])
        print(f"features[{i}].shape:", features[i].shape)
        print(f"img_features[{i}].shape:", img_features[i].shape)
        print(f"txt_features[{i}].shape:", txt_features[i].shape)

    exps: List[str] = []
    for epoch_config in config.inference_config.epoch_configs:
        for model_checkpoint in epoch_config.model_checkpoints:
            _exp = model_checkpoint.split("_")[0]
            exps.append(_exp)

    # Sanity check: one embedding list per checkpoint.
    TestUtil.assert_any(len(exps), len(features))

    return features[0], img_features[0], txt_features[0]
Пример #7
0
# %%
# Notebook-style exploration script ("# %%" marks cells).
import sys
from pathlib import Path

# Make the repository root importable when run as a plain script.
sys.path.append(str(Path(__file__).parents[2]))


from kaggle_shopee.factories.config_factory import ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil

# Load config/data for exp003 locally and run the preprocessing pipeline.
args = ArgsUtil.get_args(EnvEnum.LOCAL, "exp003", [])
config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, False)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)

# %%
import re

import pandas as pd

# Show full column contents when inspecting titles.
pd.set_option("display.max_colwidth", None)
# Measurement-unit tokens (list continues beyond this excerpt).
unit = [
    "GR",
    "GM",
    "KG",
    "KILO",
    "MG",
    "LITRE",
    "ML",
Пример #8
0
import torch
import torch.cuda
from kaggle_shopee.factories.config_factory import Config, ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.dataloader_factory import DataLoaderFactory
from kaggle_shopee.factories.inference_factory import InferenceFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil
from kaggle_shopee.utils.file_util import FileUtil
from kaggle_shopee.utils.global_util import GlobalUtil
from kaggle_shopee.utils.mlflow_util import MlflowUtil

args = ArgsUtil.get_args()
print(args)

# Load config (verbose), seed everything, then load and preprocess data.
config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)
print(config.inference_config)
GlobalUtil.seed_everything(config.seed)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)

# Track this inference run in MLflow.
MlflowUtil.start_run(config.mlflow_config, config.exp, config.name, True)
MlflowUtil.log_params_config(config)
# Per-fold CV inference loop; only folds requested on the command line run.
# (Loop body continues beyond this excerpt.)
for fold in range(config.cv_config.n_splits):
    if fold not in args.folds:
        continue
    print(f"======================= fold {fold} =======================")
    features = []
    valid_df = data.train[data.train[config.cv_config.fold_col] == fold].copy()
    posting_ids = valid_df["posting_id"].values
    for i in range(len(config.inference_config.epoch_configs)):
Пример #9
0
    def __init__(
        self,
        config: Config,
        data: Data,
        out_features: int,
        # NOTE(review): mutable default — shared across calls if any callee
        # ever mutates it; harmless as long as it is only read.
        train_df: pd.DataFrame = pd.DataFrame(),
        bert_path: Optional[str] = None,
    ):
        """Fusion network built from two separately trained checkpoints.

        Loads a pretrained image model and a pretrained text model from
        the checkpoints named in `config.model_config`, concatenates their
        penultimate features, and adds a BN/dropout/linear head plus a
        metric-learning margin layer.
        """
        super(ShopeeImgTextNet2, self).__init__()
        self.config = config
        print("img_checkpoint:", config.model_config.img_checkpoint)
        print("txt_checkpoint:", config.model_config.txt_checkpoint)
        # Each checkpoint filename encodes its experiment as the first
        # "_"-separated token; rebuild that experiment's config with
        # pretrained/normalize disabled before loading weights.
        img_exp = config.model_config.img_checkpoint.split("_")[0]
        img_config = ConfigFactory.get_config_from_yaml_file(img_exp, config.env, False)
        img_config.model_config.pretrained = False
        img_config.model_config.normalize = False
        txt_exp = config.model_config.txt_checkpoint.split("_")[0]
        txt_config = ConfigFactory.get_config_from_yaml_file(txt_exp, config.env, False)
        txt_config.model_config.pretrained = False
        txt_config.model_config.normalize = False
        img_lit_model: ShopeeLitModel = ShopeeLitModel.load_from_checkpoint(
            os.path.join(
                str(config.dir_config.checkpoint_dir),
                config.model_config.img_checkpoint,
            ),
            data=data,
            config=img_config,
            fold=-1,
            with_mlflow=False,
        )
        txt_lit_model: ShopeeLitModel = ShopeeLitModel.load_from_checkpoint(
            os.path.join(
                str(config.dir_config.checkpoint_dir),
                config.model_config.txt_checkpoint,
            ),
            data=data,
            config=txt_config,
            fold=-1,
            with_mlflow=False,
            bert_path=bert_path,
        )
        self.img_model = img_lit_model.model
        self.txt_model = txt_lit_model.model
        # Assumes the second-to-last child module of each sub-model exposes
        # `num_features` (i.e. the layer feeding the margin head) — TODO
        # confirm against the sub-model definitions.
        img_out_features = list(self.img_model.children())[-2].num_features
        txt_out_features = list(self.txt_model.children())[-2].num_features
        print("img_out_features:", img_out_features)
        print("txt_out_features:", txt_out_features)
        concat_features = img_out_features + txt_out_features

        self.bn1 = nn.BatchNorm1d(concat_features)
        self.dropout = nn.Dropout(config.model_config.dropout)

        self.fc1 = nn.Linear(concat_features, config.model_config.channel_size)
        self.bn2 = nn.BatchNorm1d(config.model_config.channel_size)
        self._init_params()
        # ArcAdaptiveMarginProduct additionally needs the training frame
        # (per-class margins); other margin products do not.
        if config.met_config.name == "ArcAdaptiveMarginProduct":
            self.margin = MetricLearningFactory.get_metric_learning_product(
                config.met_config,
                in_features=config.model_config.channel_size,
                out_features=out_features,
                train_df=train_df,
            )
        else:
            self.margin = MetricLearningFactory.get_metric_learning_product(
                config.met_config,
                in_features=config.model_config.channel_size,
                out_features=out_features,
            )
Пример #10
0
    def __init__(
        self,
        config: Config,
        data: Data,
        out_features: int,
        # NOTE(review): mutable default — shared across calls if any callee
        # ever mutates it; harmless as long as it is only read.
        train_df: pd.DataFrame = pd.DataFrame(),
        bert_path: Optional[str] = None,
        is_test: bool = False,
    ):
        """Fusion network that builds its image and text branches fresh.

        Unlike ShopeeImgTextNet2 this constructs ShopeeImgNet2 and
        ShopeeTextNet directly from the per-experiment configs (no
        checkpoint loading here) and keeps each branch's own margin head
        alongside a combined head over the concatenated features.
        """
        super(ShopeeImgTextNet6, self).__init__()
        self.config = config
        print("img_checkpoint:", config.model_config.img_checkpoint)
        print("txt_checkpoint:", config.model_config.txt_checkpoint)

        # Experiment id is the checkpoint filename's first "_" token.
        img_exp = config.model_config.img_checkpoint.split("_")[0]
        img_config = ConfigFactory.get_config_from_yaml_file(img_exp, config.env, False)
        # Skip downloading backbone weights at test time or when trained
        # weights will be loaded separately.
        if is_test or config.model_config.with_pretrain:
            img_config.model_config.pretrained = False
        img_config.model_config.normalize = False
        self.img_model = img_models.ShopeeImgNet2(
            out_features,
            img_config.model_config,
            img_config.met_config,
            img_config.pooling_config,
        )
        self.img_margin = self.img_model.margin
        # MLP-Mixer backbones do not expose num_features here, so their
        # widths are hard-coded (768 for mixer_b, 1024 for mixer_l).
        if "mixer_b" in img_config.model_config.model_arch:
            self.img_bn = nn.BatchNorm1d(768)
        elif "mixer_l" in img_config.model_config.model_arch:
            self.img_bn = nn.BatchNorm1d(1024)
        else:
            self.img_bn = nn.BatchNorm1d(self.img_model.backbone.num_features)

        txt_exp = config.model_config.txt_checkpoint.split("_")[0]
        txt_config = ConfigFactory.get_config_from_yaml_file(txt_exp, config.env, False)
        if is_test or config.model_config.with_pretrain:
            txt_config.model_config.pretrained = False
        txt_config.model_config.normalize = False
        self.txt_model = txt_models.ShopeeTextNet(
            out_features,
            txt_config.model_config,
            txt_config.met_config,
            txt_config.bert_pooling_config,
            bert_path,
        )
        self.txt_margin = self.txt_model.margin
        self.txt_bn = nn.BatchNorm1d(self.txt_model.bert_model.config.hidden_size)

        # Same mixer special-casing as above for the concat width.
        if "mixer_b" in img_config.model_config.model_arch:
            img_out_features = 768
        elif "mixer_l" in img_config.model_config.model_arch:
            img_out_features = 1024
        else:
            img_out_features = self.img_model.backbone.num_features
        txt_out_features = self.txt_model.bert_model.config.hidden_size
        print("img_out_features:", img_out_features)
        print("txt_out_features:", txt_out_features)
        concat_features = img_out_features + txt_out_features

        self.bn1 = nn.BatchNorm1d(concat_features)
        self.dropout = nn.Dropout(config.model_config.dropout)
        self.fc1 = nn.Linear(concat_features, config.model_config.channel_size)
        self.bn2 = nn.BatchNorm1d(config.model_config.channel_size)
        self._init_params()

        # ArcAdaptiveMarginProduct additionally needs the training frame
        # (per-class margins); other margin products do not.
        if config.met_config.name == "ArcAdaptiveMarginProduct":
            self.margin = MetricLearningFactory.get_metric_learning_product(
                config.met_config,
                in_features=config.model_config.channel_size,
                out_features=out_features,
                train_df=train_df,
            )
        else:
            self.margin = MetricLearningFactory.get_metric_learning_product(
                config.met_config,
                in_features=config.model_config.channel_size,
                out_features=out_features,
            )
        # (Fragment — the enclosing definition starts outside this excerpt;
        # `features` and `num_negatives` come from that missing scope.)
        posting_ids = train_df["posting_id"].values
        # Presumably splits the ids into 20 batches for mining — verify
        # against get_batch_idxs.
        batch_idxs = get_batch_idxs(len(posting_ids), 20)
        positives, positive_dict = get_positives(train_df)
        negatives = get_negatives(posting_ids, features, batch_idxs,
                                  positive_dict, num_negatives)
        positive_df = pd.DataFrame(positives,
                                   columns=["posting_id", "p_posting_id"])
        negative_df = pd.DataFrame(negatives,
                                   columns=["posting_id", "n_posting_id"])
        # Map each posting_id to its unique positive / negative partners.
        positive_dict = (positive_df.groupby("posting_id")
                         ["p_posting_id"].unique().to_dict())
        negative_dict = (negative_df.groupby("posting_id")
                         ["n_posting_id"].unique().to_dict())
        del positive_df
        del negative_df
        return positive_dict, negative_dict


if __name__ == "__main__":
    # Offline triplet-mining driver: mine positives/negatives from the
    # embeddings produced by a previous experiment at a given epoch/fold.
    offline_mining_exp = "exp373"
    epoch = 9
    fold = 0
    num_negatives = 3

    args = ArgsUtil.get_args(EnvEnum.COLAB, "exp383", [0])
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, env=args.env)

    positive_dict, negative_dict = MiningFactory.get_triplets(
        config.dir_config, offline_mining_exp, epoch, fold, num_negatives)
Пример #12
0
 def load_config(env: EnvEnum, model_checkpoint: str) -> Config:
     """Build the experiment config for a checkpoint, pretrained disabled.

     The experiment name is everything before the first "_" in the
     checkpoint filename.
     """
     exp_name, _, _ = model_checkpoint.partition("_")
     cfg = ConfigFactory.get_config_from_yaml_file(exp_name, env, False)
     cfg.model_config.pretrained = False
     return cfg
Пример #13
0
def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    input
        exp: experiment name; the models to use are listed in the
            inference_config section of expxxx.yaml
        test_df: test.csv read with pd.read_csv

    output
        all_embeddings, img_embeddings, text_embeddings
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])

    device = "cuda"

    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp,
                                                     args.env,
                                                     verbose=False)
    data = DataFactory.load_data(config)

    # Substitute the provided test frame and rerun the preprocessing steps.
    data.test = test_df.copy()
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)

    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    # Only the first checkpoint of the first epoch-config is used here.
    model_checkpoint = config.inference_config.epoch_configs[
        0].model_checkpoints[0]
    # Kaggle dataset filenames cannot contain "=", so it is stripped there.
    if args.env == EnvEnum.KAGGLE:
        model_checkpoint = model_checkpoint.replace("=", "")

    print("load model:", model_checkpoint)
    model = lit_models.ShopeeLitModel.load_from_checkpoint(
        str(config.dir_config.checkpoint_dir / model_checkpoint),
        data=data,
        config=config,
        fold=-1,
        with_mlflow=False,
        bert_path=str(config.dir_config.dataset_dir /
                      config.model_config.bert_model_arch),
        is_test=True,
    ).model.to(device)
    model.eval()

    test_dataloader = DataLoaderFactory.get_test_dataloader(
        data, config, num_workers=num_workers, batch_size=batch_size)

    img_features = []
    text_features = []
    all_features = []
    # Inference only — no gradients; accumulate per-batch embeddings as
    # float16 to halve the memory footprint.
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            img = batch["img"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            all_feature, img_feature, text_feature = model(
                img, input_ids, attention_mask)

            all_features.extend(all_feature.detach().cpu().numpy().astype(
                np.float16))
            img_features.extend(img_feature.detach().cpu().numpy().astype(
                np.float16))
            text_features.extend(text_feature.detach().cpu().numpy().astype(
                np.float16))

    img_features = np.array(img_features, dtype=np.float16)
    text_features = np.array(text_features, dtype=np.float16)
    all_features = np.array(all_features, dtype=np.float16)

    # Free host and GPU memory before returning (this runs in a
    # memory-constrained Kaggle kernel).
    del data
    del model
    del test_dataloader.dataset
    del test_dataloader
    gc.collect()
    torch.cuda.empty_cache()
    return all_features, img_features, text_features
    def main(data: Data, config: Config) -> Tuple[Data, Config]:
        """Run the feature-engineering pipeline (currently only the
        reduced title tf-idf features)."""
        data, config = FE.title_tfidf_reduced(data, config)
        return data, config


args = ArgsUtil.get_args(env=EnvEnum.LOCAL, exp="exp001")

# Inline config for a quick local experiment with 50-component PCA tf-idf.
yaml_str = """
    exp: exp001
    seed: 77

    title_tfidf_n_components: 50
    title_tfidf_reducer: PCA
"""

config = ConfigFactory.get_config_from_yaml_str(yaml_str)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)

# %%
# Visual spot-check of one training image.
ImgUtil.show_img(data.train["image_path"].iloc[2])

# %%
data, config = FE.main(data, config)

# %%
# Nearest-neighbour search over the reduced title tf-idf features.
nbrs = NearestNeighbors(n_neighbors=5, algorithm="ball_tree")
x_train = data.train[[
    f"title_tfidf_{i}" for i in range(config.title_tfidf_n_components)
]].values
nbrs.fit(x_train)
Пример #15
0
import numpy as np
import pandas as pd
from kaggle_shopee.factories.config_factory import ConfigFactory
from kaggle_shopee.utils.args_util import ArgsUtil
from kaggle_shopee.utils.global_util import GlobalUtil
from kaggle_shopee.utils.metric_util import MetricUtil
from kaggle_shopee.utils.mlflow_util import MlflowUtil
from kaggle_shopee.utils.test_util import TestUtil

args = ArgsUtil.get_args()
print(args)

# Ensemble-evaluation experiments are named "eexp..." to distinguish them
# from single-model "exp..." runs.
assert args.exp.startswith("eexp"), "{} does not start with eexp".format(
    args.exp)
config = ConfigFactory.get_config_from_yaml_file(args.exp,
                                                 env=args.env,
                                                 verbose=False)
print(config.dir_config)
GlobalUtil.seed_everything(config.seed)

# One source experiment per checkpoint in the ensemble.
exps = [m.split("_")[0] for m in config.inference_config.model_checkpoints]
score_means = []
MlflowUtil.start_run(config.mlflow_config, config.exp, config.name, True)
MlflowUtil.log_params_e_config(config)
# Per-fold evaluation loop (statement truncated at this excerpt's edge).
for fold in range(5):
    if fold not in args.folds:
        continue

    y_preds: List[List[List[str]]] = []
    dfs: List[pd.DataFrame] = [
        pd.read_csv(config.dir_config.output_dir.parent / exp /
def load_config(model_checkpoint: str) -> Config:
    """Return the per-experiment config for a checkpoint, pretrained off.

    Uses the module-level `args` for the environment; the experiment name
    is the checkpoint filename's first "_"-separated token.
    """
    experiment = model_checkpoint.split("_", 1)[0]
    cfg = ConfigFactory.get_config_from_yaml_file(experiment, args.env, False)
    cfg.model_config.pretrained = False
    return cfg