def title_tfidf_reduced(data: Data, config: Config) -> Tuple[Data, Config]:
    """Add TF-IDF features of ``title`` (optionally reduced) to train/test.

    Fits one vectorizer on the union of train and test titles so both share
    a single vocabulary, then writes columns named ``title_tfidf_{i}``:
    either every raw TF-IDF dimension (``ReducerEnum.NOTHING``) or
    ``config.title_tfidf_n_components`` reduced dimensions.

    Returns the mutated ``(data, config)`` pair; ``config`` is updated with
    the actual component count in the NOTHING branch.
    """
    tfidf = TfidfVectorizer(min_df=10)
    # Fit on train+test combined so both get identical feature columns.
    _df = pd.concat([data.train, data.test], axis=0).reset_index(drop=True)
    tfidf.fit(_df["title"])
    # .toarray() rather than the deprecated .todense(): todense returns the
    # legacy np.matrix type; toarray yields a plain ndarray with the same
    # values, which pandas/PCA handle identically.
    _train_encoded = tfidf.transform(data.train["title"]).toarray()
    _test_encoded = tfidf.transform(data.test["title"]).toarray()
    _encoded = np.concatenate([_train_encoded, _test_encoded], axis=0)
    TestUtil.assert_any(_train_encoded.shape[1], _test_encoded.shape[1])
    TestUtil.assert_any(_train_encoded.shape[1], _encoded.shape[1])

    if config.title_tfidf_reducer == ReducerEnum.NOTHING:
        # No reduction: keep every TF-IDF dimension and record the true
        # width back into config for downstream consumers.
        config.title_tfidf_n_components = _encoded.shape[1]
        for i in tqdm(range(config.title_tfidf_n_components)):
            data.train[f"title_tfidf_{i}"] = _train_encoded[:, i]
            data.test[f"title_tfidf_{i}"] = _test_encoded[:, i]
        return data, config

    # NOTE(review): both arms build an identical PCA — the non-PCA reducer
    # appears unimplemented (TruncatedSVD/UMAP intended?). Flagged but left
    # as-is to preserve behavior; confirm against ReducerEnum's members.
    if config.title_tfidf_reducer == ReducerEnum.PCA:
        reduce_model = PCA(n_components=config.title_tfidf_n_components)
    else:
        reduce_model = PCA(n_components=config.title_tfidf_n_components)
    # Fit the reducer on the combined matrix, then project train/test
    # separately so row counts stay aligned with each frame.
    reduce_model.fit(_encoded)
    _train_reduced = reduce_model.transform(_train_encoded)
    _test_reduced = reduce_model.transform(_test_encoded)
    TestUtil.assert_any(_train_reduced.shape[1], _test_reduced.shape[1])
    TestUtil.assert_any(config.title_tfidf_n_components, _train_reduced.shape[1])
    for i in tqdm(range(config.title_tfidf_n_components)):
        data.train[f"title_tfidf_{i}"] = _train_reduced[:, i]
        data.test[f"title_tfidf_{i}"] = _test_reduced[:, i]
    return data, config
def image_path(data: Data, config: Config) -> Tuple[Data, Config]:
    """Attach full image file paths as an ``image_path`` column.

    Train rows resolve against ``train_images_dir``, test rows against
    ``test_images_dir``; paths are stored as strings.
    """
    dirs = config.dir_config
    for frame, base_dir in (
        (data.train, dirs.train_images_dir),
        (data.test, dirs.test_images_dir),
    ):
        # Bind base_dir as a default to avoid late-binding in the lambda.
        frame["image_path"] = frame["image"].map(
            lambda name, base=base_dir: str(base / name)
        )
    return data, config
def kurupical_fold(data: Data, config: Config) -> Tuple[Data, Config]:
    """Add a ``kurupical_fold`` column to the train frame.

    On Kaggle the existing ``fold`` column is copied directly; elsewhere
    the folds come from ``data.train_fold``, joined on ``posting_id``.
    """
    if config.env == EnvEnum.KAGGLE:
        data.train["kurupical_fold"] = data.train["fold"]
        return data, config

    fold_map = data.train_fold[["posting_id", "fold"]].rename(
        columns={"fold": "kurupical_fold"}
    )
    data.train = data.train.merge(fold_map, on="posting_id")
    return data, config
def title_preprocessed(data: Data, config: Config) -> Tuple[Data, Config]:
    """Decode literal escape sequences in titles into ``title_preprocessed``."""

    def _unescape(text: str, encoding: str = "utf-8") -> str:
        # latin1 maps code points 0-255 to bytes 1:1, which lets
        # 'unicode-escape' resolve literal escape sequences (e.g. octal)
        # before re-encoding and decoding in the real target encoding.
        raw = text.encode("latin1").decode("unicode-escape").encode("latin1")
        return raw.decode(encoding)

    for frame in (data.train, data.test):
        frame["title_preprocessed"] = frame["title"].map(_unescape)
    return data, config
def split_folds(data: Data, config: Config) -> Tuple[Data, Config]:
    """Assign a cross-validation fold id to every train row.

    Uses GroupKFold grouped by ``label_group`` so one label group never
    spans folds; fold ids are written into a new ``fold`` column.
    """
    data.train["fold"] = -1  # sentinel; overwritten by every split below
    splitter = GroupKFold(n_splits=config.cv_config.n_splits)
    splits = splitter.split(data.train, None, data.train["label_group"])
    for fold_id, (_, valid_idx) in enumerate(splits):
        data.train.loc[valid_idx, "fold"] = fold_id
    return data, config
def image_phash_match_posting_ids(
    data: Data, config: Config
) -> Tuple[Data, Config]:
    """For each train row, list every posting_id sharing its image_phash."""
    phash_to_ids = data.train.groupby("image_phash")["posting_id"].unique()
    data.train["image_phash_match_posting_ids"] = data.train["image_phash"].map(
        phash_to_ids
    )
    return data, config
def shuffle(data: Data, config: Config) -> Tuple[Data, Config]:
    """Shuffle the train frame reproducibly and reset its index."""
    # frac=1 samples every row, i.e. a seeded full permutation.
    permuted = data.train.sample(frac=1, random_state=config.seed)
    data.train = permuted.reset_index(drop=True)
    return data, config
def title_num_str(data: Data, config: Config) -> Tuple[Data, Config]:
    """Run title_number_to_str over both the train and test frames."""
    for attr in ("train", "test"):
        setattr(data, attr, title_number_to_str(getattr(data, attr)))
    return data, config
def target(data: Data, config: Config) -> Tuple[Data, Config]:
    """Store, per train row, all posting_ids sharing the row's label_group."""
    group_to_ids = data.train.groupby("label_group")["posting_id"].unique()
    data.train["target"] = data.train["label_group"].map(group_to_ids)
    return data, config
def label_group_le(data: Data, config: Config) -> Tuple[Data, Config]:
    """Add an integer-encoded ``label_group_le`` column to train."""
    encoder = LabelEncoder()
    codes = encoder.fit_transform(data.train["label_group"])
    data.train["label_group_le"] = codes
    return data, config