def intersect_drop_columns(train: csr_matrix, valid: csr_matrix, min_df=0):
    t = train.tocsc()
    v = valid.tocsc()
    # Keep only the columns that have at least min_df non-zero entries
    # in *both* matrices (with the default min_df=0, every column is kept).
    nnz_train = ((t != 0).sum(axis=0) >= min_df).A1
    nnz_valid = ((v != 0).sum(axis=0) >= min_df).A1
    nnz_cols = nnz_train & nnz_valid
    res = t[:, nnz_cols], v[:, nnz_cols]
    return res
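# A minimal usage sketch, not from the original source; it assumes
# intersect_drop_columns above is in scope. Column 1 is empty in both toy
# matrices, so with min_df=1 only columns 0 and 2 survive.
import numpy as np
from scipy.sparse import csr_matrix

train = csr_matrix(np.array([[1, 0, 2],
                             [0, 0, 3]]))
valid = csr_matrix(np.array([[4, 0, 0],
                             [5, 0, 6]]))
t, v = intersect_drop_columns(train, valid, min_df=1)
print(t.shape, v.shape)  # (2, 2) (2, 2)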
def train_thresholding(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str):
    """Trains a linear model for multilabel data using a one-vs-rest strategy
    and cross-validation to pick an optimal decision threshold for Macro-F1.
    Outperforms train_1vsrest in most aspects at the cost of higher
    time complexity.
    See user guide for more details.

    Args:
        y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
        x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
        options (str): The option string passed to liblinear.

    Returns:
        A model which can be used in predict_values.
    """
    # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
    x, options, bias = prepare_options(x, options)

    y = y.tocsc()
    num_class = y.shape[1]
    num_feature = x.shape[1]
    weights = np.zeros((num_feature, num_class), order='F')
    thresholds = np.zeros(num_class)

    for i in range(num_class):
        yi = y[:, i].toarray().reshape(-1)
        w, t = thresholding_one_label(2 * yi - 1, x, options)
        weights[:, i] = w.ravel()
        thresholds[i] = t

    return {
        'weights': np.asmatrix(weights),
        '-B': bias,
        'threshold': thresholds
    }
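# A hedged sketch of how the returned model could be applied. predict_values
# is not shown here, so this is one plausible decision rule, not the library's
# confirmed API: label i is predicted positive when its decision value exceeds
# thresholds[i]. All numbers below are made up for illustration.
import numpy as np
from scipy import sparse

weights = np.array([[0.5, -1.0],
                    [1.0, 0.2]])        # num_feature x num_class (made up)
thresholds = np.array([0.3, -0.1])      # one threshold per label (made up)
x_new = sparse.csr_matrix(np.array([[1.0, 0.0],
                                    [0.0, 1.0]]))

decision_values = np.asarray(x_new.dot(weights))      # num_instances x num_class
predictions = (decision_values > thresholds).astype(int)
print(predictions)  # [[1 0]
                    #  [1 1]]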
def test_tversky_index(X: sps.csr_matrix, alpha: float, beta: float, shrinkage: float) -> None:
    RNS = np.random.RandomState(0)
    rec = TverskyIndexKNNRecommender(
        X, shrinkage=shrinkage, alpha=alpha, beta=beta, n_threads=1, top_k=X.shape[1]
    )
    rec.learn()
    sim = rec.W.toarray()
    tested_index_row = RNS.randint(0, sim.shape[0], size=100)
    tested_index_col = RNS.randint(0, sim.shape[0], size=100)
    X_csc = X.tocsc()
    # sort_indices() sorts in place; the original call to sorted_indices()
    # returned a sorted copy and discarded it, which was a no-op.
    X_csc.sort_indices()
    for i, j in zip(tested_index_row, tested_index_col):
        if i == j:
            continue
        computed = sim[i, j]
        U_i = set(X_csc[:, i].nonzero()[0])
        U_j = set(X_csc[:, j].nonzero()[0])
        intersect = U_i.intersection(U_j)
        Ui_minus_Uj = U_i.difference(U_j)
        Uj_minus_Ui = U_j.difference(U_i)
        target = len(intersect) / (
            len(intersect)
            + alpha * len(Ui_minus_Uj)
            + beta * len(Uj_minus_Ui)
            + shrinkage
            + 1e-6
        )
        assert computed == pytest.approx(target)
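# A standalone numeric check of the Tversky index the test above verifies,
# independent of TverskyIndexKNNRecommender. With alpha = beta = 1 it reduces
# to the Jaccard index, and with alpha = beta = 0.5 to the Dice coefficient.
U_i = {0, 1, 2, 3}
U_j = {2, 3, 4}
alpha, beta, shrinkage = 1.0, 1.0, 0.0
inter = len(U_i & U_j)       # 2
only_i = len(U_i - U_j)      # 2
only_j = len(U_j - U_i)      # 1
tversky = inter / (inter + alpha * only_i + beta * only_j + shrinkage + 1e-6)
print(round(tversky, 4))     # ~0.4, i.e. Jaccard(U_i, U_j) = 2/5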
def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str):
    """Trains a linear model for multilabel data using a one-vs-rest strategy.

    Args:
        y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
        x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
        options (str): The option string passed to liblinear.

    Returns:
        A model which can be used in predict_values.
    """
    # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
    x, options, bias = prepare_options(x, options)

    y = y.tocsc()
    num_class = y.shape[1]
    num_feature = x.shape[1]
    weights = np.zeros((num_feature, num_class), order='F')

    for i in range(num_class):
        yi = y[:, i].toarray().reshape(-1)
        modeli = train(2 * yi - 1, x, options)
        w = np.ctypeslib.as_array(modeli.w, (num_feature, ))
        # Liblinear flips +1/-1 labels so +1 is always the first label,
        # but not if all labels are -1.
        # For our usage, we need +1 to always be the first label,
        # so the check is necessary.
        if modeli.get_labels()[0] == -1:
            weights[:, i] = -w
        else:
            weights[:, i] = w

    return {'weights': np.asmatrix(weights), '-B': bias, 'threshold': 0}
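# A hedged usage sketch with made-up numbers: the 0/1 label matrix is mapped
# to the -1/+1 encoding liblinear expects via 2*yi - 1, and with threshold 0
# a label is predicted positive when its decision value is positive.
import numpy as np

yi = np.array([0, 1, 1, 0])
print(2 * yi - 1)                                 # [-1  1  1 -1]

weights = np.array([[0.8, -0.4],
                    [-0.2, 0.6]])                 # num_feature x num_class (made up)
x_new = np.array([[1.0, 0.0]])
print((x_new @ weights > 0).astype(int))          # one-vs-rest prediction: [[1 0]]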
def format_URM_positive_user_compressed(URM: csr_matrix):
    """
    Format the positive interactions of a URM in the way needed by the FM model.
    Here, however, user information is grouped by item, meaning that:
    - There are #warm_items rows
    - There are #users + #items + 1 columns
    - There are #interactions + (#warm_items * 2) non-zero entries

    Each row represents a warm item, and all users that interacted with that
    item are stored in that row.

    :param URM: URM to be preprocessed
    :return: preprocessed URM in sparse csr format
    """
    warm_items_mask = np.ediff1d(URM.tocsc().indptr) > 0
    warm_items = np.arange(URM.shape[1])[warm_items_mask]
    new_train = URM.copy().tocoo()
    fm_matrix = coo_matrix((warm_items.size, URM.shape[0] + URM.shape[1] + 1), dtype=np.int8)

    # Index offset
    item_offset = URM.shape[0]

    # Set up initial vectors
    row_v = np.zeros(new_train.data.size + (warm_items.size * 2))
    col_v = np.zeros(new_train.data.size + (warm_items.size * 2))
    data_v = np.zeros(new_train.data.size + (warm_items.size * 2))  # Already ok, nothing to be added

    # For each warm item, set up the content of its row
    j = 0  # Index to scan and modify the vectors
    URM_train_csc = URM.copy().tocsc()
    for i, item in enumerate(warm_items):
        # Find all users who liked that item
        users_who_liked_item = URM_train_csc[:, item].indices
        offset = users_who_liked_item.size
        if offset > 0:
            col_v[j:j + offset] = users_who_liked_item
            row_v[j:j + offset] = i
            data_v[j:j + offset] = 1
            col_v[j + offset] = item + item_offset
            row_v[j + offset] = i
            data_v[j + offset] = 1
            col_v[j + offset + 1] = fm_matrix.shape[1] - 1
            row_v[j + offset + 1] = i
            data_v[j + offset + 1] = 1
            j = j + offset + 2
        else:
            raise RuntimeError("Illegal state")

    # Setting new information; row/col must be integer arrays for tocsr(),
    # and data is cast to match the declared int8 dtype.
    fm_matrix.row = row_v.astype(np.int32)
    fm_matrix.col = col_v.astype(np.int32)
    fm_matrix.data = data_v.astype(np.int8)
    return fm_matrix.tocsr()
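# A minimal usage sketch (assumes the function above is in scope): a 3-user,
# 2-item URM where both items are warm. Each output row holds the item's
# users, the item's own one-hot column (offset by #users), and a trailing
# constant column.
import numpy as np
from scipy.sparse import csr_matrix

URM = csr_matrix(np.array([[1, 0],
                           [1, 1],
                           [0, 1]]))
fm = format_URM_positive_user_compressed(URM)
print(fm.shape)       # (2, 3 + 2 + 1) = (2, 6)
print(fm.toarray())   # [[1 1 0 1 0 1]
                      #  [0 1 1 0 1 1]]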
def train_cost_sensitive_micro(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str):
    """Trains a linear model for multilabel data using a one-vs-rest strategy
    and cross-validation to pick an optimal asymmetric misclassification cost
    for Micro-F1.
    Outperforms train_1vsrest in most aspects at the cost of higher
    time complexity.
    See user guide for more details.

    Args:
        y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
        x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
        options (str): The option string passed to liblinear.

    Returns:
        A model which can be used in predict_values.
    """
    # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
    x, options, bias = prepare_options(x, options)

    y = y.tocsc()
    num_class = y.shape[1]
    num_feature = x.shape[1]
    weights = np.zeros((num_feature, num_class), order='F')

    l = y.shape[0]
    perm = np.random.permutation(l)
    param_space = [1, 1.33, 1.8, 2.5, 3.67, 6, 13]
    bestScore = -np.inf
    for a in param_space:
        tp = fn = fp = 0
        for i in range(num_class):
            yi = y[:, i].toarray().reshape(-1)
            yi = 2 * yi - 1
            cv_options = f'{options} -w1 {a}'
            pred = cross_validate(yi, x, cv_options, perm)
            tp = tp + np.sum(np.logical_and(yi == 1, pred == 1))
            fn = fn + np.sum(np.logical_and(yi == 1, pred == -1))
            fp = fp + np.sum(np.logical_and(yi == -1, pred == 1))

        score = 2 * tp / (2 * tp + fn + fp)
        if bestScore < score:
            bestScore = score
            bestA = a

    final_options = f'{options} -w1 {bestA}'
    for i in range(num_class):
        yi = y[:, i].toarray().reshape(-1)
        w = do_train(2 * yi - 1, x, final_options)
        weights[:, i] = w.ravel()

    return {'weights': np.asmatrix(weights), '-B': bias, 'threshold': 0}
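# A standalone sketch of the Micro-F1 score the parameter search above
# maximizes: true positives, false negatives and false positives are pooled
# over all labels before F1 is computed. Toy predictions only.
import numpy as np

y_true = np.array([[ 1, -1], [ 1,  1], [-1,  1]])   # instances x labels, in -1/+1
y_pred = np.array([[ 1, -1], [-1,  1], [-1,  1]])
tp = np.sum((y_true == 1) & (y_pred == 1))
fn = np.sum((y_true == 1) & (y_pred == -1))
fp = np.sum((y_true == -1) & (y_pred == 1))
print(2 * tp / (2 * tp + fn + fp))   # 3 TP, 1 FN, 0 FP -> 6/7 ~ 0.857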
def _preprocess_URM_all(self, URM_all: sps.csr_matrix):
    warm_items_mask = np.ediff1d(URM_all.tocsc().indptr) > self.threshold_items
    self.warm_items = np.arange(URM_all.shape[1])[warm_items_mask]
    URM_all = URM_all[:, self.warm_items]

    warm_users_mask = np.ediff1d(URM_all.tocsr().indptr) > self.threshold_users
    self.warm_users = np.arange(URM_all.shape[0])[warm_users_mask]
    URM_all = URM_all[self.warm_users, :]

    # np.int was removed in NumPy 1.24; the builtin int is used instead.
    self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
        self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"],
        np.arange(0, len(warm_users_mask), dtype=int)[np.logical_not(warm_users_mask)])

    self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
        self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"],
        np.arange(0, len(warm_items_mask), dtype=int)[np.logical_not(warm_items_mask)])

    return URM_all
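# A small sketch of the warm-entity masking used above: np.ediff1d over a
# csc/csr indptr gives the number of stored entries per column/row, so the
# mask keeps entities with more interactions than the threshold.
import numpy as np
from scipy import sparse as sps

URM = sps.csr_matrix(np.array([[1, 0, 1],
                               [1, 0, 0]]))
threshold_items = 0
items_per_col = np.ediff1d(URM.tocsc().indptr)   # [2 0 1] interactions per item
warm_items_mask = items_per_col > threshold_items
print(np.arange(URM.shape[1])[warm_items_mask])  # [0 2]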
def advanced_subclass_handling(data_frame: pd.DataFrame, URM_train: csr_matrix, path="../../data/",
                               add_subclass=False):
    """
    Include subclass information in the training set in the following ways:
    - A column encoding the mean of 'label' for a given (user, subclass) pair,
      i.e. how many items of that subclass the user liked
    - Information about the popularity of the subclass (how many items belong
      to that subclass)
    - Ratings of that subclass

    :param URM_train: mean response will be retrieved from here
    :param data_frame: dataframe being pre-processed for boosting
    :param path: path to the folder containing the subclass dataframe
    :return: dataframe with augmented information
    """
    print("Adding subclass and feature engineering subclass...")
    data_frame = data_frame.copy()

    df_subclass: pd.DataFrame = pd.read_csv(path + "data_ICM_sub_class.csv")
    df_subclass = df_subclass[['row', 'col']]
    df_subclass = df_subclass.rename(columns={"col": "subclass"})

    # Merging subclass information
    data_frame = pd.merge(data_frame, df_subclass, right_on="row", left_on="item_id")
    data_frame = data_frame.drop(columns=["row"], inplace=False)

    print("\t- Add items present for each subclass")
    # Add subclass item-popularity: how many items of that subclass are present
    subclass_item_count = df_subclass.groupby("subclass").count()
    data_frame = pd.merge(data_frame, subclass_item_count, right_index=True, left_on="subclass")
    data_frame = data_frame.rename(columns={"row": "item_per_subclass"})

    print("\t- Add ratings popularity for each subclass")
    # Add subclass ratings-popularity: how many interactions we have for each subclass
    URM_train_csc = URM_train.tocsc()
    n_ratings_sub = []

    sorted_sub_indices = np.argsort(df_subclass['subclass'].values)
    sorted_sub = df_subclass['subclass'][sorted_sub_indices].values
    sorted_item_subclass = df_subclass['row'][sorted_sub_indices].values

    unique_sorted_sub, sub_indptr = np.unique(sorted_sub, return_index=True)
    sub_indptr = np.concatenate([sub_indptr, [sorted_sub.size]])

    for i, sub in tqdm(enumerate(unique_sorted_sub), total=unique_sorted_sub.size,
                       desc="\t\tProcessing"):
        item_sub = sorted_item_subclass[sub_indptr[i]:sub_indptr[i + 1]]
        n_ratings_sub.append(URM_train_csc[:, item_sub].data.size)

    ratings_sub = np.array([unique_sorted_sub, n_ratings_sub])
    ratings_per_sub_df = pd.DataFrame(data=np.transpose(ratings_sub),
                                      columns=["subclass", "global_ratings_per_subclass"])

    data_frame = pd.merge(data_frame, ratings_per_sub_df, left_on="subclass", right_on="subclass")

    # Add subclass ratings-popularity for each user using rating percentage
    print("\t- Add ratings popularity for pairs (user, subclass)")
    users = data_frame['user_id'].values
    sub = data_frame['subclass'].values

    perc_array = np.zeros(users.size)
    rat_array = np.zeros(users.size)

    for i, user in tqdm(enumerate(users), total=users.size, desc="\t\tProcessing"):
        curr_sub = sub[i]
        curr_sub_index = np.searchsorted(unique_sorted_sub, curr_sub)

        # Find items of this subclass
        item_sub = sorted_item_subclass[sub_indptr[curr_sub_index]:sub_indptr[curr_sub_index + 1]]
        user_item = URM_train.indices[URM_train.indptr[user]:URM_train.indptr[user + 1]]

        total_user_likes = user_item.size
        mask = np.in1d(item_sub, user_item)
        likes_per_sub = item_sub[mask].size
        user_p = likes_per_sub / total_user_likes
        perc_array[i] = user_p
        rat_array[i] = likes_per_sub

    data_frame["subclass_user_like_perc"] = perc_array
    data_frame["subclass_user_like_quantity"] = rat_array

    if not add_subclass:
        data_frame = data_frame.drop(columns=["subclass"], inplace=False)

    return data_frame
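# A standalone sketch of the grouping trick used above: sorting items by
# subclass and taking np.unique(..., return_index=True) yields an indptr-like
# array, so the items of subclass i live in
# sorted_items[sub_indptr[i]:sub_indptr[i + 1]].
import numpy as np

subclasses = np.array([2, 0, 2, 1, 0])
items = np.array([10, 11, 12, 13, 14])
order = np.argsort(subclasses)
sorted_sub = subclasses[order]
sorted_items = items[order]
unique_sub, sub_indptr = np.unique(sorted_sub, return_index=True)
sub_indptr = np.concatenate([sub_indptr, [sorted_sub.size]])
for i, s in enumerate(unique_sub):
    print(s, sorted_items[sub_indptr[i]:sub_indptr[i + 1]])
# 0 [11 14]
# 1 [13]
# 2 [10 12]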
def create_swivel_inputs(
    output_dir: Path,
    log: logging.Logger,
    coocs_matrix: csr_matrix,
    shard_size: int,
    row_vocab: List,
    col_vocab: Optional[List] = None,
):
    """Create and save Swivel inputs from a given co-occurrence matrix.

    If no column vocabulary is given, the matrix must be square (and should be
    symmetrical)."""
    if coocs_matrix.shape[0] != len(row_vocab):
        log.error("Row vocabulary and matrix shape do not match, aborting")
        raise RuntimeError
    if col_vocab and coocs_matrix.shape[1] != len(col_vocab):
        log.error("Column vocabulary and matrix shape do not match, aborting")
        raise RuntimeError
    elif not col_vocab and coocs_matrix.shape[0] != coocs_matrix.shape[1]:
        log.error(
            "Co-occurrence matrix is not square but no column vocabulary was provided, aborting"
        )
        raise RuntimeError
    log.info("Creating and saving the rows vocabulary and sums ... ")
    row_reorder = create_vocabulary_sums_inputs(output_dir, "row", log, coocs_matrix.indptr,
                                                shard_size, row_vocab)
    row_nshards = len(row_reorder) // shard_size
    if col_vocab:
        log.info("Creating and saving the columns vocabulary and sums ... ")
        col_reorder = create_vocabulary_sums_inputs(output_dir, "col", log,
                                                    coocs_matrix.tocsc().indptr, shard_size,
                                                    col_vocab)
        col_nshards = len(col_reorder) // shard_size
    else:
        # Symmetric case: reuse the row data and mirror the row files as
        # column files.
        col_reorder = row_reorder
        col_nshards = row_nshards
        for filename in [VOCABULARY_FILENAME, SUMS_FILENAME]:
            row_filepath = (output_dir / (filename % "row")).as_posix()
            col_filepath = (output_dir / (filename % "col")).as_posix()
            log.info("Copying %s to %s ...", row_filepath, col_filepath)
            shutil.copyfile(row_filepath, col_filepath)
    n_shards = row_nshards * col_nshards
    log.info("Creating and saving the %d shards ...", n_shards)
    with tqdm(total=n_shards) as progress:
        for row in range(row_nshards):
            indices_row = row_reorder[row::row_nshards]
            for col in range(col_nshards):
                indices_col = col_reorder[col::col_nshards]
                shard = coocs_matrix[indices_row][:, indices_col].tocoo()
                tf_shard = tf.train.Example(features=tf.train.Features(
                    feature={
                        "global_row": format_int_list(indices_row),
                        "global_col": format_int_list(indices_col),
                        "sparse_local_row": format_int_list(shard.row),
                        "sparse_local_col": format_int_list(shard.col),
                        "sparse_value": format_float_list(shard.data),
                    }))
                # The shard file is written, so it must be opened in "wb" mode
                # (the original "rb" would fail on write).
                with (output_dir / (SHARDS_FILENAME % (row, col))).open(mode="wb") as fout:
                    fout.write(tf_shard.SerializeToString())
                progress.update(1)
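# A small sketch of the shard striding used above: reorder[k::nshards] deals
# the reordered indices round-robin into nshards groups, so every shard gets
# an evenly spread slice of rows/columns.
import numpy as np

reorder = np.arange(8)       # stand-in for row_reorder
nshards = 2                  # len(reorder) // shard_size with shard_size = 4
for k in range(nshards):
    print(k, reorder[k::nshards])
# 0 [0 2 4 6]
# 1 [1 3 5 7]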