def intersect_drop_columns(train: csr_matrix, valid: csr_matrix, min_df=0):
    """Keep only the columns whose document frequency reaches min_df in both matrices."""
    t = train.tocsc()
    v = valid.tocsc()
    # Per-column document frequency (number of nonzero entries) in each split.
    nnz_train = ((t != 0).sum(axis=0) >= min_df).A1
    nnz_valid = ((v != 0).sum(axis=0) >= min_df).A1
    # A column survives only if it meets the threshold in both train and valid.
    nnz_cols = nnz_train & nnz_valid
    return t[:, nnz_cols], v[:, nnz_cols]
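# A minimal usage sketch (toy matrices and the min_df value are illustrative, not from the
# original source): only columns meeting the document-frequency threshold in both splits survive.
import numpy as np
from scipy.sparse import csr_matrix

train = csr_matrix(np.array([[1, 0, 2], [0, 0, 3]]))
valid = csr_matrix(np.array([[4, 0, 0], [5, 0, 6]]))
train_f, valid_f = intersect_drop_columns(train, valid, min_df=1)
print(train_f.shape, valid_f.shape)  # (2, 2) (2, 2): the empty middle column is dropped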
Example #2
def train_thresholding(y: sparse.csr_matrix, x: sparse.csr_matrix,
                       options: str):
    """Trains a linear model for multilabel data using a one-vs-rest strategy
    and cross-validation to pick an optimal decision threshold for Macro-F1.
    Outperforms train_1vsrest in most aspects at the cost of higher
    time complexity.
    See user guide for more details.

    Args:
        y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
        x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
        options (str): The option string passed to liblinear.

    Returns:
        A model which can be used in predict_values.
    """
    # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
    x, options, bias = prepare_options(x, options)

    y = y.tocsc()
    num_class = y.shape[1]
    num_feature = x.shape[1]
    weights = np.zeros((num_feature, num_class), order='F')
    thresholds = np.zeros(num_class)
    for i in range(num_class):
        yi = y[:, i].toarray().reshape(-1)
        w, t = thresholding_one_label(2 * yi - 1, x, options)
        weights[:, i] = w.ravel()
        thresholds[i] = t

    return {
        'weights': np.asmatrix(weights),
        '-B': bias,
        'threshold': thresholds
    }
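# A hedged sketch of how the returned model could be applied: decision values are compared
# against the per-label thresholds learned above. The library's own predict_values may handle
# the bias term and data layout differently; treat this as illustrative only.
import numpy as np
from scipy import sparse

def apply_thresholds(model, x: sparse.csr_matrix) -> np.ndarray:
    decision = np.asarray(x.dot(model['weights']))   # instances x classes decision values
    return (decision > model['threshold']).astype(int)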
Example #3
def test_tversky_index(X: sps.csr_matrix, alpha: float, beta: float,
                       shrinkage: float) -> None:
    RNS = np.random.RandomState(0)
    rec = TverskyIndexKNNRecommender(X,
                                     shrinkage=shrinkage,
                                     alpha=alpha,
                                     beta=beta,
                                     n_threads=1,
                                     top_k=X.shape[1])
    rec.learn()
    sim = rec.W.toarray()
    tested_index_row = RNS.randint(0, sim.shape[0], size=100)
    tested_index_col = RNS.randint(0, sim.shape[0], size=100)
    X_csc = X.tocsc()
    X_csc.sort_indices()  # sort in place; sorted_indices() returns a copy that would be discarded
    for i, j in zip(tested_index_row, tested_index_col):
        if i == j:
            continue
        computed = sim[i, j]
        U_i = set(X_csc[:, i].nonzero()[0])
        U_j = set(X_csc[:, j].nonzero()[0])
        intersect = U_i.intersection(U_j)
        Ui_minus_Uj = U_i.difference(U_j)
        Uj_minus_Ui = U_j.difference(U_i)
        target = len(intersect) / (len(intersect) + alpha * len(Ui_minus_Uj) +
                                   beta * len(Uj_minus_Ui) + shrinkage + 1e-6)
        assert computed == pytest.approx(target)
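# For reference, the target value above is the Tversky index of the two user sets;
# a plain-Python version (the helper name is ours, not part of the library):
def tversky_index(users_i: set, users_j: set,
                  alpha: float, beta: float, shrinkage: float) -> float:
    common = len(users_i & users_j)
    only_i = len(users_i - users_j)
    only_j = len(users_j - users_i)
    return common / (common + alpha * only_i + beta * only_j + shrinkage + 1e-6)

# With alpha = beta = 1 and no shrinkage this reduces to the Jaccard similarity:
print(tversky_index({1, 2, 3}, {2, 3, 4}, alpha=1.0, beta=1.0, shrinkage=0.0))  # ~0.5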
Example #4
def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str):
    """Trains a linear model for multiabel data using a one-vs-rest strategy.

    Args:
        y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
        x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
        options (str): The option string passed to liblinear.

    Returns:
        A model which can be used in predict_values.
    """
    # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
    x, options, bias = prepare_options(x, options)

    y = y.tocsc()
    num_class = y.shape[1]
    num_feature = x.shape[1]
    weights = np.zeros((num_feature, num_class), order='F')
    for i in range(num_class):
        yi = y[:, i].toarray().reshape(-1)
        modeli = train(2 * yi - 1, x, options)
        w = np.ctypeslib.as_array(modeli.w, (num_feature, ))
        # Liblinear flips +1/-1 labels so +1 is always the first label,
        # but not if all labels are -1.
        # For our usage, we need +1 to always be the first label,
        # so the check is necessary.
        if modeli.get_labels()[0] == -1:
            weights[:, i] = -w
        else:
            weights[:, i] = w

    return {'weights': np.asmatrix(weights), '-B': bias, 'threshold': 0}
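# A hedged usage sketch with a toy multilabel problem; in the surrounding library the data
# and the liblinear option string normally come from its dataset loading and configuration.
import numpy as np
from scipy import sparse

y = sparse.csr_matrix(np.array([[1, 0, 0],
                                [1, 1, 0],
                                [0, 1, 1],
                                [0, 0, 1]]))          # 4 instances x 3 labels
x = sparse.csr_matrix(np.random.rand(4, 5))           # 4 instances x 5 features

model = train_1vsrest(y, x, '-s 2 -B 1')              # illustrative liblinear options
print(model['weights'].shape)                         # one weight column per label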
Example #5
def format_URM_positive_user_compressed(URM: csr_matrix):
    """
    Format positive interactions of an URM in the way that is needed for the FM model.
    Here, however, users information are grouped w.r.t. items, meaning that, we will have:
    - We have #warm_items @row
    - We have #users+items+1 @cols
    - We have #(interactions)+(warm_items*2) @data

    Each row is representing a warm item and all users that interacted with that item are stored in that row.

    :param URM: URM to be preprocessed
    :return: preprocessed URM in sparse matrix csr format
    """

    warm_items_mask = np.ediff1d(URM.tocsc().indptr) > 0
    warm_items = np.arange(URM.shape[1])[warm_items_mask]

    new_train = URM.copy().tocoo()
    fm_matrix = coo_matrix((warm_items.size, URM.shape[0] + URM.shape[1] + 1),
                           dtype=np.int8)

    # Index offset
    item_offset = URM.shape[0]

    # Set up the COO component vectors; row/col must hold integer indices
    n_entries = new_train.data.size + (warm_items.size * 2)
    row_v = np.zeros(n_entries, dtype=np.int32)
    col_v = np.zeros(n_entries, dtype=np.int32)
    data_v = np.zeros(n_entries)

    # For all the items, set up its content
    j = 0  # Index to scan and modify the vectors
    URM_train_csc = URM.copy().tocsc()
    for i, item in enumerate(warm_items):
        # Find all users who liked that item
        users_who_liked_item = URM_train_csc[:, item].indices
        offset = users_who_liked_item.size
        if offset > 0:
            col_v[j:j + offset] = users_who_liked_item
            row_v[j:j + offset] = i
            data_v[j:j + offset] = 1

            col_v[j + offset] = item + item_offset
            row_v[j + offset] = i
            data_v[j + offset] = 1

            col_v[j + offset + 1] = fm_matrix.shape[1] - 1
            row_v[j + offset + 1] = i
            data_v[j + offset + 1] = 1

            j = j + offset + 2
        else:
            raise RuntimeError("Illegal state")

    # Setting new information
    fm_matrix.row = row_v
    fm_matrix.col = col_v
    fm_matrix.data = data_v

    return fm_matrix.tocsr()
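# A small usage sketch (toy URM, values illustrative): one output row per warm item,
# one column per user, one per item, plus a trailing constant column.
import numpy as np
from scipy.sparse import csr_matrix

URM = csr_matrix(np.array([[1, 0, 0, 1],
                           [0, 1, 0, 0],
                           [1, 1, 0, 0]]))     # 3 users x 4 items, item 2 is cold

fm_URM = format_URM_positive_user_compressed(URM)
print(fm_URM.shape)                            # (3, 3 + 4 + 1) = (3, 8)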
Example #6
def train_cost_sensitive_micro(y: sparse.csr_matrix, x: sparse.csr_matrix,
                               options: str):
    """Trains a linear model for multilabel data using a one-vs-rest strategy
    and cross-validation to pick an optimal asymmetric misclassification cost
    for Micro-F1.
    Outperforms train_1vsrest in most aspects at the cost of higher
    time complexity.
    See user guide for more details.

    Args:
        y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
        x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
        options (str): The option string passed to liblinear.

    Returns:
        A model which can be used in predict_values.
    """
    # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
    x, options, bias = prepare_options(x, options)

    y = y.tocsc()
    num_class = y.shape[1]
    num_feature = x.shape[1]
    weights = np.zeros((num_feature, num_class), order='F')

    l = y.shape[0]
    perm = np.random.permutation(l)
    param_space = [1, 1.33, 1.8, 2.5, 3.67, 6, 13]
    bestScore = -np.inf
    for a in param_space:
        tp = fn = fp = 0
        for i in range(num_class):
            yi = y[:, i].toarray().reshape(-1)
            yi = 2 * yi - 1

            cv_options = f'{options} -w1 {a}'
            pred = cross_validate(yi, x, cv_options, perm)
            tp = tp + np.sum(np.logical_and(yi == 1, pred == 1))
            fn = fn + np.sum(np.logical_and(yi == 1, pred == -1))
            fp = fp + np.sum(np.logical_and(yi == -1, pred == 1))

        score = 2 * tp / (2 * tp + fn + fp)
        if bestScore < score:
            bestScore = score
            bestA = a

    final_options = f'{options} -w1 {bestA}'
    for i in range(num_class):
        yi = y[:, i].toarray().reshape(-1)
        w = do_train(2 * yi - 1, x, final_options)
        weights[:, i] = w.ravel()

    return {'weights': np.asmatrix(weights), '-B': bias, 'threshold': 0}
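# The cross-validation score above is the pooled (Micro-averaged) F1; a standalone version
# of the same computation on +1/-1 label vectors:
import numpy as np

def micro_f1(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == -1))
    fp = np.sum((y_true == -1) & (y_pred == 1))
    return 2 * tp / (2 * tp + fn + fp)

print(micro_f1(np.array([1, 1, -1, -1]), np.array([1, -1, 1, -1])))  # 0.5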
Example #7
    def _preprocess_URM_all(self, URM_all: sps.csr_matrix):
        warm_items_mask = np.ediff1d(URM_all.tocsc().indptr) > self.threshold_items
        self.warm_items = np.arange(URM_all.shape[1])[warm_items_mask]

        URM_all = URM_all[:, self.warm_items]

        warm_users_mask = np.ediff1d(URM_all.tocsr().indptr) > self.threshold_users
        self.warm_users = np.arange(URM_all.shape[0])[warm_users_mask]

        URM_all = URM_all[self.warm_users, :]

        self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
            self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"],
            np.arange(0, len(warm_users_mask), dtype=int)[np.logical_not(warm_users_mask)])

        self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
            self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"],
            np.arange(0, len(warm_items_mask), dtype=int)[np.logical_not(warm_items_mask)])

        return URM_all
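# A standalone illustration of the same warm-item / warm-user filtering on a random URM
# (thresholds of 0 are illustrative; the method above uses self.threshold_items / _users):
import numpy as np
from scipy import sparse as sps

URM = sps.random(50, 40, density=0.05, format='csr', random_state=0)

warm_items = np.arange(URM.shape[1])[np.ediff1d(URM.tocsc().indptr) > 0]
URM = URM[:, warm_items]

warm_users = np.arange(URM.shape[0])[np.ediff1d(URM.tocsr().indptr) > 0]
URM = URM[warm_users, :]

print(URM.shape)   # only users and items with at least one interaction remain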
Example #8
def advanced_subclass_handling(data_frame: pd.DataFrame,
                               URM_train: csr_matrix,
                               path="../../data/",
                               add_subclass=False):
    """
    Here we want to include in the training set sub class information in the following way:
    - A column encoding the mean of 'label' for a certain couple (user, subclass): i.e. how many
    items of that subclass the user liked
    - Including information about the popularity of the subclass (how many items for that subclass
    - Including ratings of that subclass

    :param URM_train: mean response will be retrieved from here
    :param data_frame: dataframe being pre-processed for boosting
    :param path: path to the folder containing subclass dataframe
    :return: dataframe with augmented information
    """
    print("Adding subclass and feature engineering subclass...")
    data_frame = data_frame.copy()

    df_subclass: pd.DataFrame = pd.read_csv(path + "data_ICM_sub_class.csv")
    df_subclass = df_subclass[['row', 'col']]
    df_subclass = df_subclass.rename(columns={"col": "subclass"})

    # Merging sub class information
    data_frame = pd.merge(data_frame,
                          df_subclass,
                          right_on="row",
                          left_on="item_id")
    data_frame = data_frame.drop(columns=["row"], inplace=False)

    print("\t- Add items present for each subclass")
    # Add subclass item-popularity: how many items are present of that subclass
    subclass_item_count = df_subclass.groupby("subclass").count()
    data_frame = pd.merge(data_frame,
                          subclass_item_count,
                          right_index=True,
                          left_on="subclass")
    data_frame = data_frame.rename(columns={"row": "item_per_subclass"})

    print("\t- Add ratings popularity for each subclass")
    # Add subclass ratings-popularity: how many interactions we have for each subclass
    URM_train_csc = URM_train.tocsc()
    n_ratings_sub = []

    sorted_sub_indices = np.argsort(df_subclass['subclass'].values)
    sorted_sub = df_subclass['subclass'][sorted_sub_indices].values
    sorted_item_subclass = df_subclass['row'][sorted_sub_indices].values

    unique_sorted_sub, sub_indptr = np.unique(sorted_sub, return_index=True)
    sub_indptr = np.concatenate([sub_indptr, [sorted_sub.size]])
    for i, sub in tqdm(enumerate(unique_sorted_sub),
                       total=unique_sorted_sub.size,
                       desc="\t\tProcessing"):
        item_sub = sorted_item_subclass[sub_indptr[i]:sub_indptr[i + 1]]
        n_ratings_sub.append(URM_train_csc[:, item_sub].data.size)

    ratings_sub = np.array([unique_sorted_sub, n_ratings_sub])
    ratings_per_sub_df = pd.DataFrame(
        data=np.transpose(ratings_sub),
        columns=["subclass", "global_ratings_per_subclass"])

    data_frame = pd.merge(data_frame,
                          ratings_per_sub_df,
                          left_on="subclass",
                          right_on="subclass")

    # Add subclass ratings-popularity for each user using rating percentage
    print("\t- Add ratings popularity for pairs (user, subclass)")
    users = data_frame['user_id'].values
    sub = data_frame['subclass'].values

    perc_array = np.zeros(users.size)
    rat_array = np.zeros(users.size)
    for i, user in tqdm(enumerate(users),
                        total=users.size,
                        desc="\t\tProcessing"):
        curr_sub = sub[i]
        curr_sub_index = np.searchsorted(unique_sorted_sub, curr_sub)

        # Find items of this subclass
        item_sub = sorted_item_subclass[
            sub_indptr[curr_sub_index]:sub_indptr[curr_sub_index + 1]]
        user_item = URM_train.indices[
            URM_train.indptr[user]:URM_train.indptr[user + 1]]

        total_user_likes = user_item.size
        mask = np.in1d(item_sub, user_item)
        likes_per_sub = item_sub[mask].size
        user_p = likes_per_sub / total_user_likes
        perc_array[i] = user_p
        rat_array[i] = likes_per_sub

    data_frame["subclass_user_like_perc"] = perc_array
    data_frame["subclass_user_like_quantity"] = rat_array

    if not add_subclass:
        data_frame = data_frame.drop(columns=["subclass"], inplace=False)

    return data_frame
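# A small isolated illustration of the subclass item-popularity step (toy frames, not the
# real data_ICM_sub_class.csv):
import pandas as pd

df_subclass = pd.DataFrame({'row': [0, 1, 2, 3], 'col': [10, 10, 20, 20]})
df_subclass = df_subclass.rename(columns={'col': 'subclass'})
data_frame = pd.DataFrame({'user_id': [0, 0, 1], 'item_id': [0, 2, 3]})

# Attach each item's subclass, then the number of items per subclass.
data_frame = pd.merge(data_frame, df_subclass,
                      right_on='row', left_on='item_id').drop(columns=['row'])
subclass_item_count = df_subclass.groupby('subclass').count()
data_frame = pd.merge(data_frame, subclass_item_count,
                      right_index=True, left_on='subclass')
print(data_frame.rename(columns={'row': 'item_per_subclass'}))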
Example #9
def create_swivel_inputs(
    output_dir: Path,
    log: logging.Logger,
    coocs_matrix: csr_matrix,
    shard_size: int,
    row_vocab: List,
    col_vocab: Optional[List] = None,
):
    """Create and save Swivel inputs from a given co-occurence matrix. If column vocabulary is not
    given, the matrix must be square (and should be symmetrical)."""

    if coocs_matrix.shape[0] != len(row_vocab):
        log.error("Row vocabulary and matrix shape do not match, aborting")
        raise RuntimeError

    if col_vocab and coocs_matrix.shape[1] != len(col_vocab):
        log.error("Column vocabulary and matrix shape do not match, aborting")
        raise RuntimeError
    elif not col_vocab and coocs_matrix.shape[0] != coocs_matrix.shape[1]:
        log.error(
            "Co-occurence matrix is not square but no column vocabulary was provided, aborting"
        )
        raise RuntimeError
    log.info("Creating and saving the rows vocabulary and sums ... ")
    row_reorder = create_vocabulary_sums_inputs(output_dir, "row", log,
                                                coocs_matrix.indptr,
                                                shard_size, row_vocab)
    row_nshards = len(row_reorder) // shard_size
    if col_vocab:
        log.info("Creating and saving the columns vocabulary and sums ... ")
        col_reorder = create_vocabulary_sums_inputs(
            output_dir, "col", log,
            coocs_matrix.tocsc().indptr, shard_size, col_vocab)
        col_nshards = len(col_reorder) // shard_size
    else:
        col_reorder = row_reorder
        col_nshards = row_nshards
        for filename in [VOCABULARY_FILENAME, SUMS_FILENAME]:
            row_filepath = (output_dir / (filename % "row")).as_posix()
            col_filepath = (output_dir / (filename % "col")).as_posix()
            log.info("Copying %s to %s ...", row_filepath, col_filepath)
            shutil.copyfile(row_filepath, col_filepath)
    n_shards = row_nshards * col_nshards
    log.info("Creating and saving the %d shards ...", n_shards)
    with tqdm(total=n_shards) as progress:
        for row in range(row_nshards):
            indices_row = row_reorder[row::row_nshards]
            for col in range(col_nshards):
                indices_col = col_reorder[col::col_nshards]
                shard = coocs_matrix[indices_row][:, indices_col].tocoo()
                tf_shard = tf.train.Example(features=tf.train.Features(
                    feature={
                        "global_row": format_int_list(indices_row),
                        "global_col": format_int_list(indices_col),
                        "sparse_local_row": format_int_list(shard.row),
                        "sparse_local_col": format_int_list(shard.col),
                        "sparse_value": format_float_list(shard.data),
                    }))
                with (output_dir / (SHARDS_FILENAME %
                                    (row, col))).open(mode="wb") as fout:
                    fout.write(tf_shard.SerializeToString())

                progress.update(1)
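# For reference, the strided indexing above assigns every row_nshards-th vocabulary entry
# to the same shard; a tiny standalone illustration:
import numpy as np

row_reorder = np.arange(8)          # stand-in for the frequency-reordered vocabulary indices
row_nshards = 2
for row in range(row_nshards):
    print(row, row_reorder[row::row_nshards])
# 0 [0 2 4 6]
# 1 [1 3 5 7]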