Example #1
    def fit(self, df):
        """Train the logistic regression model."""

        df_impressions = fb.build_features(df)

        # Target column, item that was clicked
        f.print_time("target column")
        df_impressions.loc[:, "is_clicked"] = (
            df_impressions["referenced_item"] ==
            df_impressions["impressed_item"]).astype(int)

        features = [
            "position",
            "prices",
            "interaction_count",
            "is_last_interacted",
        ]

        X = df_impressions[features]
        y = df_impressions.is_clicked

        # Training the actual model
        f.print_time("training logistic regression model")
        self.logreg = LogisticRegression(solver="lbfgs",
                                         max_iter=100,
                                         tol=1e-11,
                                         C=1e10).fit(X, y)
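A minimal self-contained sketch (toy data and names, not the project's) of what this training step does: the binary target marks the impressed item that matches the clicked item, and the classifier is fit on the four features listed above. Note that the very large C (inverse regularization strength) effectively turns off regularization.

import pandas as pd
from sklearn.linear_model import LogisticRegression

# Toy impression-level frame: one clickout with three impressed items,
# of which item "102" was actually clicked.
df_toy = pd.DataFrame({
    "referenced_item": ["102", "102", "102"],
    "impressed_item": ["101", "102", "103"],
    "position": [1, 2, 3],
    "prices": [80, 120, 95],
    "interaction_count": [0, 2, 1],
    "is_last_interacted": [0, 1, 0],
})
df_toy["is_clicked"] = (
    df_toy["referenced_item"] == df_toy["impressed_item"]).astype(int)

features = ["position", "prices", "interaction_count", "is_last_interacted"]
# With three rows the fit is only illustrative of the data layout.
logreg = LogisticRegression(solver="lbfgs", max_iter=100, tol=1e-11,
                            C=1e10).fit(df_toy[features], df_toy["is_clicked"])
print(logreg.predict_proba(df_toy[features])[:, 1])  # per-item click probability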
Example #2
    def fit(self, df):
        """Calculate item similarity based on session co-occurrence."""

        # Select columns that are of interest for this method
        f.print_time("start")
        cols = [
            'user_id', 'session_id', 'timestamp', 'step', 'action_type',
            'reference'
        ]
        df_cols = df.loc[:, cols]

        # We are only interested in action types for which the reference is an item ID
        f.print_time("filter interactions")
        item_interactions = [
            'clickout item', 'interaction item deals',
            'interaction item image', 'interaction item info',
            'interaction item rating', 'search for item'
        ]
        df_actions = (
            df_cols.loc[df_cols.action_type.isin(item_interactions), :]
            .rename(columns={'reference': 'item'})
            .drop(columns='action_type')
        )

        df_item_sim = nn.calc_item_sims(df_actions, "item", "session_id")

        self.df_item_sim = df_item_sim
Example #3
    def fit(self, df):
        """Calculate item similarity based on item metadata."""

        # Explode property arrays
        f.print_time("explode properties")
        df_properties = dff.explode(df, "properties")

        df_item_sim = nn.calc_item_sims(df_properties, "item_id", "properties")

        self.df_item_sim = df_item_sim
Example #4
    def predict(self, df):
        """Sort the impression list by number of distinct users in the training phase."""

        # Select columns that are of interest for this method
        f.print_time("start")
        cols = ['user_id', 'session_id', 'timestamp', 'step',
                'action_type', 'reference', "impressions"]
        df_cols = df.loc[:, cols] 

        # Target row, withheld item ID that needs to be predicted
        f.print_time("target rows")
        df_target = dff.get_target_rows(df_cols)

        # Explode to impression level
        f.print_time("explode impression array")
        df_impressions = (
            dff.explode(df_target, "impressions")
            .rename(columns={"impressions": "impressed_item"})
        )
        df_impressions = (
            df_impressions
            .merge(
                self.df_pop,
                left_on="impressed_item",
                right_on="item",
                how="left"
            )
        )

        # Summarize recommendations
        f.print_time("summarize recommendations")
        df_rec = dff.summarize_recs(df_impressions, "n_users")

        return df_rec
Example #5
    def predict(self, df):
        """Return items in impressions list in original order."""

        # Target row, withheld item ID that needs to be predicted
        f.print_time("target rows")
        df_target = dff.get_target_rows(df.copy())

        # Summarize recommendations
        f.print_time("summarize recommendations")
        df_target["item_recommendations"] = (df_target.apply(
            lambda x: x.impressions.replace("|", " "), axis=1))

        cols_rec = [
            "user_id", "session_id", "timestamp", "step",
            "item_recommendations"
        ]
        df_rec = df_target.loc[:, cols_rec]

        return df_rec
Example #6
    def predict(self, df):
        """Calculate item ranking based on trained lightGBM model."""

        df_impressions = fb.build_features(df)

        # Target row, withheld item ID that needs to be predicted
        df_impressions = df_impressions[df_impressions.referenced_item.isna()]

        features = [
            "position", "prices", "interaction_count", "is_last_interacted"
        ]

        df_impressions.loc[:, "click_propensity"] = self.gbm.predict(
            df_impressions[features])

        # Summarize recommendations
        f.print_time("summarize recommendations")
        df_rec = dff.summarize_recs(df_impressions, "click_propensity")

        return df_rec
Example #7
    def fit(self, df):
        """Count the number of clicks for each item."""

        # Select columns that are of interest for this method
        f.print_time("start")
        cols = ['user_id', 'session_id', 'timestamp', 'step',
                'action_type', 'reference']
        df_cols = df.loc[:, cols] 

        # We only need to count clickouts per item
        f.print_time("clicks per item")
        df_item_clicks = (
            df_cols
            .loc[df_cols["action_type"] == "clickout item", :]
            .groupby("reference")
            .size()
            .reset_index(name="n_clicks")
            .rename(columns={"reference": "item"})
        )

        self.df_pop = df_item_clicks
Example #8
    def fit(self, df):
        """Train the lightGBM model."""

        df_impressions = fb.build_features(df)

        # Target column, item that was clicked
        f.print_time("target column")
        df_impressions.loc[:, "is_clicked"] = (
            df_impressions["referenced_item"] ==
            df_impressions["impressed_item"]).astype(int)

        features = [
            "position",
            "prices",
            "interaction_count",
            "is_last_interacted",
        ]

        # Bring to format suitable for lightGBM
        f.print_time("lightGBM format")
        X = df_impressions[features]
        y = df_impressions.is_clicked

        q = (
            df_impressions
            .groupby(["user_id", "session_id", "timestamp", "step"])
            .size()
            .reset_index(name="query_length")
            .query_length
        )

        # Training the actual model
        f.print_time("training lightGBM model")
        self.gbm = lgb.LGBMRanker()
        self.gbm.fit(X, y, group=q, verbose=True)
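The group argument tells LGBMRanker how many consecutive rows of X belong to the same query; here a query is one clickout, and q holds the number of impressed items per clickout, in the same order as the rows of X. A minimal sketch with made-up data (not the project's) of how the pieces line up:

import lightgbm as lgb
import pandas as pd

# Two toy queries (clickouts): the first with three impressed items, the
# second with two. Rows must be contiguous per query, and `group` lists the
# query sizes in the same order; they must sum to len(X).
X = pd.DataFrame({
    "position": [1, 2, 3, 1, 2],
    "prices": [80, 120, 95, 60, 70],
    "interaction_count": [2, 0, 1, 0, 3],
    "is_last_interacted": [1, 0, 0, 0, 1],
})
y = [1, 0, 0, 0, 1]  # 1 = the item that was clicked within its query
q = [3, 2]

ranker = lgb.LGBMRanker()
ranker.fit(X, y, group=q)
# With such tiny data the model is trivial; the point is only the layout.
print(ranker.predict(X))  # higher score = ranked higher within its query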
Example #9
    def predict(self, df):
        """Calculate click probability based on trained logistic regression model."""

        df_impressions = fb.build_features(df)

        # Target row, withheld item ID that needs to be predicted
        df_impressions = df_impressions[df_impressions.referenced_item.isna()]

        features = [
            "position", "prices", "interaction_count", "is_last_interacted"
        ]

        # Predict clickout probabilities for each impressed item
        f.print_time("predict clickout item")
        df_impressions.loc[:, "click_probability"] = (
            self.logreg.predict_proba(df_impressions[features])[:, 1])

        # Summarize recommendations
        f.print_time("summarize recommendations")
        df_rec = dff.summarize_recs(df_impressions, "click_probability")

        return df_rec
Example #10
    def predict(self, df):
        """Randomly sort the impressions list."""

        # Target row, withheld item ID that needs to be predicted
        f.print_time("target rows")
        df_target = dff.get_target_rows(df.copy())

        # Summarize recommendations
        f.print_time("summarize recommendations")
        random.seed(10121)
        df_target.loc[:, "item_recs_list"] = (
            df_target.loc[:, "impressions"].str.split("|").map(
                lambda x: sorted(x, key=lambda k: random.random())))

        df_target.loc[:, "item_recommendations"] = (
            df_target["item_recs_list"].map(lambda arr: ' '.join(arr)))

        cols_rec = [
            "user_id", "session_id", "timestamp", "step",
            "item_recommendations"
        ]
        df_rec = df_target.loc[:, cols_rec]

        return df_rec
Example #11
def calc_item_sims(df, item_col, reference_col):
    """Calculate similarity of items based on nearest neighbor algorithm.

    The final data frame will have similarity scores for pairs of items.

    :param df: Data frame of training data
    :param item_col: Name of data frame column that contains the item ID
    :param reference_col: Name of the reference column; depending on the model, either
        1. session_id for similarity based on session co-occurrence, or
        2. properties for similarity based on item metadata
    :return: Data frame with item pairs and similarity scores
    """

    # Create data frame with item and reference indices
    f.print_time("item and reference indices")
    unique_items = df[item_col].unique()
    unique_refs = df[reference_col].unique()

    d_items = {item_col: unique_items, 'item_idx': range(0, len(unique_items))}
    d_refs = {
        reference_col: unique_refs,
        'ref_idx': range(0, len(unique_refs))
    }

    df_items = pd.DataFrame(data=d_items)
    df_refs = pd.DataFrame(data=d_refs)

    df = (df.merge(df_items, how="inner", on=item_col).merge(df_refs,
                                                             how="inner",
                                                             on=reference_col))

    df_idx = (df.loc[:, ["item_idx", "ref_idx"]].assign(
        data=lambda x: 1.).drop_duplicates())

    # Build item co-occurrence matrix
    f.print_time("item co-occurrence matrix")
    # Rows are references, columns are items, so that the transpose product
    # below yields an item-by-item co-occurrence matrix
    mat_coo = sparse.coo_matrix(
        (df_idx.data, (df_idx.ref_idx, df_idx.item_idx)))
    mat_item_coo = mat_coo.T.dot(mat_coo)

    # Calculate Cosine similarities
    f.print_time("Cosine similarity")
    inv_occ = np.sqrt(1 / mat_item_coo.diagonal())
    cosine_sim = mat_item_coo.multiply(inv_occ)
    cosine_sim = cosine_sim.T.multiply(inv_occ)

    # Create item similarity data frame
    f.print_time("item similarity data frame")
    idx_ref, idx_item, sim = sparse.find(cosine_sim)
    d_item_sim = {'idx_ref': idx_ref, 'idx_item': idx_item, 'similarity': sim}
    df_item_sim = pd.DataFrame(data=d_item_sim)

    df_item_sim = (
        df_item_sim
        .merge(df_items.assign(item_ref=df_items[item_col]),
               how="inner", left_on="idx_ref", right_on="item_idx")
        .merge(df_items.assign(item_sim=df_items[item_col]),
               how="inner", left_on="idx_item", right_on="item_idx")
        .loc[:, ["item_ref", "item_sim", "similarity"]]
    )

    return df_item_sim
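To make the linear-algebra step concrete, here is a minimal sketch with a hypothetical toy occurrence matrix (not the project's data), mirroring the computation above: a binary reference-by-item matrix B, the item co-occurrence matrix B.T @ B, and cosine similarities obtained by scaling rows and columns with 1/sqrt of the diagonal.

import numpy as np
from scipy import sparse

# Toy binary occurrence matrix: rows = references (e.g. sessions),
# columns = items; B[r, i] = 1 if item i occurs in reference r.
B = sparse.coo_matrix(np.array([
    [1, 1, 0],
    [1, 0, 1],
    [0, 1, 1],
    [1, 1, 1],
]))

# Item co-occurrence: entry (i, j) = number of references containing both items.
co = B.T.dot(B)

# Cosine similarity: divide entry (i, j) by sqrt(occ_i * occ_j), where occ_i
# is the diagonal, i.e. the number of references containing item i.
inv_occ = np.sqrt(1 / co.diagonal())
cos = co.multiply(inv_occ).T.multiply(inv_occ)

print(cos.toarray().round(3))  # symmetric, with ones on the diagonal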
Example #12
def predict_nn(df, df_item_sim):
    """Calculate predictions based on the item similarity scores."""

    # Select columns that are of interest for this function
    f.print_time("start")
    cols = [
        'user_id', 'session_id', 'timestamp', 'step', 'action_type',
        'reference', 'impressions'
    ]
    df_cols = df.loc[:, cols]

    # Get previous reference per user
    f.print_time("previous reference")
    df_cols["previous_reference"] = (df_cols.sort_values(
        by=["user_id", "session_id", "timestamp"],
        ascending=[True, True,
                   True]).groupby(["user_id"])["reference"].shift(1))

    # Target row, withheld item ID that needs to be predicted
    f.print_time("target rows")
    df_target = dff.get_target_rows(df_cols)

    # Explode to impression level
    f.print_time("explode impression array")
    df_impressions = dff.explode(df_target, "impressions")

    df_item_sim["item_ref"] = df_item_sim["item_ref"].astype(str)
    df_item_sim["item_sim"] = df_item_sim["item_sim"].astype(str)

    # Get similarities
    f.print_time("get similarities")
    df_impressions = (
        df_impressions
        .merge(df_item_sim,
               how="left",
               left_on=["previous_reference", "impressions"],
               right_on=["item_ref", "item_sim"])
        .fillna(value={'similarity': 0})
        .sort_values(by=["user_id", "timestamp", "step", "similarity"],
                     ascending=[True, True, True, False])
    )

    # Summarize recommendations
    f.print_time("summarize recommendations")
    df_rec = dff.group_concat(df_impressions,
                              ["user_id", "session_id", "timestamp", "step"],
                              "impressions")

    df_rec = (
        df_rec
        .rename(columns={'impressions': 'item_recommendations'})
        .loc[:, ["user_id", "session_id", "timestamp", "step",
                 "item_recommendations"]]
    )

    return df_rec
Example #13
def build_features(df):
    """Build features for the lightGBM and logistic regression model."""

    # Select columns that are of interest for this method
    f.print_time("start")
    cols = [
        'user_id', 'session_id', 'timestamp', 'step', 'action_type',
        'reference', 'impressions', 'prices'
    ]
    df_cols = df.loc[:, cols]

    # We are only interested in action types for which the reference is an item ID
    f.print_time("filter interactions")
    item_interactions = [
        'clickout item', 'interaction item deals', 'interaction item image',
        'interaction item info', 'interaction item rating', 'search for item'
    ]
    df_actions = (
        df_cols.loc[df_cols.action_type.isin(item_interactions), :]
        .copy()
        .rename(columns={'reference': 'referenced_item'})
    )

    f.print_time("cleaning")
    # Remove instances that have no reference
    idx_rm = (df_actions.action_type !=
              "clickout item") & (df_actions.referenced_item.isna())
    df_actions = df_actions[~idx_rm]

    # Get item ID of previous interaction of a user in a session
    f.print_time("previous interactions")
    df_actions.loc[:, "previous_item"] = (df_actions.sort_values(
        by=["user_id", "session_id", "timestamp", "step"],
        ascending=[True, True, True,
                   True]).groupby(["user_id"])["referenced_item"].shift(1))

    # Combine the impressions and item columns; both contain item IDs, so we
    # can expand the impression lists in the next step to get the total
    # interaction count for an item
    f.print_time("combining columns - impressions")
    df_actions.loc[:, "interacted_item"] = np.where(
        df_actions.impressions.isna(),
        df_actions.referenced_item,
        df_actions.impressions,
    )
    df_actions = df_actions.drop(columns="impressions")

    # Price array expansion will get easier without NAs
    f.print_time("combining columns - prices")
    df_actions.loc[:, "prices"] = np.where(df_actions.prices.isna(), "",
                                           df_actions.prices)

    # Convert pipe separated lists into columns
    f.print_time("explode arrays")
    df_items = dff.explode_mult(df_actions,
                                ["interacted_item", "prices"]).copy()

    # Feature: Number of previous interactions with an item
    f.print_time("interaction count")
    df_items.loc[:, "interaction_count"] = (df_items.groupby(
        ["user_id", "interacted_item"]).cumcount())

    # Reduce to impression level again
    f.print_time("reduce to impressions")
    df_impressions = (
        df_items[df_items.action_type == "clickout item"]
        .copy()
        .drop(columns="action_type")
        .rename(columns={"interacted_item": "impressed_item"})
    )

    # Feature: Position of the item in the original impression list.
    # Items keep their original order after the explode, so the cumulative
    # count within each clickout gives the position.
    f.print_time("position feature")
    df_impressions.loc[:, "position"] = (df_impressions.groupby(
        ["user_id", "session_id", "timestamp", "step"]).cumcount() + 1)

    # Feature: Is the impressed item the last interacted item
    f.print_time("last interacted item feature")
    df_impressions.loc[:, "is_last_interacted"] = (
        df_impressions["previous_item"] == df_impressions["impressed_item"]
    ).astype(int)

    f.print_time("change price datatype")
    df_impressions.loc[:, "prices"] = df_impressions.prices.astype(int)

    return_cols = [
        "user_id",
        "session_id",
        "timestamp",
        "step",
        "position",
        "prices",
        "interaction_count",
        "is_last_interacted",
        "referenced_item",
        "impressed_item",
    ]

    df_return = df_impressions[return_cols]

    return df_return
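dff.explode_mult itself is not shown here; as an illustration of what such a helper presumably does, here is a standalone sketch (the function name and toy data are assumptions, not the project's code) that splits several pipe-separated columns in parallel and expands them to one row per element.

import pandas as pd

def explode_pipe_columns(df, cols):
    """Split pipe-separated strings in `cols` and expand to one row per element.

    Assumes that, within a row, every column in `cols` holds the same number
    of "|"-separated values (as the impressions and prices columns do).
    """
    out = df.copy()
    for col in cols:
        out[col] = out[col].str.split("|")
    # pandas >= 1.3 can explode several columns in lockstep
    return out.explode(cols, ignore_index=True)

df_toy = pd.DataFrame({
    "session_id": ["s1"],
    "impressions": ["101|102|103"],
    "prices": ["80|120|95"],
})
print(explode_pipe_columns(df_toy, ["impressions", "prices"]))
#   session_id impressions prices
# 0         s1         101     80
# 1         s1         102    120
# 2         s1         103     95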