Example #1
def test_stratified_splitter(test_specs, python_dataset):
    splits = python_stratified_split(python_dataset,
                                     ratio=test_specs["ratio"],
                                     min_rating=10,
                                     filter_by="user")

    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"])
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"])

    for split in splits:
        assert set(split.columns) == set(python_dataset.columns)

    # Test that train and test contain the same set of users, since the split is stratified by user.
    users_train = splits[0][DEFAULT_USER_COL].unique()
    users_test = splits[1][DEFAULT_USER_COL].unique()

    assert set(users_train) == set(users_test)

    splits = python_stratified_split(python_dataset,
                                     ratio=test_specs["ratios"],
                                     min_rating=10,
                                     filter_by="user")

    assert len(splits) == 3
    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"])
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"])
    assert len(splits[2]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"])

    for split in splits:
        assert set(split.columns) == set(python_dataset.columns)
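The test above relies on two pytest fixtures, test_specs and python_dataset, supplied by the surrounding suite. A minimal sketch of what they might look like; every value here is an assumption, not the suite's actual fixtures:

import numpy as np
import pandas as pd
import pytest

from reco_utils.common.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL


@pytest.fixture(scope="module")
def test_specs():
    # Hypothetical numbers; the real suite defines its own ratios/tolerance
    return {
        "number_of_rows": 1000,
        "ratio": 0.6,
        "ratios": [0.2, 0.3, 0.5],
        "tolerance": 0.01,
    }


@pytest.fixture(scope="module")
def python_dataset(test_specs):
    # Synthetic ratings frame with the standard reco_utils column names
    rng = np.random.default_rng(42)
    rows = test_specs["number_of_rows"]
    return pd.DataFrame({
        DEFAULT_USER_COL: rng.integers(1, 50, rows),
        DEFAULT_ITEM_COL: rng.integers(1, 100, rows),
        "rating": rng.integers(1, 6, rows).astype(np.float32),
        "timestamp": rng.integers(10**8, 10**9, rows),
    })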
Example #2
def test_model_lightgcn(resource_path):
    data_path = os.path.join(resource_path, "..", "resources", "deeprec",
                             "dkn")
    yaml_file = os.path.join(
        resource_path,
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )
    user_file = os.path.join(data_path, r"user_embeddings.csv")
    item_file = os.path.join(data_path, r"item_embeddings.csv")

    df = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(df, ratio=0.75)

    data = ImplicitCF(train=train, test=test)

    hparams = prepare_hparams(yaml_file, epochs=1)
    model = LightGCN(hparams, data)

    assert model.run_eval() is not None
    model.fit()
    assert model.recommend_k_items(test) is not None
    model.infer_embedding(user_file, item_file)
    assert os.path.getsize(user_file) != 0
    assert os.path.getsize(item_file) != 0
Example #3
def test_lightgcn_component_definition(resource_path):
    yaml_file = os.path.join(
        resource_path,
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )

    df = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(df, ratio=0.75)

    data = ImplicitCF(train=train, test=test)

    embed_size = 64
    hparams = prepare_hparams(yaml_file, embed_size=embed_size)
    model = LightGCN(hparams, data)

    assert model.norm_adj is not None
    assert model.ua_embeddings.shape == [data.n_users, embed_size]
    assert model.ia_embeddings.shape == [data.n_items, embed_size]
    assert model.u_g_embeddings is not None
    assert model.pos_i_g_embeddings is not None
    assert model.neg_i_g_embeddings is not None
    assert model.batch_ratings is not None
    assert model.loss is not None
    assert model.opt is not None
Example #5
def SARtrain():
    data = pd.read_csv("SnacksData100.csv")
    data.loc[:, 'Ratings'] = data['Ratings'].astype(np.float32)
    header = {
        "col_user": "******",
        "col_item": "Product_Id",
        "col_rating": "Ratings",
        "col_timestamp": "timestamp",
    }
    train, test = python_stratified_split(data,
                                          ratio=0.75,
                                          col_user=header["col_user"],
                                          col_item=header["col_item"],
                                          seed=42)
    joblib.dump(test, 'testdata')
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')
    model = SARSingleNode(similarity_type="jaccard",
                          time_decay_coefficient=30,
                          time_now=None,
                          timedecay_formula=True,
                          **header)
    model.fit(train)
    joblib.dump(model, 'SARDump')
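Since SARtrain() persists both the fitted model and the held-out split with joblib, a separate scoring step can reload them later. A minimal sketch of that step, assuming the reco_utils SARSingleNode API; the top_k value is an assumption:

import joblib

# Reload the artifacts written by SARtrain()
model = joblib.load('SARDump')
test = joblib.load('testdata')

# Top-10 recommendations per user, excluding items already seen in training
top_k = model.recommend_k_items(test, top_k=10, remove_seen=True)
print(top_k.head())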
Example #6
def train_and_score(n_users, n_recipes_per_cuisine, review_ratio,
                    review_ratio_fav, top_k, weight_ratings):

    data_type = 'synthetic'
    data_path = DATA_DIR + data_type

    features_fn = data_path + '/recipes/cuisine_size_{}.csv'.format(
        n_recipes_per_cuisine)
    review_fn = data_path + '/reviews/{}_users_{}_ratio_{}_ratiofav.csv'.format(
        n_users, review_ratio, review_ratio_fav)

    data = pd.read_csv(review_fn)
    features = pd.read_csv(features_fn)

    # Convert ingredients column to list
    features["clean_ingredients"] = features["clean_ingredients"].apply(
        lambda a: a.split("+"))

    # Convert the float precision to 32-bit in order to reduce memory consumption
    data.loc[:, 'rating'] = data['rating'].astype(np.float32)

    header = {
        "col_user": "******",
        "col_item": "recipe_id",
        "col_rating": "rating",
        "col_timestamp": "date",
        "col_prediction": "Prediction",
    }

    # Split train and test, then add a dummy user who reviews every recipe in train
    train, test = python_stratified_split(data,
                                          ratio=0.80,
                                          col_user=header["col_user"],
                                          col_item=header["col_item"],
                                          seed=42)
    for r in features["recipe_id"]:
        dummy = pd.DataFrame([["dummy", r, 3]],
                             columns=['username', "recipe_id", "rating"])
        # DataFrame.append was removed in pandas 2.0; concat is the supported idiom
        train = pd.concat([train, dummy], ignore_index=True)

    # build model
    model = SARSingleNode(similarity_type="custom",
                          time_decay_coefficient=30,
                          time_now=None,
                          timedecay_formula=False,
                          **header)

    jaccard = lambda a, b: len(set(a).intersection(set(b))) / len(
        set(a).union(set(b)))

    print('\ttraining model')

    model.fit(train, features, "recipe_id", {
        "ratings": weight_ratings,
        "clean_ingredients": (1, jaccard)
    })

    print('\tscoring')

    # get score
    absolute_scores = []
    relative_scores = []
    for i in top_k:
        absolute, relative = accuracy_metric(model, test, i)
        absolute_scores.append(absolute)
        relative_scores.append(relative)

    return absolute_scores, relative_scores
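The custom similarity handed to model.fit above is plain Jaccard over ingredient lists: the size of the intersection divided by the size of the union. A quick worked example:

jaccard = lambda a, b: len(set(a).intersection(set(b))) / len(set(a).union(set(b)))

# 2 shared ingredients out of 4 distinct ones -> 0.5
print(jaccard(["salt", "flour", "egg"], ["salt", "egg", "milk"]))  # 0.5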
Example #7
from reco_utils.evaluation.python_evaluation import (map_at_k, ndcg_at_k,
                                                     precision_at_k, recall_at_k,
                                                     rmse, mae, rsquared,
                                                     exp_var)
from reco_utils.recommender.sar import SAR

# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

ratings = pd.read_csv('./data/ml-100k/ratings.csv')
ratings.columns = ["userID", "itemID", "rating", "timestamp"]
# ratings.head()

train, test = python_stratified_split(ratings,
                                      ratio=0.75,
                                      col_user='userID',
                                      col_item='itemID',
                                      seed=42)

# print("""
# Train:
# Total Ratings: {train_total}
# Unique Users: {train_users}
# Unique Items: {train_items}

# Test:
# Total Ratings: {test_total}
# Unique Users: {test_users}
# Unique Items: {test_items}
# """.format(
#     train_total=len(train),
Example #8
      "Total number of items are\t{}".format(data[COL_ITEM].nunique()),
      sep="\n")

# Change the format of the timestamp column
orig_function = data.apply(lambda x: datetime.strftime(
    datetime(1970, 1, 1, 0, 0, 0) + timedelta(seconds=x[COL_TIMESTAMP].item()),
    "%Y-%m-%d %H:%M:%S"),
                           axis=1)

print(orig_function.head())

# Same conversion, but to ISO-8601 via datetime.utcfromtimestamp
iso_8601_python = data.apply(
    lambda x: datetime.utcfromtimestamp(x[COL_TIMESTAMP].item()).isoformat(),
    axis=1)

print(iso_8601_python.head())

data[COL_TIMESTAMP] = data.apply(
    lambda x: datetime.utcfromtimestamp(x[COL_TIMESTAMP].item()).isoformat(),
    axis=1)

#%%

data.head()

# Stratified split the data
data_train, data_test = python_stratified_split(data,
                                                filter_by="user",
                                                min_rating=10,
                                                ratio=0.7,
                                                col_user=COL_USER,
                                                col_item=COL_ITEM)
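The row-wise apply used earlier in this example can also be done with pandas' vectorized datetime conversion. A sketch of the equivalent one-liner, assuming COL_TIMESTAMP holds epoch seconds:

import pandas as pd

# Epoch seconds -> ISO-8601 strings without a per-row Python lambda
data[COL_TIMESTAMP] = pd.to_datetime(data[COL_TIMESTAMP],
                                     unit="s").dt.strftime("%Y-%m-%dT%H:%M:%S")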
Example #9
    seed = args.seed

    logger.debug(f"Received parameters:")
    logger.debug(f"Ratio:    {ratio}")
    logger.debug(f"User:    {col_user}")
    logger.debug(f"Item:    {col_item}")
    logger.debug(f"Seed:    {seed}")

    logger.debug(f"Input path: {args.input_path}")
    logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
    logger.debug(f"Cols of DataFrame: {input_df.columns}")

    output_train, output_test = python_stratified_split(
        input_df,
        ratio=args.ratio,
        col_user=args.col_user,
        col_item=args.col_item,
        seed=args.seed,
    )

    logger.debug(f"Output path: {args.output_train}")
    logger.debug(f"Output path: {args.output_test}")

    save_data_frame_to_directory(
        args.output_train,
        output_train,
        schema=DataFrameSchema.data_frame_to_dict(output_train),
    )
    save_data_frame_to_directory(
        args.output_test,
        output_test,
        schema=DataFrameSchema.data_frame_to_dict(output_test),
    )
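The args object used throughout this example would come from an argument parser defined earlier in the module. A hypothetical sketch consistent with the fields referenced above; the flag names and defaults are assumptions:

import argparse

# Hypothetical parser; the real module defines its own flag names and defaults
parser = argparse.ArgumentParser()
parser.add_argument("--input-path", type=str)
parser.add_argument("--ratio", type=float, default=0.75)
parser.add_argument("--col-user", type=str, default="UserId")
parser.add_argument("--col-item", type=str, default="MovieId")
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--output-train", type=str)
parser.add_argument("--output-test", type=str)
args = parser.parse_args()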
Example #10
# top k items to recommend
TOP_K = 10

# select movieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# model parameters
N_FACTORS = 40
EPOCHS = 5

ratings_df = pd.read_csv('./data/ml-100k/ratings.csv')

# split the dataset
train_valid_df, test_df = python_stratified_split(ratings_df,
                                                  ratio=0.75,
                                                  min_rating=1,
                                                  filter_by="item",
                                                  col_user=USER,
                                                  col_item=ITEM)

data = CollabDataBunch.from_df(train_valid_df,
                               user_name=USER,
                               item_name=ITEM,
                               rating_name=RATING,
                               valid_pct=0)
# data.show_batch()
"""Now we will create a `collab_learner` for the data, which by default uses 
the `EmbeddingDotBias` model. We will be using 40 latent factors. This will 
create an embedding for the users and the items that will map each of these 
to 40 floats as can be seen below. Note that the embedding parameters are not 
predefined, but are learned by the model.
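The snippet is cut off right after that docstring; a minimal sketch of the step it describes, assuming fastai v1's collab_learner (the y_range and learning rate below are assumptions):

from fastai.collab import collab_learner

# EmbeddingDotBias learner with N_FACTORS latent factors per user and item;
# y_range squashes predictions into the rating scale via a sigmoid
learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0, 5.5], wd=1e-1)
learn.fit_one_cycle(EPOCHS, max_lr=5e-3)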
Example #11
        title_col='Title')

    # Convert the float precision to 32-bit in order to reduce memory consumption
    data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)

    header = {
        "col_user": "******",
        "col_item": "MovieId",
        "col_rating": "Rating",
        "col_timestamp": "Timestamp",
        "col_prediction": "Prediction",
    }

    train, test = python_stratified_split(data,
                                          ratio=0.75,
                                          col_user=header["col_user"],
                                          col_item=header["col_item"],
                                          seed=42)

    # set log level to INFO
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')

    model = SARSingleNode(similarity_type="jaccard",
                          time_decay_coefficient=30,
                          time_now=None,
                          timedecay_formula=True,
                          **header)

    model.fit(train)
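After fit, the model would typically be scored top-k style next. A minimal sketch using reco_utils' evaluation helpers, assuming model, test, and header from the example above are still in scope and k=10:

from reco_utils.evaluation.python_evaluation import precision_at_k

# Top-k recommendations per user, excluding items seen during training
top_k = model.recommend_k_items(test, top_k=10, remove_seen=True)

# Ranking metric against the held-out test interactions
precision = precision_at_k(test, top_k,
                           col_user=header["col_user"],
                           col_item=header["col_item"],
                           col_rating=header["col_rating"],
                           col_prediction=header["col_prediction"],
                           k=10)
print("Precision@10:", precision)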