def test_stratified_splitter(test_specs, python_dataset):
    """Exercise ``python_stratified_split`` with one ratio and with a ratio list.

    Args:
        test_specs: Mapping read for the keys ``"ratio"``, ``"ratios"``,
            ``"number_of_rows"`` and ``"tolerance"``.
        python_dataset: Dataset handed to the splitter; the test compares
            its ``columns`` against the columns of every returned part.
    """
    row_count = test_specs["number_of_rows"]
    rel_tol = test_specs["tolerance"]

    # --- Single ratio: the first two parts hold ratio / (1 - ratio) rows. ---
    ratio = test_specs["ratio"]
    splits = python_stratified_split(
        python_dataset, ratio=ratio, min_rating=10, filter_by="user"
    )
    for position, share in enumerate((ratio, 1 - ratio)):
        assert len(splits[position]) / row_count == pytest.approx(share, rel_tol)

    for part in splits:
        assert set(part.columns) == set(python_dataset.columns)

    # Every user is expected to show up in both parts of the split.
    train_users = set(splits[0][DEFAULT_USER_COL].unique())
    test_users = set(splits[1][DEFAULT_USER_COL].unique())
    assert train_users == test_users

    # --- Ratio list: three parts, one per configured ratio. ---
    splits = python_stratified_split(
        python_dataset, ratio=test_specs["ratios"], min_rating=10, filter_by="user"
    )
    assert len(splits) == 3
    for position in range(3):
        share = test_specs["ratios"][position]
        assert len(splits[position]) / row_count == pytest.approx(share, rel_tol)

    for part in splits:
        assert set(part.columns) == set(python_dataset.columns)
def test_model_lightgcn(resource_path):
    """Smoke-test a one-epoch LightGCN run on MovieLens 100k.

    The test evaluates, fits, recommends and finally exports the user and
    item embeddings, checking that both exported files are non-empty.

    Args:
        resource_path: Base directory from which the config file and the
            embedding output directory are resolved.
    """
    config_parts = (
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )
    yaml_file = os.path.join(resource_path, *config_parts)

    # Embedding files are written next to the DKN test resources.
    data_path = os.path.join(resource_path, "..", "resources", "deeprec", "dkn")
    user_file = os.path.join(data_path, r"user_embeddings.csv")
    item_file = os.path.join(data_path, r"item_embeddings.csv")

    ratings = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(ratings, ratio=0.75)
    data = ImplicitCF(train=train, test=test)

    hparams = prepare_hparams(yaml_file, epochs=1)
    model = LightGCN(hparams, data)

    eval_result = model.run_eval()
    assert eval_result is not None

    model.fit()
    recommendations = model.recommend_k_items(test)
    assert recommendations is not None

    model.infer_embedding(user_file, item_file)
    for embedding_file in (user_file, item_file):
        assert os.path.getsize(embedding_file) != 0
def test_lightgcn_component_definition(resource_path):
    """Check that a freshly built LightGCN model exposes its components.

    Args:
        resource_path: Base directory from which the ``lightgcn.yaml``
            config file is resolved.
    """
    config_parts = (
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )
    yaml_file = os.path.join(resource_path, *config_parts)

    ratings = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(ratings, ratio=0.75)
    data = ImplicitCF(train=train, test=test)

    embed_size = 64
    hparams = prepare_hparams(yaml_file, embed_size=embed_size)
    model = LightGCN(hparams, data)

    assert model.norm_adj is not None

    # Embedding tables are expected to be (entity count, embed_size).
    expected_user_shape = [data.n_users, embed_size]
    expected_item_shape = [data.n_items, embed_size]
    assert model.ua_embeddings.shape == expected_user_shape
    assert model.ia_embeddings.shape == expected_item_shape

    # The remaining components only need to exist.
    for component in (
        "u_g_embeddings",
        "pos_i_g_embeddings",
        "neg_i_g_embeddings",
        "batch_ratings",
        "loss",
        "opt",
    ):
        assert getattr(model, component) is not None
def test_stratified_splitter(test_specs, python_dataset):
    """Verify part sizes, columns and user coverage of the stratified split.

    Args:
        test_specs: Mapping providing ``"ratio"``, ``"ratios"``,
            ``"number_of_rows"`` and ``"tolerance"``.
        python_dataset: Dataset passed to ``python_stratified_split``; its
            ``columns`` are the reference for every returned part.
    """

    def row_share(part):
        # Fraction of the configured row count that landed in this part.
        return len(part) / test_specs["number_of_rows"]

    def close_to(expected):
        # Relative-tolerance comparison target taken from the test specs.
        return pytest.approx(expected, test_specs["tolerance"])

    # Two-way split driven by a single ratio.
    two_way = python_stratified_split(
        python_dataset, ratio=test_specs["ratio"], min_rating=10, filter_by="user"
    )
    assert row_share(two_way[0]) == close_to(test_specs["ratio"])
    assert row_share(two_way[1]) == close_to(1 - test_specs["ratio"])

    for piece in two_way:
        assert set(piece.columns) == set(python_dataset.columns)

    # Both parts must cover exactly the same set of users.
    users_first = two_way[0][DEFAULT_USER_COL].unique()
    users_second = two_way[1][DEFAULT_USER_COL].unique()
    assert set(users_first) == set(users_second)

    # Three-way split driven by a list of ratios.
    three_way = python_stratified_split(
        python_dataset, ratio=test_specs["ratios"], min_rating=10, filter_by="user"
    )
    assert len(three_way) == 3
    assert row_share(three_way[0]) == close_to(test_specs["ratios"][0])
    assert row_share(three_way[1]) == close_to(test_specs["ratios"][1])
    assert row_share(three_way[2]) == close_to(test_specs["ratios"][2])

    for piece in three_way:
        assert set(piece.columns) == set(python_dataset.columns)
def SARtrain():
    """Train a SAR model on ``SnacksData100.csv`` and persist the results.

    The ratings are split 75/25 with ``python_stratified_split`` (seed 42).
    The test part is dumped to ``testdata`` and the fitted model to
    ``SARDump``, both via ``joblib``. Returns nothing.
    """
    data = pd.read_csv("SnacksData100.csv")

    # Assign the converted column directly. ``data.loc[:, 'Ratings'] = ...``
    # writes into the existing column in place on recent pandas releases and
    # keeps the original dtype, so the float32 down-cast would be lost.
    data['Ratings'] = data['Ratings'].astype(np.float32)

    header = {
        "col_user": "******",
        "col_item": "Product_Id",
        "col_rating": "Ratings",
        "col_timestamp": "timestamp",
    }

    train, test = python_stratified_split(
        data,
        ratio=0.75,
        col_user=header["col_user"],
        col_item=header["col_item"],
        seed=42,
    )
    # Keep the hold-out part on disk.
    joblib.dump(test, 'testdata')

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)-8s %(message)s',
    )

    model = SARSingleNode(
        similarity_type="jaccard",
        time_decay_coefficient=30,
        time_now=None,
        timedecay_formula=True,
        **header
    )
    model.fit(train)
    joblib.dump(model, 'SARDump')
def train_and_score(n_users, n_recipes_per_cuisine, review_ratio,
                    review_ratio_fav, top_k, weight_ratings):
    """Train a custom-similarity SAR model on synthetic reviews and score it.

    Args:
        n_users: Value formatted into the review CSV file name.
        n_recipes_per_cuisine: Value formatted into the recipe CSV file name.
        review_ratio: Value formatted into the review CSV file name.
        review_ratio_fav: Value formatted into the review CSV file name.
        top_k: Iterable of cut-offs; ``accuracy_metric`` is called once per
            entry.
        weight_ratings: Weight passed to ``model.fit`` under the
            ``"ratings"`` key.

    Returns:
        tuple: ``(absolute_scores, relative_scores)``, two lists holding the
        two values returned by ``accuracy_metric`` for each entry of
        ``top_k``, in order.
    """
    data_type = 'synthetic'
    data_path = DATA_DIR + data_type
    features_fn = data_path + '/recipes/cuisine_size_{}.csv'.format(
        n_recipes_per_cuisine)
    review_fn = data_path + '/reviews/{}_users_{}_ratio_{}_ratiofav.csv'.format(
        n_users, review_ratio, review_ratio_fav)

    data = pd.read_csv(review_fn)
    features = pd.read_csv(features_fn)

    # Convert ingredients column to list
    features["clean_ingredients"] = features["clean_ingredients"].apply(
        lambda a: a.split("+"))

    # Convert the float precision to 32-bit in order to reduce memory
    # consumption. Direct column assignment is used because
    # ``data.loc[:, 'rating'] = ...`` sets values in place on recent pandas
    # and would keep the original dtype.
    data['rating'] = data['rating'].astype(np.float32)

    header = {
        "col_user": "******",
        "col_item": "recipe_id",
        "col_rating": "rating",
        "col_timestamp": "date",
        "col_prediction": "Prediction",
    }

    # Split train and test.
    train, test = python_stratified_split(
        data,
        ratio=0.80,
        col_user=header["col_user"],
        col_item=header["col_item"],
        seed=42,
    )

    # Add a dummy user reviewing every recipe in train. All dummy rows are
    # built at once and concatenated a single time: appending row by row
    # copies the whole frame per recipe, and ``DataFrame.append`` no longer
    # exists in pandas 2.x.
    dummy_reviews = pd.DataFrame({
        'username': "dummy",
        "recipe_id": features["recipe_id"].to_numpy(),
        "rating": 3,
    })
    if not dummy_reviews.empty:
        train = pd.concat([train, dummy_reviews], ignore_index=True)

    # build model
    model = SARSingleNode(
        similarity_type="custom",
        time_decay_coefficient=30,
        time_now=None,
        timedecay_formula=False,
        **header
    )

    def jaccard(a, b):
        # Jaccard similarity of the two collections viewed as sets.
        left, right = set(a), set(b)
        return len(left.intersection(right)) / len(left.union(right))

    print('\ttraining model')
    model.fit(train, features, "recipe_id", {
        "ratings": weight_ratings,
        "clean_ingredients": (1, jaccard),
    })

    print('\tscoring')
    # get score
    absolute_scores = []
    relative_scores = []
    for k in top_k:
        absolute, relative = accuracy_metric(model, test, k)
        absolute_scores.append(absolute)
        relative_scores.append(relative)

    return absolute_scores, relative_scores
exp_var) from reco_utils.recommender.sar import SAR # top k items to recommend TOP_K = 10 # Select MovieLens data size: 100k, 1m, 10m, or 20m MOVIELENS_DATA_SIZE = '100k' ratings = pd.read_csv('./data/ml-100k/ratings.csv') ratings.columns = ["userID", "itemID", "rating", "timestamp"] # ratings.head() train, test = python_stratified_split(ratings, ratio=0.75, col_user='******', col_item='itemID', seed=42) # print(""" # Train: # Total Ratings: {train_total} # Unique Users: {train_users} # Unique Items: {train_items} # Test: # Total Ratings: {test_total} # Unique Users: {test_users} # Unique Items: {test_items} # """.format( # train_total=len(train),
"Total number of items are\t{}".format(data[COL_ITEM].nunique()), sep="\n") # Change the format of the timestamp column orig_function = data.apply(lambda x: datetime.strftime( datetime(1970, 1, 1, 0, 0, 0) + timedelta(seconds=x[COL_TIMESTAMP].item()), "%Y-%m-%d %H:%M:%S"), axis=1) print(data.head(orig_function)) iso_8601_python = data.apply(lambda x: x[COL_TIMESTAMP].item().isoformat(), axis=1) print(data.head(iso_8601_python)) data[COL_TIMESTAMP] = data.apply(lambda x: x[COL_TIMESTAMP].item().isoformat(), axis=1) #%% data.head() # Stratified split the data data_train, data_test = python_stratified_split(data, filter_by="user", min_rating=10, ratio=0.7, col_user=COL_USER, col_item=COL_ITEM)
seed = args.seed logger.debug(f"Received parameters:") logger.debug(f"Ratio: {ratio}") logger.debug(f"User: {col_user}") logger.debug(f"Item: {col_item}") logger.debug(f"Seed: {seed}") logger.debug(f"Input path: {args.input_path}") logger.debug(f"Shape of loaded DataFrame: {input_df.shape}") logger.debug(f"Cols of DataFrame: {input_df.columns}") output_train, output_test = python_stratified_split( input_df, ratio=args.ratio, col_user=args.col_user, col_item=args.col_item, seed=args.seed, ) logger.debug(f"Output path: {args.output_train}") logger.debug(f"Output path: {args.output_test}") save_data_frame_to_directory( args.output_train, output_train, schema=DataFrameSchema.data_frame_to_dict(output_train), ) save_data_frame_to_directory( args.output_test, output_test,
# top k items to recommend TOP_K = 10 # select movieLens data size: 100k, 1m, 10m, or 20m MOVIELENS_DATA_SIZE = '100k' # model parameters N_FACTORS = 40 EPOCHS = 5 ratings = pd.read_csv('./data/ml-100k/ratings.csv') # split the dataset train_valid_df, test_df = python_stratified_split(ratings_df, ratio=0.75, min_rating=1, filter_by="item", col_user=USER, col_item=ITEM) data = CollabDataBunch.from_df(train_valid_df, user_name=USER, item_name=ITEM, rating_name=RATING, valid_pct=0) # data.show_batch() """Now we will create a `collab_learner` for the data, which by default uses the `EmbeddingDotBias` model. We will be using 40 latent factors. This will create an embedding for the users and the items that will map each of these to 40 floats as can be seen below. Note that the embedding parameters are not predefined, but are learned by the model.
title_col='Title') # Convert the float precision to 32-bit in order to reduce memory consumption data.loc[:, 'Rating'] = data['Rating'].astype(np.float32) header = { "col_user": "******", "col_item": "MovieId", "col_rating": "Rating", "col_timestamp": "Timestamp", "col_prediction": "Prediction", } train, test = python_stratified_split(data, ratio=0.75, col_user=header["col_user"], col_item=header["col_item"], seed=42) # set log level to INFO logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)-8s %(message)s') model = SARSingleNode(similarity_type="jaccard", time_decay_coefficient=30, time_now=None, timedecay_formula=True, **header) model.fit(train)