Example #1
def test_stratified_splitter(test_specs, python_dataset):
    splits = python_stratified_split(python_dataset,
                                     ratio=test_specs["ratio"],
                                     min_rating=10,
                                     filter_by="user")

    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"])
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"])

    for split in splits:
        assert set(split.columns) == set(python_dataset.columns)

    # Test that train and test contain the same set of users, since the split is stratified by user.
    users_train = splits[0][DEFAULT_USER_COL].unique()
    users_test = splits[1][DEFAULT_USER_COL].unique()

    assert set(users_train) == set(users_test)

    splits = python_stratified_split(python_dataset,
                                     ratio=test_specs["ratios"],
                                     min_rating=10,
                                     filter_by="user")

    assert len(splits) == 3
    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"])
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"])
    assert len(splits[2]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"])

    for split in splits:
        assert set(split.columns) == set(python_dataset.columns)
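The test above relies on two pytest fixtures, test_specs and python_dataset, supplied by the surrounding suite. A minimal sketch of what they might look like; every value here is an assumption, not the suite's actual fixtures:

import numpy as np
import pandas as pd
import pytest

from reco_utils.common.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL


@pytest.fixture(scope="module")
def test_specs():
    # Hypothetical numbers; the real suite defines its own ratios/tolerance
    return {
        "number_of_rows": 1000,
        "ratio": 0.6,
        "ratios": [0.2, 0.3, 0.5],
        "tolerance": 0.01,
    }


@pytest.fixture(scope="module")
def python_dataset(test_specs):
    # Synthetic ratings frame with the standard reco_utils column names
    rng = np.random.default_rng(42)
    rows = test_specs["number_of_rows"]
    return pd.DataFrame({
        DEFAULT_USER_COL: rng.integers(1, 50, rows),
        DEFAULT_ITEM_COL: rng.integers(1, 100, rows),
        "rating": rng.integers(1, 6, rows).astype(np.float32),
        "timestamp": rng.integers(10**8, 10**9, rows),
    })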
Example #2
def test_model_lightgcn(resource_path):
    data_path = os.path.join(resource_path, "..", "resources", "deeprec",
                             "dkn")
    yaml_file = os.path.join(
        resource_path,
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )
    user_file = os.path.join(data_path, r"user_embeddings.csv")
    item_file = os.path.join(data_path, r"item_embeddings.csv")

    df = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(df, ratio=0.75)

    data = ImplicitCF(train=train, test=test)

    hparams = prepare_hparams(yaml_file, epochs=1)
    model = LightGCN(hparams, data)

    assert model.run_eval() is not None
    model.fit()
    assert model.recommend_k_items(test) is not None
    model.infer_embedding(user_file, item_file)
    assert os.path.getsize(user_file) != 0
    assert os.path.getsize(item_file) != 0
Example #3
def test_lightgcn_component_definition(resource_path):
    yaml_file = os.path.join(
        resource_path,
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )

    df = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(df, ratio=0.75)

    data = ImplicitCF(train=train, test=test)

    embed_size = 64
    hparams = prepare_hparams(yaml_file, embed_size=embed_size)
    model = LightGCN(hparams, data)

    assert model.norm_adj is not None
    assert model.ua_embeddings.shape == [data.n_users, embed_size]
    assert model.ia_embeddings.shape == [data.n_items, embed_size]
    assert model.u_g_embeddings is not None
    assert model.pos_i_g_embeddings is not None
    assert model.neg_i_g_embeddings is not None
    assert model.batch_ratings is not None
    assert model.loss is not None
    assert model.opt is not None
Example #5
def SARtrain():
    data = pd.read_csv("SnacksData100.csv")
    data.loc[:, 'Ratings'] = data['Ratings'].astype(np.float32)
    header = {
        "col_user": "******",
        "col_item": "Product_Id",
        "col_rating": "Ratings",
        "col_timestamp": "timestamp",
    }
    train, test = python_stratified_split(data,
                                          ratio=0.75,
                                          col_user=header["col_user"],
                                          col_item=header["col_item"],
                                          seed=42)
    joblib.dump(test, 'testdata')
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')
    model = SARSingleNode(similarity_type="jaccard",
                          time_decay_coefficient=30,
                          time_now=None,
                          timedecay_formula=True,
                          **header)
    model.fit(train)
    joblib.dump(model, 'SARDump')
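Since SARtrain() persists both the fitted model and the held-out split with joblib, a separate scoring step can reload them later. A minimal sketch of that step, assuming the reco_utils SARSingleNode API; the top_k value is an assumption:

import joblib

# Reload the artifacts written by SARtrain()
model = joblib.load('SARDump')
test = joblib.load('testdata')

# Top-10 recommendations per user, excluding items already seen in training
top_k = model.recommend_k_items(test, top_k=10, remove_seen=True)
print(top_k.head())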
Example #6
def train_and_score(n_users, n_recipes_per_cuisine, review_ratio,
                    review_ratio_fav, top_k, weight_ratings):

    data_type = 'synthetic'
    data_path = DATA_DIR + data_type

    features_fn = data_path + '/recipes/cuisine_size_{}.csv'.format(
        n_recipes_per_cuisine)
    review_fn = data_path + '/reviews/{}_users_{}_ratio_{}_ratiofav.csv'.format(
        n_users, review_ratio, review_ratio_fav)

    data = pd.read_csv(review_fn)
    features = pd.read_csv(features_fn)

    # Convert ingredients column to list
    features["clean_ingredients"] = features["clean_ingredients"].apply(
        lambda a: a.split("+"))

    # Convert the float precision to 32-bit in order to reduce memory consumption
    data.loc[:, 'rating'] = data['rating'].astype(np.float32)

    header = {
        "col_user": "******",
        "col_item": "recipe_id",
        "col_rating": "rating",
        "col_timestamp": "date",
        "col_prediction": "Prediction",
    }

    # Split train and test, then add a dummy user who reviews every recipe in train
    train, test = python_stratified_split(data,
                                          ratio=0.80,
                                          col_user=header["col_user"],
                                          col_item=header["col_item"],
                                          seed=42)
    for r in features["recipe_id"]:
        dummy = pd.DataFrame([["dummy", r, 3]],
                             columns=['username', "recipe_id", "rating"])
        # DataFrame.append was removed in pandas 2.0; concat is the supported idiom
        train = pd.concat([train, dummy], ignore_index=True)

    # build model
    model = SARSingleNode(similarity_type="custom",
                          time_decay_coefficient=30,
                          time_now=None,
                          timedecay_formula=False,
                          **header)

    jaccard = lambda a, b: len(set(a).intersection(set(b))) / len(
        set(a).union(set(b)))

    print('\ttraining model')

    model.fit(train, features, "recipe_id", {
        "ratings": weight_ratings,
        "clean_ingredients": (1, jaccard)
    })

    print('\tscoring')

    # get score
    absolute_scores = []
    relative_scores = []
    for i in top_k:
        absolute, relative = accuracy_metric(model, test, i)
        absolute_scores.append(absolute)
        relative_scores.append(relative)

    return absolute_scores, relative_scores
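The custom similarity handed to model.fit above is plain Jaccard over ingredient lists: the size of the intersection divided by the size of the union. A quick worked example:

jaccard = lambda a, b: len(set(a).intersection(set(b))) / len(set(a).union(set(b)))

# 2 shared ingredients out of 4 distinct ones -> 0.5
print(jaccard(["salt", "flour", "egg"], ["salt", "egg", "milk"]))  # 0.5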
Example #7
from reco_utils.evaluation.python_evaluation import (map_at_k, ndcg_at_k,
                                                     precision_at_k, recall_at_k,
                                                     rmse, mae, rsquared,
                                                     exp_var)
from reco_utils.recommender.sar import SAR

# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

ratings = pd.read_csv('./data/ml-100k/ratings.csv')
ratings.columns = ["userID", "itemID", "rating", "timestamp"]
# ratings.head()

train, test = python_stratified_split(ratings,
                                      ratio=0.75,
                                      col_user='userID',
                                      col_item='itemID',
                                      seed=42)

# print("""
# Train:
# Total Ratings: {train_total}
# Unique Users: {train_users}
# Unique Items: {train_items}

# Test:
# Total Ratings: {test_total}
# Unique Users: {test_users}
# Unique Items: {test_items}
# """.format(
#     train_total=len(train),
Example #8
      "Total number of items are\t{}".format(data[COL_ITEM].nunique()),
      sep="\n")

# Change the format of the timestamp column
orig_function = data.apply(lambda x: datetime.strftime(
    datetime(1970, 1, 1, 0, 0, 0) + timedelta(seconds=x[COL_TIMESTAMP].item()),
    "%Y-%m-%d %H:%M:%S"),
                           axis=1)

print(orig_function.head())

# Same conversion, but to ISO-8601 via datetime.utcfromtimestamp
iso_8601_python = data.apply(
    lambda x: datetime.utcfromtimestamp(x[COL_TIMESTAMP].item()).isoformat(),
    axis=1)

print(iso_8601_python.head())

data[COL_TIMESTAMP] = data.apply(
    lambda x: datetime.utcfromtimestamp(x[COL_TIMESTAMP].item()).isoformat(),
    axis=1)

#%%

data.head()

# Stratified split the data
data_train, data_test = python_stratified_split(data,
                                                filter_by="user",
                                                min_rating=10,
                                                ratio=0.7,
                                                col_user=COL_USER,
                                                col_item=COL_ITEM)
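The row-wise apply used earlier in this example can also be done with pandas' vectorized datetime conversion. A sketch of the equivalent one-liner, assuming COL_TIMESTAMP holds epoch seconds:

import pandas as pd

# Epoch seconds -> ISO-8601 strings without a per-row Python lambda
data[COL_TIMESTAMP] = pd.to_datetime(data[COL_TIMESTAMP],
                                     unit="s").dt.strftime("%Y-%m-%dT%H:%M:%S")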
Example #9
    seed = args.seed

    logger.debug(f"Received parameters:")
    logger.debug(f"Ratio:    {ratio}")
    logger.debug(f"User:    {col_user}")
    logger.debug(f"Item:    {col_item}")
    logger.debug(f"Seed:    {seed}")

    logger.debug(f"Input path: {args.input_path}")
    logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
    logger.debug(f"Cols of DataFrame: {input_df.columns}")

    output_train, output_test = python_stratified_split(
        input_df,
        ratio=args.ratio,
        col_user=args.col_user,
        col_item=args.col_item,
        seed=args.seed,
    )

    logger.debug(f"Output path: {args.output_train}")
    logger.debug(f"Output path: {args.output_test}")

    save_data_frame_to_directory(
        args.output_train,
        output_train,
        schema=DataFrameSchema.data_frame_to_dict(output_train),
    )
    save_data_frame_to_directory(
        args.output_test,
        output_test,
        schema=DataFrameSchema.data_frame_to_dict(output_test),
    )
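The args object used throughout this example would come from an argument parser defined earlier in the module. A hypothetical sketch consistent with the fields referenced above; the flag names and defaults are assumptions:

import argparse

# Hypothetical parser; the real module defines its own flag names and defaults
parser = argparse.ArgumentParser()
parser.add_argument("--input-path", type=str)
parser.add_argument("--ratio", type=float, default=0.75)
parser.add_argument("--col-user", type=str, default="UserId")
parser.add_argument("--col-item", type=str, default="MovieId")
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--output-train", type=str)
parser.add_argument("--output-test", type=str)
args = parser.parse_args()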
Example #10
# top k items to recommend
TOP_K = 10

# select movieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# model parameters
N_FACTORS = 40
EPOCHS = 5

ratings_df = pd.read_csv('./data/ml-100k/ratings.csv')

# split the dataset
train_valid_df, test_df = python_stratified_split(ratings_df,
                                                  ratio=0.75,
                                                  min_rating=1,
                                                  filter_by="item",
                                                  col_user=USER,
                                                  col_item=ITEM)

data = CollabDataBunch.from_df(train_valid_df,
                               user_name=USER,
                               item_name=ITEM,
                               rating_name=RATING,
                               valid_pct=0)
# data.show_batch()
"""Now we will create a `collab_learner` for the data, which by default uses 
the `EmbeddingDotBias` model. We will be using 40 latent factors. This will 
create an embedding for the users and the items that will map each of these 
to 40 floats as can be seen below. Note that the embedding parameters are not 
predefined, but are learned by the model.
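The snippet is cut off right after that docstring; a minimal sketch of the step it describes, assuming fastai v1's collab_learner (the y_range and learning rate below are assumptions):

from fastai.collab import collab_learner

# EmbeddingDotBias learner with N_FACTORS latent factors per user and item;
# y_range squashes predictions into the rating scale via a sigmoid
learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0, 5.5], wd=1e-1)
learn.fit_one_cycle(EPOCHS, max_lr=5e-3)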
Example #11
        title_col='Title')

    # Convert the float precision to 32-bit in order to reduce memory consumption
    data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)

    header = {
        "col_user": "******",
        "col_item": "MovieId",
        "col_rating": "Rating",
        "col_timestamp": "Timestamp",
        "col_prediction": "Prediction",
    }

    train, test = python_stratified_split(data,
                                          ratio=0.75,
                                          col_user=header["col_user"],
                                          col_item=header["col_item"],
                                          seed=42)

    # set log level to INFO
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')

    model = SARSingleNode(similarity_type="jaccard",
                          time_decay_coefficient=30,
                          time_now=None,
                          timedecay_formula=True,
                          **header)

    model.fit(train)
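After fit, the model would typically be scored top-k style next. A minimal sketch using reco_utils' evaluation helpers, assuming model, test, and header from the example above are still in scope and k=10:

from reco_utils.evaluation.python_evaluation import precision_at_k

# Top-k recommendations per user, excluding items seen during training
top_k = model.recommend_k_items(test, top_k=10, remove_seen=True)

# Ranking metric against the held-out test interactions
precision = precision_at_k(test, top_k,
                           col_user=header["col_user"],
                           col_item=header["col_item"],
                           col_rating=header["col_rating"],
                           col_prediction=header["col_prediction"],
                           k=10)
print("Precision@10:", precision)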