Example #1
import json

import scipy.sparse as sp
from tqdm import tqdm

import config as cfg
from utils import ProductEncoder, get_shard_path, make_coo_row


def get_train_data(max_rows=None):
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)

    rows = []
    num_rows = 0
    for shard_idx in tqdm(range(cfg.NUM_SHARDS)):
        with open(get_shard_path(shard_idx)) as f:
            for js in tqdm(json.loads(s) for s in f):
                rows.append(
                    make_coo_row(js["transaction_history"],
                                 product_encoder,
                                 normalize=True))
                num_rows += 1

                # Early exit once the requested number of rows is collected.
                if max_rows and num_rows == max_rows:
                    return sp.vstack(rows)

    return sp.vstack(rows)
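A minimal usage sketch, not part of the original file: it assumes the imports above and caps collection for a quick smoke test.

# Hypothetical smoke test: stop after 1,000 client rows.
train_mat = get_train_data(max_rows=1000)
print(train_mat.shape)  # (1000, number of products in the encoder)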
Example #2
import json

import numpy as np
from tqdm import tqdm

from utils import get_shard_path


def collect_cooccur_matrix(shard_indices, product_encoder):
    num_products = product_encoder.num_products
    co_occurrence = np.zeros((num_products, num_products))
    occurrence = np.zeros(num_products)
    for shard_idx in tqdm(shard_indices):
        with open(get_shard_path(shard_idx)) as f:
            for js in tqdm(json.loads(s) for s in f):
                tids = js.get("transaction_history", [])
                for tid in tids:
                    product_ind = [
                        product_encoder.toIdx(item["product_id"])
                        for item in tid.get("products", [])
                    ]
                    # Count each occurrence, and count every unordered pair of
                    # co-purchased products symmetrically.
                    for pid_num, pid in enumerate(product_ind):
                        occurrence[pid] += 1
                        for co_pid in product_ind[pid_num + 1:]:
                            co_occurrence[co_pid, pid] += 1
                            co_occurrence[pid, co_pid] += 1
    return co_occurrence, occurrence
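A sketch of one way to consume the returned counts (an assumption, not code from the original script): row-normalizing the co-occurrence matrix gives conditional co-purchase frequencies P(j | i).

import numpy as np

# product_encoder built as in the previous example; the shard range is hypothetical.
co, occ = collect_cooccur_matrix(range(4), product_encoder)
# Of the baskets containing product i, the fraction also containing product j;
# np.maximum guards against division by zero for never-seen products.
cond_prob = co / np.maximum(occ, 1)[:, None]
top10 = np.argsort(-cond_prob[42])[:10]  # strongest co-purchases for item 42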
Example #3
        print("\t This is a new AP leader!")
        output_dir = output_root + "/ap/"
        os.makedirs(output_dir, exist_ok=True)
        print("\t Save state to {}".format(output_dir))
        torch.save(user_model.state_dict(), output_dir + "/user_model.pth")
        torch.save(item_model.state_dict(), output_dir + "/item_model.pth")
        json.dump(stats, open(output_dir + "/stats.json", "w"))


if __name__ == "__main__":
    np.random.seed(42)
    torch.manual_seed(43)

    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)
    train_samples = collect_train_data(
        [get_shard_path(i) for i in range(8)],
        product_encoder,
    )
    valid_samples = collect_train_data([get_shard_path(15)], product_encoder)

    dim = 512
    user_model = UserModel(product_encoder.num_products, dim).cuda()
    item_model = ItemModel(product_encoder.num_products, dim).cuda()

    criterion = nn.CosineEmbeddingLoss(margin=0.01).cuda()
    optimizer = torch.optim.Adam(list(user_model.parameters()) +
                                 list(item_model.parameters()),
                                 lr=0.01)

    epochs = ([{
        "num_batches": 512,
Example #4
            item_cost = item["s"] / max(item["quantity"], 1)
            if storage[key] == 0:
                storage[key] = item_cost
            else:
                storage[key] = (storage[key] + item_cost) / 2.0
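The branch above implements an exponential moving average with decay 0.5: after observing unit prices c1, c2, c3 the stored value is c1/4 + c2/4 + c3/2, so recent prices dominate. A tiny self-contained check (not from the original script):

storage = {"sku": 0}
for cost in (100.0, 80.0, 60.0):  # observed unit prices, oldest first
    if storage["sku"] == 0:
        storage["sku"] = cost
    else:
        storage["sku"] = (storage["sku"] + cost) / 2.0
print(storage["sku"])  # 75.0 == 100/4 + 80/4 + 60/2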


if __name__ == "__main__":
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)
    num_products = product_encoder.num_products

    items_cost = defaultdict(int)
    rows = []
    num_transactions = 0
    for i in tqdm(range(cfg.NUM_SHARDS)):
        with open(get_shard_path(i)) as f:
            for js in tqdm(json.loads(s) for s in f):
                update_item_cost(js["transaction_history"], product_encoder, items_cost)
                rows.append(
                    make_coo_row(
                        js["transaction_history"], product_encoder, normalize=False
                    )
                )
                num_transactions += len(js["transaction_history"])
    trans_mat = sp.vstack(rows)

    # Per-product purchase counts over all clients (np.matrix -> flat array).
    items_cnt = trans_mat.sum(axis=0).A[0]
    df_top_items = (
        pd.Series(items_cnt, name="items_cnt").sort_values(ascending=False).to_frame()
    )
    df_items_cost = pd.Series(items_cost, name="cost").to_frame()
    df_misc_features = df_top_items.join(df_items_cost)
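Both Series are indexed by the encoded product index, so the join lines up each product's purchase count with its average observed cost. A hedged sketch of persisting the result for a later stage; the filename is an assumption, not from the original:

# feather needs a plain RangeIndex, hence reset_index; the path is hypothetical.
df_misc_features.reset_index().rename(columns={"index": "product_idx"}).to_feather(
    str(cfg.ASSETS_DIR) + "/misc_features.feather"
)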
Example #5
            # Excerpt begins mid-loop: appending one feather chunk per train shard.
            dfs.append(
                feather.read_dataframe(f"{TRAIN_DIR}/df_train_{num_shard}.feather")
            )

    logger.info("Join chunks to full train dataframe")
    df_gbm_train = pd.concat(dfs, sort=False)
    logger.info(f"Shape of the train dataframe {df_gbm_train.shape}")

    del dfs
    gc.collect()

    logger.info("Loading test dataset")
    df_gbm_test = feather.read_dataframe(TEST_DIR / f"df_test_{NUM_TEST_SHARD}.feather")
    gt_all_rec_test = []
    # Collect ground-truth product sets for every client in the test shard.
    with open(get_shard_path(NUM_TEST_SHARD)) as f:
        for js in tqdm((json.loads(s) for s in f), leave=False):
            target_products = set(product_encoder.toIdx(js["target"][0]["product_ids"]))
            gt_products = dict(client_id=js["client_id"], products=list(target_products))
            gt_all_rec_test.append(gt_products)
    logger.info(f"Shape of the test dataframe {df_gbm_test.shape}")

    logger.info("Add query_id column")
    df_gbm_train["query_id"] = df_gbm_train.groupby("client_id").ngroup()
    df_gbm_test["query_id"] = df_gbm_test.groupby("client_id").ngroup()

    logger.info("Build LGB datasets")
    drop_cols = ["client_id", "target", "query_id"]
    train_ds = lgb.Dataset(
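The excerpt breaks off mid-call. For context, a hedged sketch of how a LightGBM ranking Dataset with per-query groups is typically assembled; the label column name and the row ordering by query_id are assumptions:

train_ds = lgb.Dataset(
    df_gbm_train.drop(columns=drop_cols),
    label=df_gbm_train["target"],
    # Group sizes must match row order, so rows are assumed sorted by query_id.
    group=df_gbm_train.groupby("query_id").size().to_numpy(),
)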
Example #6
import json

import implicit
import scipy.sparse as sp
from tqdm import tqdm

import config as cfg
from utils import (
    ProductEncoder,
    get_shard_path,
    make_coo_row,
    normalized_average_precision,
)

if __name__ == "__main__":
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)

    rows = []
    # Hold out the last shard for validation; train on the remaining ones.
    for i in range(cfg.NUM_SHARDS - 1):
        with open(get_shard_path(i)) as f:
            for js in tqdm(json.loads(s) for s in f):
                rows.append(
                    make_coo_row(js["transaction_history"],
                                 product_encoder,
                                 normalize=True))
    train_mat = sp.vstack(rows)

    model = implicit.nearest_neighbours.CosineRecommender(K=2)
    # model = implicit.nearest_neighbours.CosineRecommender(K=50)
    # model = implicit.nearest_neighbours.TFIDFRecommender(K=100)

    # ALS should be trained with normalize = False
    # model = implicit.als.AlternatingLeastSquares(factors=16, regularization=1e-5, iterations=12)
    # implicit (pre-0.5 API) expects an item-user matrix, hence the transpose.
    model.fit(train_mat.T)

    out_dir = cfg.ASSETS_DIR
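A hedged inference sketch for the fitted model, assuming the pre-0.5 implicit API implied by the fit(train_mat.T) call above:

# Recommend 30 products for the first client row; user_items is the user-item
# CSR matrix whose row/column indices the model was trained against.
user_items = train_mat.tocsr()
recs = model.recommend(0, user_items, N=30, filter_already_liked_items=False)
recommended_idxs = [idx for idx, score in recs]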