import json

import scipy.sparse as sp
from tqdm import tqdm

import config as cfg
from utils import ProductEncoder, get_shard_path, make_coo_row


def get_train_data(max_rows=None):
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)
    rows = []
    num_rows = 0
    for shard_idx in tqdm(range(cfg.NUM_SHARDS)):
        for js in tqdm(json.loads(s) for s in open(get_shard_path(shard_idx))):
            # One sparse row per client: the whole transaction history
            # collapsed into L1-normalized product counts.
            rows.append(
                make_coo_row(js["transaction_history"], product_encoder, normalize=True)
            )
            num_rows += 1
            if max_rows and num_rows == max_rows:
                return sp.vstack(rows)
    return sp.vstack(rows)
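# `make_coo_row` lives in utils and is not shown in this excerpt. For
# orientation, a minimal sketch of what it plausibly does; the body below is
# an assumption, not the repository's implementation:
import numpy as np


def make_coo_row_sketch(transaction_history, product_encoder, normalize=True):
    # Accumulate how often each product appears across the client's history.
    counts = np.zeros(product_encoder.num_products, dtype=np.float32)
    for transaction in transaction_history:
        for item in transaction.get("products", []):
            counts[product_encoder.toIdx(item["product_id"])] += 1
    if normalize and counts.sum() > 0:
        counts /= counts.sum()  # L1-normalize so each client row sums to 1
    return sp.coo_matrix(counts.reshape(1, -1))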
def collect_cooccur_matrix(shard_indices, product_encoder):
    """Count single-product occurrences and same-transaction co-occurrences."""
    num_products = product_encoder.num_products
    co_occurrence = np.zeros((num_products, num_products))
    occurrence = np.zeros(num_products)
    for shard_idx in tqdm(shard_indices):
        for js in tqdm((json.loads(s) for s in open(get_shard_path(shard_idx)))):
            for tid in js.get("transaction_history", []):
                product_ind = [
                    product_encoder.toIdx(item["product_id"])
                    for item in tid.get("products", [])
                ]
                for pid_num, pid in enumerate(product_ind):
                    occurrence[pid] += 1
                    # Count every unordered pair once per transaction and keep
                    # the matrix symmetric.
                    for co_pid in product_ind[pid_num + 1:]:
                        co_occurrence[co_pid, pid] += 1
                        co_occurrence[pid, co_pid] += 1
    return co_occurrence, occurrence
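# One hedged way these counts could be consumed (an assumption, not part of
# the original script): normalize pair counts by single-product frequency to
# get P(other | product) and read off "bought together" candidates. Note that
# co_occurrence is dense, so this only scales to catalogs of modest size.
product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)
co_occurrence, occurrence = collect_cooccur_matrix(range(cfg.NUM_SHARDS), product_encoder)
cond_prob = co_occurrence / np.maximum(occurrence, 1)[:, None]
example_idx = 0  # hypothetical product index
top_related = np.argsort(-cond_prob[example_idx])[:10]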
print("\t This is a new AP leader!") output_dir = output_root + "/ap/" os.makedirs(output_dir, exist_ok=True) print("\t Save state to {}".format(output_dir)) torch.save(user_model.state_dict(), output_dir + "/user_model.pth") torch.save(item_model.state_dict(), output_dir + "/item_model.pth") json.dump(stats, open(output_dir + "/stats.json", "w")) if __name__ == "__main__": np.random.seed(42) torch.manual_seed(43) product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) train_samples = collect_train_data( [get_shard_path(i) for i in range(8)], product_encoder, ) valid_samples = collect_train_data([get_shard_path(15)], product_encoder) dim = 512 user_model = UserModel(product_encoder.num_products, dim).cuda() item_model = ItemModel(product_encoder.num_products, dim).cuda() criterion = nn.CosineEmbeddingLoss(margin=0.01).cuda() optimizer = torch.optim.Adam(list(user_model.parameters()) + list(item_model.parameters()), lr=0.01) epoches = ([{ "num_batches": 512,
item_cost = item["s"] / max(item["quantity"], 1) if storage[key] == 0: storage[key] = item_cost else: storage[key] = (storage[key] + item_cost) / 2.0 if __name__ == "__main__": product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) num_products = product_encoder.num_products items_cost = defaultdict(int) rows = [] num_transactions = 0 for i in tqdm(range(cfg.NUM_SHARDS)): for js in tqdm((json.loads(s) for s in open(get_shard_path(i)))): update_item_cost(js["transaction_history"], product_encoder, items_cost) rows.append( make_coo_row( js["transaction_history"], product_encoder, normalize=False ) ) num_transactions += len(js["transaction_history"]) trans_mat = sp.vstack(rows) items_cnt = trans_mat.sum(axis=0).A[0] df_top_items = ( pd.Series(items_cnt, name="items_cnt").sort_values(ascending=False).to_frame() ) df_items_cost = pd.Series(items_cost, name="cost").to_frame() df_misc_features = df_top_items.join(df_items_cost)
    dfs.append(
        feather.read_dataframe(f"{TRAIN_DIR}/df_train_{num_shard}.feather")
    )

logger.info("Join chunks to full train dataframe")
df_gbm_train = pd.concat(dfs, sort=False)
logger.info(f"Shape of the train dataframe {df_gbm_train.shape}")
del dfs
gc.collect()

logger.info("Loading test dataset")
df_gbm_test = feather.read_dataframe(TEST_DIR / f"df_test_{NUM_TEST_SHARD}.feather")

# Collect ground-truth purchases for the held-out shard so predictions can be
# scored later.
gt_all_rec_test = []
for js in tqdm(
    (json.loads(s) for s in open(get_shard_path(NUM_TEST_SHARD))), leave=False
):
    target_products = set(product_encoder.toIdx(js["target"][0]["product_ids"]))
    gt_all_rec_test.append(
        dict(client_id=js["client_id"], products=list(target_products))
    )
logger.info(f"Shape of the test dataframe {df_gbm_test.shape}")

logger.info("Add query_id column")
# LightGBM ranking needs an integer query id per client.
df_gbm_train["query_id"] = df_gbm_train.groupby("client_id").ngroup()
df_gbm_test["query_id"] = df_gbm_test.groupby("client_id").ngroup()

logger.info("Build LGB datasets")
drop_cols = ["client_id", "target", "query_id"]
train_ds = lgb.Dataset(
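# The excerpt cuts off mid-call. For orientation, a hedged sketch of how such
# a ranking Dataset is typically finished (the label column and group
# construction are assumptions; group sizes must follow row order, so rows
# for each client are assumed contiguous):
train_ds = lgb.Dataset(
    df_gbm_train.drop(columns=drop_cols),
    label=df_gbm_train["target"],
    group=df_gbm_train.groupby("query_id", sort=False).size().to_numpy(),
)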
import json

import implicit
import scipy.sparse as sp
from tqdm import tqdm

import config as cfg
from utils import (
    ProductEncoder,
    get_shard_path,
    make_coo_row,
    normalized_average_precision,
)

if __name__ == "__main__":
    product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH)
    rows = []
    # Hold out the last shard for evaluation.
    for i in range(cfg.NUM_SHARDS - 1):
        for js in tqdm((json.loads(s) for s in open(get_shard_path(i)))):
            rows.append(
                make_coo_row(js["transaction_history"], product_encoder, normalize=True)
            )
    train_mat = sp.vstack(rows)

    model = implicit.nearest_neighbours.CosineRecommender(K=2)
    # model = implicit.nearest_neighbours.CosineRecommender(K=50)
    # model = implicit.nearest_neighbours.TFIDFRecommender(K=100)
    # ALS should be trained with normalize=False
    # model = implicit.als.AlternatingLeastSquares(factors=16, regularization=1e-5, iterations=12)
    # implicit expects an item-user matrix for fit, hence the transpose.
    model.fit(train_mat.T)

    out_dir = cfg.ASSETS_DIR
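# A hedged inference sketch (implicit's recommend signature changed across
# versions; this follows the 0.4.x convention where user_items is the full
# user-item CSR matrix and userid indexes one of its rows; toPid is assumed
# to be the inverse of toIdx):
csr_train = train_mat.tocsr()
user_row = 0  # hypothetical client index
raw_recs = model.recommend(
    userid=user_row,
    user_items=csr_train,
    N=30,
    filter_already_liked_items=False,
)
recommended = [product_encoder.toPid(idx) for idx, _score in raw_recs]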