def split(opts: OptionReader):
    """Carve a test set out of a ratings file, one test rating per sampled user.

    Reads ``ratings.parquet`` from the dataset directory, selects
    ``opts.test_users`` users having at least ``opts.min_ratings`` ratings,
    holds out exactly one rating per selected user, and writes the holdout
    to ``<subdir>/test-ratings.parquet``.
    """
    name = dt.fname(opts.data)
    dset_dir = data_dir / name
    out_dir = dset_dir / opts.subdir

    # Derive and record the run's random seed from the global seed + key parts.
    seed = init_rng(rng_seed(), 'split-ratings', name, opts.subdir)
    _log.info('using random seed %s', seed)

    _log.info('reading ratings')
    all_ratings = pd.read_parquet(dset_dir / 'ratings.parquet')

    _log.info('counting users in %d ratings', len(all_ratings))
    profile_sizes = all_ratings.groupby('user')['item'].count()
    eligible = profile_sizes[profile_sizes >= opts.min_ratings]
    _log.info('selecting %d of %d candidate users (%d total)',
              opts.test_users, len(eligible), len(profile_sizes))
    chosen = eligible.sample(opts.test_users, random_state=rng(legacy=True))

    _log.info('selecting test ratings')
    chosen_rates = all_ratings[all_ratings['user'].isin(chosen.index)]
    # NOTE(review): the per-user sample(1) passes no random_state — presumably
    # the global NumPy state is seeded by init_rng above; confirm.
    test = chosen_rates.groupby('user').apply(lambda g: g.sample(1))
    test.reset_index('user', drop=True, inplace=True)

    _log.info('writing %d test ratings', len(test))
    test.to_parquet(out_dir / 'test-ratings.parquet', compression='snappy')
def make_graph(rng_spec=None):
    """Construct a TensorFlow graph seeded from an optional RNG spec.

    The spec is turned into a generator via ``util.rng`` and a 31-bit seed is
    drawn from it and installed on the graph, so graph construction is
    reproducible when a spec is given.
    """
    generator = util.rng(rng_spec)
    g = tf.Graph()
    g.seed = generator.integers(2**31 - 1)
    _log.info('using effective random seed %s (from %s)', g.seed, rng_spec)
    return g
def test_als_method_match():
    """BiasedMF's LU and CD solvers should produce closely matching predictions.

    Trains both solvers with the same seed, predicts for 15 random users x 15
    random items each, and checks the 90th percentile of the absolute
    prediction differences over ALL sampled pairs.
    """
    # NOTE(review): lu uses 15 iterations but cd uses 20 — looks accidental;
    # confirm whether both were meant to run 15.
    lu = als.BiasedMF(20, iterations=15, reg=(2, 0.001), method='lu', rng_spec=42)
    cd = als.BiasedMF(20, iterations=20, reg=(2, 0.001), method='cd', rng_spec=42)
    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cd.fit(ratings)
    timer.stop()
    _log.info('fit with CD solver in %s', timer)

    # Both solvers should agree exactly on the global bias term.
    assert lu.global_bias_ == approx(ratings.rating.mean())
    assert cd.global_bias_ == approx(ratings.rating.mean())

    preds = []
    rng = util.rng(42, legacy=True)
    for u in rng.choice(np.unique(ratings.user), 15, replace=False):
        items = rng.choice(np.unique(ratings.item), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cd_preds = cd.predict_for_user(u, items)
        diff = lu_preds - cd_preds
        adiff = np.abs(diff)
        _log.info(
            'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
            u, np.linalg.norm(diff, 2), np.min(adiff), np.median(adiff),
            np.max(adiff), np.quantile(adiff, 0.9))
        preds.append(
            pd.DataFrame({
                'user': u,
                'item': items,
                'lu': lu_preds,
                'cd': cd_preds,
                'adiff': adiff
            }))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CD preds:\n%s', preds.cd.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences. our check: the 90% are under a quarter star
    # BUG FIX: the original quantiled `adiff`, which holds only the LAST
    # user's differences after the loop; check the aggregate column instead.
    assert np.quantile(preds.adiff, 0.9) <= 0.25
def test_als_method_match():
    """ImplicitMF's LU and CG solvers should produce reasonably close predictions.

    Trains both solvers with the same seed, predicts for 10 random users x 15
    random items each, and checks the 90th percentile of the absolute
    prediction differences over ALL sampled pairs.
    """
    lu = als.ImplicitMF(20, iterations=15, method='lu', rng_spec=42)
    cg = als.ImplicitMF(20, iterations=15, method='cg', rng_spec=42)
    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cg.fit(ratings)
    timer.stop()
    _log.info('fit with CG solver in %s', timer)

    preds = []
    rng = util.rng(42, legacy=True)
    for u in rng.choice(ratings.user.unique(), 10, replace=False):
        items = rng.choice(ratings.item.unique(), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cg_preds = cg.predict_for_user(u, items)
        diff = lu_preds - cg_preds
        adiff = np.abs(diff)
        _log.info(
            'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
            u, np.linalg.norm(diff, 2), np.min(adiff), np.median(adiff),
            np.max(adiff), np.quantile(adiff, 0.9))
        preds.append(
            pd.DataFrame({
                'user': u,
                'item': items,
                'lu': lu_preds,
                'cg': cg_preds,
                'adiff': adiff
            }))
        _log.info('user %s tau: %s', u, stats.kendalltau(lu_preds, cg_preds))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    # BUG FIX: this log line previously said 'CD preds' but describes the CG
    # solver's predictions.
    _log.info('CG preds:\n%s', preds.cg.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences. our check: the 90% are reasonable
    # BUG FIX: the original quantiled `adiff`, which holds only the LAST
    # user's differences after the loop; check the aggregate column instead.
    assert np.quantile(preds.adiff, 0.9) < 0.5
def sample(options):
    """Sample users with sufficiently large, gender-resolvable profiles.

    Queries the ratings table for users with at least ``options.min_ratings``
    ratings of books whose first author's gender is known, samples
    ``options.sample_size`` of them, and writes the sampled users, their
    ratings, and summary stats into the dataset directory.
    """
    data = dt.fname(options.data)
    seed = init_rng(rng_seed(), 'sample-users', data)
    _log.info('using random seed %s', seed)

    ds = dt.datasets[data]
    # NOTE(review): min_ratings is interpolated directly into the SQL text —
    # acceptable only if it is guaranteed to be an integer; confirm that the
    # option parser enforces that, or switch to a bound parameter.
    kr_query = f'''
        SELECT r.user_id AS user, COUNT(book_id) AS profile_size
        FROM {ds.table} r
        JOIN cluster_first_author_gender g ON g.cluster = r.book_id
        WHERE gender = 'male' OR gender = 'female'
        GROUP BY r.user_id
        HAVING COUNT(book_id) >= {options.min_ratings}
    '''

    _log.info('loading users for %s', data)
    valid_users = pd.read_sql(kr_query, db_uri())
    _log.info('found %d viable profiles, sampling %d',
              len(valid_users), options.sample_size)
    sample = valid_users.sample(options.sample_size, random_state=rng(legacy=True))

    ddir = data_dir / data
    u_fn = ddir / 'sample-users.csv'
    _log.info('writing %s', u_fn)
    # FIX: write to the path we just logged instead of rebuilding it inline
    # (the rebuilt path was identical, but the duplication invited drift).
    sample.to_csv(u_fn, index=False)

    ratings = pd.read_parquet(ddir / 'ratings.parquet')
    ratings = pd.merge(sample[['user']], ratings)
    r_fn = ddir / 'sample-ratings.csv'
    _log.info('writing %d ratings to %s', len(ratings), r_fn)
    ratings.to_csv(r_fn, index=False)

    s_fn = ddir / 'sample-stats.json'
    _log.info('writing stats to %s', s_fn)
    s_fn.write_text(json.dumps({
        'viable': len(valid_users),
        'sampled': options.sample_size,
        'ratings': len(ratings)
    }))
def fit(self, ratings, **kwargs):
    """Train the model on a ratings frame and return ``self``.

    Builds the sparse user-item matrix from the ``user``/``item`` columns,
    constructs the model, trains it for ``self.epochs`` epochs on batched
    BPR inputs, and stores the fitted indexes and model on the estimator.
    """
    watch = util.Stopwatch()
    generator = util.rng(self.rng_spec)

    matrix, users, items = sparse_ratings(ratings[['user', 'item']])

    _log.info('[%s] setting up model', watch)
    train, model = self._build_model(len(users), len(items))

    _log.info('[%s] preparing training dataset', watch)
    train_data = BprInputs(matrix, self.batch_size, self.neg_count, generator)

    _log.info('[%s] training model', watch)
    train.fit(train_data, epochs=self.epochs)
    _log.info('[%s] model finished', watch)

    self.user_index_ = users
    self.item_index_ = items
    self.model = model
    return self
def legacy_rng():
    """Return a seeded legacy (RandomState-style) RNG for reproducible tests.

    BUG FIX: the keyword accepted by ``util.rng`` is ``legacy`` — every other
    call site in this file uses ``util.rng(42, legacy=True)`` — not
    ``legacy_rng``, which would raise a TypeError.
    """
    return util.rng(42, legacy=True)
def rng():
    """Return a deterministically seeded RNG for reproducible tests."""
    generator = util.rng(42)
    return generator