def test_als_method_match():
    """Check that the LU and CG solvers for ``als.ImplicitMF`` agree.

    Trains two ImplicitMF models on the same data with the same seed, differing
    only in the linear-system solver, then compares their predictions for a
    random sample of users and items.  The models are not expected to match
    exactly; the check is that the 90th percentile of the absolute prediction
    differences, pooled over *all* sampled users, stays below 0.5.
    """
    lu = als.ImplicitMF(20, iterations=15, method='lu', rng_spec=42)
    cg = als.ImplicitMF(20, iterations=15, method='cg', rng_spec=42)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cg.fit(ratings)
    timer.stop()
    _log.info('fit with CG solver in %s', timer)

    preds = []

    # legacy RNG so rng.choice works on pandas/NumPy arrays the same way
    rng = util.rng(42, legacy=True)
    for u in rng.choice(ratings.user.unique(), 10, replace=False):
        items = rng.choice(ratings.item.unique(), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cg_preds = cg.predict_for_user(u, items)
        diff = lu_preds - cg_preds
        adiff = np.abs(diff)
        _log.info(
            'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
            u, np.linalg.norm(diff, 2), np.min(adiff),
            np.median(adiff), np.max(adiff), np.quantile(adiff, 0.9))

        preds.append(
            pd.DataFrame({
                'user': u,
                'item': items,
                'lu': lu_preds,
                'cg': cg_preds,
                'adiff': adiff
            }))
        _log.info('user %s tau: %s', u, stats.kendalltau(lu_preds, cg_preds))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CD preds:\n%s', preds.cg.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences. our check: the 90% are reasonable
    # BUG FIX: the original asserted on `adiff`, which after the loop only holds
    # the last user's differences; check the pooled differences instead.
    assert np.quantile(preds.adiff, 0.9) < 0.5
def test_als_method_match():
    """Check that the LU and CD solvers for ``als.BiasedMF`` agree.

    Trains two BiasedMF models with the same seed and regularization but
    different solvers (and iteration counts — CD gets more iterations),
    verifies both learned the same global mean bias, then compares predictions
    for a random sample of users/items.  The check: the 90th percentile of the
    pooled absolute differences is at most about a quarter star.

    NOTE(review): this function shares its name with the ImplicitMF variant in
    this source; if both live in one module, the later definition shadows the
    earlier one under pytest collection — confirm they come from separate files.
    """
    lu = als.BiasedMF(20, iterations=15, reg=(2, 0.001), method='lu', rng_spec=42)
    cd = als.BiasedMF(20, iterations=20, reg=(2, 0.001), method='cd', rng_spec=42)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cd.fit(ratings)
    timer.stop()
    _log.info('fit with CD solver in %s', timer)

    # both models should have learned the same global mean
    assert lu.bias.mean_ == approx(ratings.rating.mean())
    assert cd.bias.mean_ == approx(ratings.rating.mean())

    preds = []

    rng = util.rng(42, legacy=True)
    for u in rng.choice(np.unique(ratings.user), 15, replace=False):
        items = rng.choice(np.unique(ratings.item), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cd_preds = cd.predict_for_user(u, items)
        diff = lu_preds - cd_preds
        adiff = np.abs(diff)
        _log.info(
            'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
            u, np.linalg.norm(diff, 2), np.min(adiff),
            np.median(adiff), np.max(adiff), np.quantile(adiff, 0.9))

        preds.append(pd.DataFrame({
            'user': u,
            'item': items,
            'lu': lu_preds,
            'cd': cd_preds,
            'adiff': adiff
        }))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CD preds:\n%s', preds.cd.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences. our check: the 90% are under a quarter star
    # BUG FIX: the original asserted on `adiff`, which after the loop only holds
    # the last user's differences; check the pooled differences instead.
    assert np.quantile(preds.adiff, 0.9) <= 0.27
def run_model(model, env, inst, cfg, *, var='gender'):
    """
    Run a STAN model.

    Samples the profile model for one institution's data, prints and saves the
    fit summary, extracts the samples to HDF5, and pickles the (model, fit)
    pair with zstd compression under ``data_dir / inst``.

    Args:
        model: a compiled STAN model exposing ``sampling``.
        env: environment object with a ``profiles`` frame indexed by institution.
        inst: institution key used to select profiles and name output files.
        cfg: extra keyword arguments passed through to ``model.sampling``.
        var: which variant to fit — ``'gender'`` (Known/female counts) or
            ``'dcode'`` (dcknown/dcyes counts).

    Raises:
        ValueError: if ``var`` is not one of the two supported variants.
    """
    # derive a per-(institution, variant) seed so runs are reproducible
    seed = stan_seed(inst, var)

    data = env.profiles.loc[inst, :]
    _log.info('running profile model on %d profiles for %s', len(data), inst)

    timer = Stopwatch()
    stan_data = {'J': len(data)}
    # select the trial (n) and success (y) columns for the requested variant;
    # the variant also determines the output-file prefix
    if var == 'gender':
        stan_data['n'] = data['Known']
        stan_data['y'] = data['female']
        out_pfx = 'profile'
    elif var == 'dcode':
        stan_data['n'] = data['dcknown']
        stan_data['y'] = data['dcyes']
        out_pfx = 'profile-dcode'
    else:
        raise ValueError(f'unknown variant {var}')

    fit = model.sampling(stan_data, seed=seed, check_hmc_diagnostics=True, **cfg)
    # NOTE(review): timer is never stop()ped — presumably Stopwatch reports
    # elapsed time when formatted; confirm against lenskit.util.Stopwatch.
    _log.info('profile sample for %s finished in %s', inst, timer)

    summary = fit.stansummary(pars=["mu", "sigma", "thetaP", "nP", "yP"])
    print(summary)
    (data_dir / inst / f'{out_pfx}-model.txt').write_text(summary)

    _log.info('extracting samples')
    samples = fit.extract(permuted=True)
    write_samples(data_dir / inst / f'{out_pfx}-samples.h5', samples)

    _log.info('pickling model and fit')
    # protocol 4 keeps the pickle readable by older Python versions
    with dt.zstd_write(data_dir / inst / f'{out_pfx}-fit.pkl.zstd') as ff:
        pickle.dump((model, fit), ff, protocol=4)
def inspect(opts):
    """Inspect a pickled model file: report its on-disk size, load time,
    memory footprint, and its re-pickled size with out-of-band (protocol-5)
    buffers, both raw and compressed.

    Args:
        opts: parsed options with a ``path`` attribute (a ``pathlib.Path``
            to the pickle file).
    """
    _log.info('inspecting file %s', opts.path)
    stat = opts.path.stat()
    _log.info('file size: %s (%s)', stat.st_size, binarysize(stat.st_size))

    timer = Stopwatch()
    with opts.path.open('rb') as f:
        model = pickle.load(f)
    timer.stop()
    # collect garbage before sampling RSS so the measurement reflects the
    # loaded model rather than transient load-time allocations
    gc.collect()
    res = resource.getrusage(resource.RUSAGE_SELF)
    _log.info('loaded model in %s', timer)
    # ru_maxrss is in kilobytes on Linux, hence * 1024 to get bytes
    _log.info('max RSS %s', binarysize(res.ru_maxrss * 1024))

    # re-pickle with protocol 5, capturing large buffers out-of-band in a
    # PBJar so their sizes can be reported separately from the pickle stream
    bufs = PBJar()
    timer = Stopwatch()
    p_bytes = pickle5.dumps(model, protocol=5, buffer_callback=bufs)
    timer.stop()
    bsize = bufs.total_size()
    _log.info('pickled to %d bytes in %s', len(p_bytes), timer)
    _log.info('with %d bytes of buffers', bsize)
    _log.info('total size: %s', binarysize(len(p_bytes) + bsize))
    # encoded_size presumably reports the buffers' compressed size — confirm
    # against the PBJar implementation
    _log.info('compresses to: %s', binarysize(len(p_bytes) + bufs.encoded_size()))
def do_measure(opts):
    """Evaluate saved recommendations against test data for one dataset.

    Loads ``data/<name>-test.parquet`` and ``data/<name>-recs.parquet``,
    computes nDCG and reciprocal rank per recommendation list (counting
    missing lists as zero), and prints per-algorithm means, list counts,
    and a reciprocal-rank summary.
    """
    name = opts['-d']
    _log.info('reading data %s', name)
    test = pd.read_parquet(f'data/{name}-test.parquet')
    recs = pd.read_parquet(f'data/{name}-recs.parquet')

    _log.info('setting up analysis')
    analysis = RecListAnalysis()
    for metric in (ndcg, recip_rank):
        analysis.add_metric(metric)

    sw = Stopwatch()
    results = analysis.compute(recs, test, include_missing=True)
    _log.info('analyzed in %s', sw)

    # users with no recommendations get zero scores rather than NaN
    results = results.fillna(0)
    by_algo = results.groupby('Algorithm')
    summary = by_algo.mean()
    summary['count'] = by_algo['nrecs'].count()
    _log.info('finished')

    print(summary)
    print(by_algo['recip_rank'].describe())
# Resolve the algorithm and dataset named on the command line, train the
# algorithm on the dataset's ratings, log resource usage, and pickle the
# trained model to a gzipped file.
# NOTE(review): `args`, `mod_name`, `out`, and possibly `resource` are defined
# earlier in this script (outside this chunk).
model = args.get('ALGO')
dsname = args.get('DATASET')

_log.info('importing from module %s', mod_name)
algorithms = importlib.import_module(mod_name)
_log.info('locating model %s', model)
algo = getattr(algorithms, model)
_log.info('locating data set %s', dsname)
data = getattr(datasets, dsname)

_log.info('loading ratings')
ratings = data.ratings
_log.info('training model')
# wrap the algorithm so it exposes the Recommender interface before fitting
algo = Recommender.adapt(algo)
timer = Stopwatch()
algo.fit(ratings)
timer.stop()
_log.info('trained model in %s', timer)
# `resource` is falsy when unavailable (e.g. on Windows) — skip usage report
if resource:
    res = resource.getrusage(resource.RUSAGE_SELF)
    # ru_maxrss is in kilobytes on Linux, hence / 1024 to get megabytes
    _log.info('%.2fs user, %.2fs system, %.1fMB max RSS',
              res.ru_utime, res.ru_stime, res.ru_maxrss / 1024)

# default output path derived from dataset and model names
if out is None:
    out = f'models/{dsname}-{model}.pkl.gz'

_log.info('writing to %s', out)
pathlib.Path(out).parent.mkdir(parents=True, exist_ok=True)
with gzip.open(out, 'wb') as f:
    # protocol 4 keeps the pickle readable by older Python versions
    pickle.dump(algo, f, 4)
# Script body: load a pickled recommender model and print top-n
# recommendations for each user named on the command line, optionally
# joining item metadata from a dataset given with -d.
from lkdemo import datasets, log
from lenskit.util import Stopwatch

_log = log.script(__file__)

args = docopt(__doc__)
n = int(args['-n'])

# item metadata is optional; without -d, raw item ids are printed
if args['-d']:
    _log.info('using data %s', args['-d'])
    data = getattr(datasets, args['-d'])
    items = data.movies
else:
    data = None
    items = None

_log.info('reading from %s', args['MODEL'])
with gzip.open(args['MODEL'], 'rb') as f:
    algo = pickle.load(f)

for u in args['USER']:
    u = int(u)
    timer = Stopwatch()
    _log.info('getting %d recs for user %d', n, u)
    recs = algo.recommend(u, n)
    if items is not None:
        # left join keeps every recommendation even if metadata is missing
        recs = recs.join(items, how='left', on='item')
    print('recommendations for', u)
    print(recs)
    _log.info('completed recommendations in %s', timer)