def measure_algo(options: OptionReader):
    afn = options.algo_fn
    lfile = data_dir / options.data / 'eval' / f'{afn}.log'
    opt = data_dir / options.data / 'tuning' / f'{afn}.json'
    seed = init_rng(rng_seed(), 'eval-algo', options.data, options.algo)
    _log.info('using random seed %s', seed)

    with LogFile(lfile):
        if options.time:
            train, test = _load_time_ratings(options)
        else:
            train, test = _load_ratings(options)

        if options.pretrained:
            mfn = data_dir / options.data / 'eval' / f'{afn}.model'
            _log.info('loading model from %s', mfn)
            with mfn.open('rb') as f:
                algo = pickle.load(f)
        else:
            # a missing rating column means implicit-feedback data
            implicit = 'rating' not in train.columns
            algo = get_algorithm(options.data, options.algo,
                                 None if options.default else opt,
                                 implicit)
            algo = _train_algo(options.data, algo, train)

        if options.rerank:
            gender = getBookGender()
            _measure_rerank(algo, train, test, options.n, afn, gender)
        else:
            _measure_raw(algo, test, options.n, afn)

def split(opts: OptionReader):
    data = dt.fname(opts.data)
    ddir = data_dir / data
    tdir = ddir / opts.subdir
    seed = init_rng(rng_seed(), 'split-ratings', data, opts.subdir)
    _log.info('using random seed %s', seed)

    _log.info('reading ratings')
    ratings = pd.read_parquet(ddir / 'ratings.parquet')
    _log.info('counting users in %d ratings', len(ratings))
    users = ratings.groupby('user')['item'].count()
    candidates = users[users >= opts.min_ratings]

    _log.info('selecting %d of %d candidate users (%d total)',
              opts.test_users, len(candidates), len(users))
    sample = candidates.sample(opts.test_users, random_state=rng(legacy=True))

    _log.info('selecting test ratings')
    u_rates = ratings[ratings['user'].isin(sample.index)]
    # pick one rating per sampled user; dropping the group level keeps each
    # row's original index into the ratings frame
    test = u_rates.groupby('user').apply(lambda df: df.sample(1))
    test.reset_index('user', drop=True, inplace=True)

    # ensure the output directory exists before writing
    tdir.mkdir(parents=True, exist_ok=True)
    _log.info('writing %d test ratings', len(test))
    test.to_parquet(tdir / 'test-ratings.parquet', compression='snappy')

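# Because the held-out rows keep their original index into ratings.parquet,
# a later stage can rebuild the training set with a boolean index mask. A
# minimal sketch of that round trip (mirroring the mask logic in train()
# below; `ddir` and `tdir` as defined in split()):
ratings = pd.read_parquet(ddir / 'ratings.parquet')
test = pd.read_parquet(tdir / 'test-ratings.parquet')
train_mask = pd.Series(True, index=ratings.index)
train_mask[test.index] = False
train_ratings = ratings[train_mask]
assert len(train_ratings) == len(ratings) - len(test)
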
def train(options: TrainOptions):
    seed = init_rng(rng_seed(), 'train-model', options.data, options.algo)
    _log.info('using random seed %s', seed)
    ddir = data_dir / options.data
    rating_file = ddir / 'ratings.parquet'

    if options.train_data == 'all':
        mdir = ddir / 'models'
    elif options.train_data == 'eval':
        mdir = ddir / 'eval'
    else:
        raise ValueError(f'unknown training data {options.train_data}')
    mdir.mkdir(parents=True, exist_ok=True)
    mfn = mdir / f'{options.algo_fn}.model'

    if options.default:
        _log.warning('using default settings')
        opt_fn = None
    else:
        opt_fn = ddir / 'tuning' / f'{options.algo_fn}.json'
        _log.info('using algorithm optimization results %s', opt_fn)

    with LogFile(mdir / f'{options.algo_fn}.log'):
        _log.info('reading ratings from %s', rating_file)
        ratings = pd.read_parquet(rating_file)
        if options.drop_ratings and 'rating' in ratings.columns:
            _log.info('dropping rating column')
            ratings = ratings.drop(columns=['rating'])

        if options.train_data == 'eval':
            _log.info('reading test data')
            test = pd.read_parquet(ddir / 'eval' / 'test-ratings.parquet')
            # the test frame keeps its original index into the ratings frame,
            # so the held-out rows can be masked out by index
            train_mask = pd.Series(True, index=ratings.index)
            train_mask[test.index] = False
            ratings = ratings[train_mask].copy().reset_index(drop=True)

        implicit = 'rating' not in ratings.columns
        _log.info('loading algorithm %s for %s in %s mode',
                  options.algo, options.data,
                  'implicit' if implicit else 'explicit')
        algo = get_algorithm(options.data, options.algo, opt_fn, implicit)
        algo = Recommender.adapt(algo)

        _log.info('training %s on %d ratings', algo, len(ratings))
        timer = Stopwatch()
        model = algo.fit(ratings)
        timer.stop()
        _log.info('trained in %s', timer)

        _log.info('saving model to %s', mfn)
        with open(mfn, 'wb') as f:
            p = dt.CompactingPickler(f, protocol=4)
            p.dump(model)

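# After Recommender.adapt, the fitted object exposes LensKit's recommend API,
# and the pickled model can be read back with plain pickle.load (as the
# pretrained branch of measure_algo above does). A hypothetical usage sketch;
# the user id and list length are illustrative, not from the pipeline:
with open(mfn, 'rb') as f:
    model = pickle.load(f)
recs = model.recommend(42, n=50)   # top-50 recommendations for user 42
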
def sample(options):
    data = dt.fname(options.data)
    seed = init_rng(rng_seed(), 'sample-users', data)
    _log.info('using random seed %s', seed)
    ds = dt.datasets[data]

    kr_query = f'''
        SELECT r.user_id AS user, COUNT(book_id) AS profile_size
        FROM {ds.table} r
        JOIN cluster_first_author_gender g ON g.cluster = r.book_id
        WHERE gender = 'male' OR gender = 'female'
        GROUP BY r.user_id
        HAVING COUNT(book_id) >= {options.min_ratings}
    '''

    _log.info('loading users for %s', data)
    valid_users = pd.read_sql(kr_query, db_uri())
    _log.info('found %d viable profiles, sampling %d',
              len(valid_users), options.sample_size)
    sample = valid_users.sample(options.sample_size, random_state=rng(legacy=True))

    ddir = data_dir / data
    u_fn = ddir / 'sample-users.csv'
    _log.info('writing %s', u_fn)
    sample.to_csv(u_fn, index=False)

    ratings = pd.read_parquet(ddir / 'ratings.parquet')
    ratings = pd.merge(sample[['user']], ratings)
    r_fn = ddir / 'sample-ratings.csv'
    _log.info('writing %d ratings to %s', len(ratings), r_fn)
    ratings.to_csv(r_fn, index=False)

    s_fn = ddir / 'sample-stats.json'
    _log.info('writing stats to %s', s_fn)
    s_fn.write_text(json.dumps({
        'viable': len(valid_users),
        'sampled': options.sample_size,
        'ratings': len(ratings)
    }))

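# The stats file is plain JSON, so downstream analysis can read it back in
# one line (a sketch; `s_fn` as defined in sample()):
stats = json.loads(s_fn.read_text())
# -> {'viable': ..., 'sampled': ..., 'ratings': ...}
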
def init_rng(request):
    util.init_rng(42)

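# This looks like a pytest fixture (it takes `request`; the @pytest.fixture
# decorator is assumed to appear in the real source). A hypothetical test
# that opts in by naming the fixture:
def test_with_fixed_seed(init_rng):
    # any randomized code exercised here runs under the fixed seed 42,
    # so failures reproduce across runs
    ...
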
def inspect(file):
    _log.info('loading file %s', file)
    opt = skopt.load(file)
    n = len(opt.x_iters)
    print('iterations:', n)
    # skopt minimizes, so hit-rate values are stored negated
    print('optimal HR:', -opt.fun)
    for i in range(n):
        x = opt.x_iters[i]
        nhr = opt.func_vals[i]
        if hasattr(opt, 'iter_time'):
            time = opt.iter_time[i]
        else:
            time = np.nan
        print('iter[{}]: {!r} -> {:f} ({:.1f}s)'.format(i, x, -nhr, time))


if __name__ == '__main__':
    options = docopt(__doc__)
    insp_file = options['--inspect']
    if insp_file:
        inspect(insp_file)
    else:
        seed = init_rng(rng_seed(), 'search-model',
                        options['<dataset>'], options['--out-name'])
        _log.info('using random seed %s', seed)
        # renamed from `eval` to avoid shadowing the builtin
        eval_fn = setup(options['<dataset>'], options['<algorithm>'],
                        options['--drop-ratings'])
        run_search(options['<dataset>'], options['<algorithm>'],
                   options['--out-name'], eval_fn)
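
# The __main__ block implies a docopt usage string along these lines; the
# script name and exact wording are assumptions, but every option and
# positional argument below appears in the code above:
"""
Usage:
    search.py [options] <dataset> <algorithm>
    search.py --inspect=FILE

Options:
    --inspect=FILE   inspect a saved skopt optimization result
    --out-name=NAME  base name for search output files
    --drop-ratings   drop the rating column to train in implicit mode
"""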