Example #1
0
def measure_algo(options: OptionReader):
    """Evaluate one algorithm on the held-out data, logging to its eval log.

    Loads (or trains) the algorithm, then measures either raw or
    gender-reranked recommendation quality depending on ``options.rerank``.
    """
    algo_name = options.algo_fn
    log_path = data_dir / options.data / 'eval' / f'{algo_name}.log'
    tune_path = data_dir / options.data / 'tuning' / f'{algo_name}.json'

    seed = init_rng(rng_seed(), 'eval-algo', options.data, options.algo)
    _log.info('using random seed %s', seed)

    with LogFile(log_path):
        # choose temporal vs. random split based on the CLI flag
        if options.time:
            train, test = _load_time_ratings(options)
        else:
            train, test = _load_ratings(options)

        if options.pretrained:
            model_path = data_dir / options.data / 'eval' / f'{algo_name}.model'
            _log.info('loading model from %s', model_path)
            with model_path.open('rb') as f:
                algo = pickle.load(f)
        else:
            # no rating column means implicit-feedback mode
            implicit = 'rating' not in train.columns
            tuning = None if options.default else tune_path
            algo = get_algorithm(options.data, options.algo, tuning, implicit)
            algo = _train_algo(options.data, algo, train)

        if options.rerank:
            gender = getBookGender()
            _measure_rerank(algo, train, test, options.n, algo_name, gender)
        else:
            _measure_raw(algo, test, options.n, algo_name)
def split(opts: OptionReader):
    """Create a test split by sampling one rating from each sampled user.

    Reads ``ratings.parquet`` from the dataset directory, samples
    ``opts.test_users`` users that have at least ``opts.min_ratings`` ratings,
    holds out one rating per sampled user, and writes the result to
    ``<opts.subdir>/test-ratings.parquet``.
    """
    data = dt.fname(opts.data)
    ddir = data_dir / data
    tdir = ddir / opts.subdir
    # Fix: the output directory was never created, so to_parquet below would
    # fail on a fresh checkout; mirror train()'s mkdir behavior.
    tdir.mkdir(parents=True, exist_ok=True)

    seed = init_rng(rng_seed(), 'split-ratings', data, opts.subdir)
    _log.info('using random seed %s', seed)

    _log.info('reading ratings')
    ratings = pd.read_parquet(ddir / 'ratings.parquet')
    _log.info('counting users in %d ratings', len(ratings))
    users = ratings.groupby('user')['item'].count()
    candidates = users[users >= opts.min_ratings]

    _log.info('selecting %d of %d candidate users (%d total)', opts.test_users,
              len(candidates), len(users))
    sample = candidates.sample(opts.test_users, random_state=rng(legacy=True))

    _log.info('selecting test ratings')
    u_rates = ratings[ratings['user'].isin(sample.index)]
    # one held-out rating per user; drop the extra 'user' index level that
    # groupby().apply introduces
    test = u_rates.groupby('user').apply(lambda df: df.sample(1))
    test.reset_index('user', drop=True, inplace=True)

    _log.info('writing %d test ratings', len(test))
    test.to_parquet(tdir / 'test-ratings.parquet', compression='snappy')
Example #3
0
def train(options: TrainOptions):
    """Train a recommender model and pickle it into the dataset's model dir.

    With ``options.train_data == 'all'`` the model trains on all ratings and
    is saved under ``models/``; with ``'eval'`` the pre-sampled test rows are
    held out and the model is saved under ``eval/``.

    Raises:
        ValueError: if ``options.train_data`` is neither 'all' nor 'eval'.
    """
    seed = init_rng(rng_seed(), 'train-model', options.data, options.algo)
    _log.info('using random seed %s', seed)

    ddir = data_dir / options.data
    rating_file = ddir / 'ratings.parquet'
    if options.train_data == 'all':
        mdir = ddir / 'models'
    elif options.train_data == 'eval':
        mdir = ddir / 'eval'
    else:
        raise ValueError(f'unknown training data {options.train_data}')

    mdir.mkdir(parents=True, exist_ok=True)
    mfn = mdir / f'{options.algo_fn}.model'
    if options.default:
        # Fix: Logger.warn is a deprecated alias of Logger.warning.
        _log.warning('Using default settings')
        opt_fn = None
    else:
        opt_fn = ddir / 'tuning' / f'{options.algo_fn}.json'
        _log.info('Using algorithm optimization results %s', opt_fn)

    with LogFile(mdir / f'{options.algo_fn}.log'):
        _log.info('reading ratings from %s', rating_file)
        ratings = pd.read_parquet(rating_file)
        if options.drop_ratings and 'rating' in ratings.columns:
            _log.info('dropping rating column')
            ratings = ratings.drop(columns=['rating'])
        if options.train_data == 'eval':
            # hold out the previously-sampled test rows by their index
            _log.info('reading test data')
            test = pd.read_parquet(ddir / 'eval' / 'test-ratings.parquet')
            train_mask = pd.Series(True, index=ratings.index)
            train_mask[test.index] = False
            ratings = ratings[train_mask].copy().reset_index(drop=True)

        # absence of a rating column selects implicit-feedback mode
        implicit = 'rating' not in ratings.columns

        _log.info('loading algorithm %s for %s in %s mode', options.data,
                  options.algo, 'implicit' if implicit else 'explicit')
        algo = get_algorithm(options.data, options.algo, opt_fn, implicit)
        algo = Recommender.adapt(algo)

        _log.info('training %s on %s ratings', algo, len(ratings))
        timer = Stopwatch()
        model = algo.fit(ratings)
        timer.stop()
        _log.info('trained in %s', timer)
        _log.info('saving model to %s', mfn)
        # pathlib open for consistency with the rest of the file
        with mfn.open('wb') as f:
            p = dt.CompactingPickler(f, protocol=4)
            p.dump(model)
Example #4
0
def sample(options):
    """Sample user profiles with gendered-author books; write ratings and stats.

    Queries the ratings database for users whose profiles contain at least
    ``options.min_ratings`` books with a known (male/female) first-author
    gender, samples ``options.sample_size`` of them, and writes the sampled
    users, their ratings, and summary stats to the dataset directory.
    """
    data = dt.fname(options.data)

    seed = init_rng(rng_seed(), 'sample-users', data)
    _log.info('using random seed %s', seed)

    ds = dt.datasets[data]

    # NOTE(review): values are interpolated directly into the SQL. Acceptable
    # for trusted config (ds.table, integer min_ratings) but not safe for
    # untrusted input — parameterize if these ever come from users.
    kr_query = f'''
        SELECT r.user_id AS user, COUNT(book_id) AS profile_size
        FROM {ds.table} r
        JOIN cluster_first_author_gender g ON g.cluster = r.book_id
        WHERE gender = 'male' OR gender = 'female'
        GROUP BY r.user_id
        HAVING COUNT(book_id) >= {options.min_ratings}
    '''

    _log.info('loading users for %s', data)
    valid_users = pd.read_sql(kr_query, db_uri())
    _log.info('found %d viable profiles, sampling %d',
              len(valid_users), options.sample_size)
    sample = valid_users.sample(options.sample_size, random_state=rng(legacy=True))

    ddir = data_dir / data

    u_fn = ddir / 'sample-users.csv'
    _log.info('writing %s', u_fn)
    # Fix: write to the path we just logged; the original rebuilt the path
    # inline, which could silently diverge from the logged location.
    sample.to_csv(u_fn, index=False)

    # inner merge on 'user' keeps only the sampled users' ratings
    ratings = pd.read_parquet(ddir / 'ratings.parquet')
    ratings = pd.merge(sample[['user']], ratings)
    r_fn = ddir / 'sample-ratings.csv'
    _log.info('writing %d ratings to %s', len(ratings), r_fn)
    ratings.to_csv(r_fn, index=False)

    s_fn = ddir / 'sample-stats.json'
    _log.info('writing stats to %s', s_fn)
    s_fn.write_text(json.dumps({
        'viable': len(valid_users),
        'sampled': options.sample_size,
        'ratings': len(ratings)
    }))
Example #5
0
def init_rng(request):
    """Seed the project RNG with a fixed value for reproducible runs.

    NOTE(review): the unused ``request`` argument suggests this is a pytest
    fixture — confirm against the test configuration.
    """
    util.init_rng(42)
Example #6
0
def inspect(file):
    """Print a summary of a saved skopt optimization result.

    Shows the iteration count, the best (negated) objective value, and one
    line per iteration with its parameters, HR, and elapsed time (NaN when
    the result has no per-iteration timing).
    """
    _log.info('loading file %s', file)
    result = skopt.load(file)
    n_iters = len(result.x_iters)
    print('iterations:', n_iters)
    # the objective is negated hit rate, so flip the sign for display
    print('optimal HR:', -result.fun)
    times = getattr(result, 'iter_time', None)
    for i in range(n_iters):
        params = result.x_iters[i]
        nhr = result.func_vals[i]
        elapsed = np.nan if times is None else times[i]
        print('iter[{}]: {!r} -> {:f} ({:.1f}s)'.format(i, params, -nhr, elapsed))


if __name__ == '__main__':
    # CLI entry point: either inspect a saved optimization result, or run a
    # hyperparameter search for the given dataset/algorithm.
    options = docopt(__doc__)
    insp_file = options['--inspect']
    if insp_file:
        inspect(insp_file)
    else:
        seed = init_rng(rng_seed(), 'search-model', options['<dataset>'],
                        options['--out-name'])
        _log.info('using random seed %s', seed)
        # Fix: the original bound this to 'eval', shadowing the builtin.
        eval_fn = setup(options['<dataset>'], options['<algorithm>'],
                        options['--drop-ratings'])
        run_search(options['<dataset>'], options['<algorithm>'],
                   options['--out-name'], eval_fn)