def split(opts: OptionReader):
    data = dt.fname(opts.data)
    ddir = data_dir / data
    tdir = ddir / opts.subdir

    seed = init_rng(rng_seed(), 'split-ratings', data, opts.subdir)
    _log.info('using random seed %s', seed)

    _log.info('reading ratings')
    ratings = pd.read_parquet(ddir / 'ratings.parquet')
    _log.info('counting users in %d ratings', len(ratings))
    users = ratings.groupby('user')['item'].count()
    candidates = users[users >= opts.min_ratings]

    _log.info('selecting %d of %d candidate users (%d total)', opts.test_users,
              len(candidates), len(users))
    sample = candidates.sample(opts.test_users, random_state=rng(legacy=True))

    _log.info('selecting test ratings')
    u_rates = ratings[ratings['user'].isin(sample.index)]
    # hold out one randomly-selected rating per sampled test user
    test = u_rates.groupby('user').apply(lambda df: df.sample(1))
    test.reset_index('user', drop=True, inplace=True)

    _log.info('writing %d test ratings', len(test))
    test.to_parquet(tdir / 'test-ratings.parquet', compression='snappy')
Example #2
File: util.py  Project: yw4509/lkpy
def make_graph(rng_spec=None):
    "Construct a TensorFlow graph (with an optional random seed)"
    rng = util.rng(rng_spec)
    graph = tf.Graph()
    graph.seed = rng.integers(2**31 - 1)
    _log.info('using effective random seed %s (from %s)', graph.seed, rng_spec)
    return graph
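
A minimal usage sketch (hypothetical; it assumes TensorFlow's 1.x-style graph API, which the graph.seed assignment above implies):

import tensorflow as tf

# Build a reproducibly seeded graph and define ops inside it.
graph = make_graph(42)
with graph.as_default():
    x = tf.constant([1.0, 2.0, 3.0])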
Example #3
def test_als_method_match():
    lu = als.BiasedMF(20,
                      iterations=15,
                      reg=(2, 0.001),
                      method='lu',
                      rng_spec=42)
    cd = als.BiasedMF(20,
                      iterations=20,
                      reg=(2, 0.001),
                      method='cd',
                      rng_spec=42)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cd.fit(ratings)
    timer.stop()
    _log.info('fit with CD solver in %s', timer)

    assert lu.global_bias_ == approx(ratings.rating.mean())
    assert cd.global_bias_ == approx(ratings.rating.mean())

    preds = []

    rng = util.rng(42, legacy=True)
    for u in rng.choice(np.unique(ratings.user), 15, replace=False):
        items = rng.choice(np.unique(ratings.item), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cd_preds = cd.predict_for_user(u, items)
        diff = lu_preds - cd_preds
        adiff = np.abs(diff)
        _log.info(
            'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
            u, np.linalg.norm(diff, 2), np.min(adiff), np.median(adiff),
            np.max(adiff), np.quantile(adiff, 0.9))

        preds.append(
            pd.DataFrame({
                'user': u,
                'item': items,
                'lu': lu_preds,
                'cd': cd_preds,
                'adiff': adiff
            }))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CD preds:\n%s', preds.cd.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences; our check is that the 90th percentile of the
    # overall differences is under a quarter star
    assert np.quantile(preds.adiff, 0.9) <= 0.25
Example #4
def test_als_method_match():
    lu = als.ImplicitMF(20, iterations=15, method='lu', rng_spec=42)
    cg = als.ImplicitMF(20, iterations=15, method='cg', rng_spec=42)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cg.fit(ratings)
    timer.stop()
    _log.info('fit with CG solver in %s', timer)

    preds = []

    rng = util.rng(42, legacy=True)
    for u in rng.choice(ratings.user.unique(), 10, replace=False):
        items = rng.choice(ratings.item.unique(), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cg_preds = cg.predict_for_user(u, items)
        diff = lu_preds - cg_preds
        adiff = np.abs(diff)
        _log.info(
            'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
            u, np.linalg.norm(diff, 2), np.min(adiff), np.median(adiff),
            np.max(adiff), np.quantile(adiff, 0.9))

        preds.append(
            pd.DataFrame({
                'user': u,
                'item': items,
                'lu': lu_preds,
                'cg': cg_preds,
                'adiff': adiff
            }))
        _log.info('user %s tau: %s', u, stats.kendalltau(lu_preds, cg_preds))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CG preds:\n%s', preds.cg.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences; our check is that the 90th percentile of the
    # overall differences stays below half a star
    assert np.quantile(preds.adiff, 0.9) < 0.5
Example #5
def sample(options):
    data = dt.fname(options.data)

    seed = init_rng(rng_seed(), 'sample-users', data)
    _log.info('using random seed %s', seed)

    ds = dt.datasets[data]

    kr_query = f'''
        SELECT r.user_id AS user, COUNT(book_id) AS profile_size
        FROM {ds.table} r
        JOIN cluster_first_author_gender g ON g.cluster = r.book_id
        WHERE gender = 'male' OR gender = 'female'
        GROUP BY r.user_id
        HAVING COUNT(book_id) >= {options.min_ratings}
    '''

    _log.info('loading users for %s', data)
    valid_users = pd.read_sql(kr_query, db_uri())
    _log.info('found %d viable profiles, sampling %d',
              len(valid_users), options.sample_size)
    sample = valid_users.sample(options.sample_size, random_state=rng(legacy=True))

    ddir = data_dir / data

    u_fn = ddir / 'sample-users.csv'
    _log.info('writing %s', u_fn)
    sample.to_csv(u_fn, index=False)

    ratings = pd.read_parquet(ddir / 'ratings.parquet')
    ratings = pd.merge(sample[['user']], ratings)
    r_fn = ddir / 'sample-ratings.csv'
    _log.info('writing %d ratings to %s', len(ratings), r_fn)
    ratings.to_csv(r_fn, index=False)

    s_fn = ddir / 'sample-stats.json'
    _log.info('writing stats to %s', s_fn)
    s_fn.write_text(json.dumps({
        'viable': len(valid_users),
        'sampled': options.sample_size,
        'ratings': len(ratings)
    }))
Example #6
File: bpr.py  Project: yw4509/lkpy
    def fit(self, ratings, **kwargs):
        timer = util.Stopwatch()
        rng = util.rng(self.rng_spec)

        matrix, users, items = sparse_ratings(ratings[['user', 'item']])

        _log.info('[%s] setting up model', timer)
        train, model = self._build_model(len(users), len(items))

        _log.info('[%s] preparing training dataset', timer)
        train_data = BprInputs(matrix, self.batch_size, self.neg_count, rng)

        _log.info('[%s] training model', timer)
        train.fit(train_data, epochs=self.epochs)

        _log.info('[%s] model finished', timer)

        self.user_index_ = users
        self.item_index_ = items
        self.model = model

        return self
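
For context, fit returns self in the usual LensKit (scikit-learn-style) fashion, so construction and training chain naturally. A hypothetical usage sketch; the class name and constructor arguments are assumed, not taken from the excerpt:

# Hypothetical class name and parameters, for illustration only.
algo = BPR(batch_size=1024, epochs=5, rng_spec=42)
algo.fit(ratings)  # returns the fitted algorithm itself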
Example #7
def legacy_rng():
    # the keyword is `legacy`, as used throughout the other examples,
    # not `legacy_rng`
    return util.rng(42, legacy=True)
Example #8
def rng():
    return util.rng(42)
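
These two look like pytest fixtures, and the distinction matters. As a sketch of the assumed behavior of LensKit's util module: util.rng returns a new-style numpy.random.Generator by default and a legacy numpy.random.RandomState when legacy=True, and the two classes expose different method names:

from lenskit import util

gen = util.rng(42)                  # numpy.random.Generator
legacy = util.rng(42, legacy=True)  # numpy.random.RandomState

gen.integers(10)    # Generator API; Generator has no randint()
legacy.randint(10)  # RandomState API; RandomState has no integers()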