def export_work_ratings():
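    """
    Export per-(user, work) GoodReads ratings: the median rating, the most
    recent rating, the median interaction timestamp, and the rating count,
    written to gr-work-ratings.parquet.
    """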
    query = '''
    SELECT gr_user_rid AS user, gr_work_id AS item,
            MEDIAN(rating) AS rating,
            (array_agg(rating ORDER BY date_updated DESC))[1] AS last_rating,
            MEDIAN(EXTRACT(EPOCH FROM date_updated)) AS timestamp,
            COUNT(rating) AS nratings
     FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
     WHERE rating > 0 AND gr_work_id IS NOT NULL
     GROUP BY gr_user_rid, gr_work_id
     ORDER BY MIN(date_updated)
    '''

    with db.connect() as dbc:
        _log.info('reading book ratings')
        ratings = db.load_table(dbc,
                                query,
                                dtype={
                                    'user': 'i4',
                                    'item': 'i4',
                                    'rating': 'f4',
                                    'last_rating': 'f4',
                                    'nratings': 'i4'
                                })

    _log.info('writing ratings')
    ratings.to_parquet('gr-work-ratings.parquet',
                       index=False,
                       compression='zstd',
                       compression_level=5)
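
# Not part of the original module: a minimal sketch of reading the exported
# ratings back for a quick sanity check. Assumes pandas with a Parquet engine
# (e.g. pyarrow) is available; the file name matches the export above.
def _check_work_ratings(path='gr-work-ratings.parquet'):
    import pandas as pd
    ratings = pd.read_parquet(path)
    # every exported (user, item) pair has at least one positive rating
    assert (ratings['nratings'] >= 1).all()
    return ratings.head()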
Example #2
def cluster(txout):
    "Cluster ISBNs"
    with db.connect() as dbc, dbc:
        tracking.begin_stage(dbc, 'cluster')

        with db.engine().connect() as cxn:
            _log.info('loading graph')
            gl = GraphLoader()
            g = gl.load_graph(cxn, False)

        print('NODES', g.num_vertices(), file=txout)
        print('EDGES', g.num_edges(), file=txout)

        _log.info('finding connected components')
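        # label_components assigns a component ID to every vertex and returns a component-size histogram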
        comps, hist = label_components(g)
        _log.info('found %d components, largest has %s items', len(hist),
                  np.max(hist))
        print('COMPONENTS', len(hist), file=txout)

        _log.info('saving cluster records to database')
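        # keep only vertices from the ISBN namespace; their labels are ISBN IDs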
        is_isbn = g.vp.source.a == ns_isbn.code
        clusters = pd.DataFrame({
            'isbn_id': g.vp.label.a[is_isbn],
            'cluster': comps.a[is_isbn]
        })
        _import_clusters(dbc, clusters)

        _log.info('saving ID graph')
        g.vp['cluster'] = comps
        g.save('data/id-graph.gt')

        c_hash = _hash_frame(clusters)
        print('WRITE CLUSTERS', c_hash, file=txout)

        tracking.end_stage(dbc, 'cluster', c_hash)
def export_work_actions():
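    """
    Export per-(user, work) GoodReads shelf actions: the action count and the
    first and last interaction timestamps, written to gr-work-actions.parquet.
    """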
    query = '''
    SELECT gr_user_rid AS user, gr_work_id AS item,
            COUNT(rating) AS nactions,
            MIN(EXTRACT(EPOCH FROM date_updated)) AS first_time,
            MAX(EXTRACT(EPOCH FROM date_updated)) AS last_time
     FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
     WHERE gr_work_id IS NOT NULL
     GROUP BY gr_user_rid, gr_work_id
     ORDER BY MIN(date_updated)
    '''

    with db.connect() as dbc:
        _log.info('reading book shelf actions')
        actions = db.load_table(dbc,
                                query,
                                dtype={
                                    'user': 'i4',
                                    'item': 'i4',
                                    'nactions': 'i4'
                                })

    _log.info('writing actions')
    actions.to_parquet('gr-work-actions.parquet',
                       index=False,
                       compression='zstd',
                       compression_level=5)
def export_work_titles():
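    """Export GoodReads work titles to Parquet and compressed CSV."""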
    query = f'''
        SELECT gr_work_id AS work_id, gr_work_rid, work_title
        FROM gr.work_title
        ORDER BY gr_work_rid
    '''

    with db.connect() as dbc:
        _log.info('reading work titles')
        books = db.load_table(dbc, query)

    pq_fn = 'gr-work-titles.parquet'
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False)
    _log.info('writing CSV')
    books.to_csv('gr-work-titles.csv.gz', index=False)
def export_work_genres():
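    """Export genre scores, aggregated from books up to GoodReads works, to Parquet and compressed CSV."""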
    query = f'''
        SELECT gr_work_id AS work_id, genre, sum(score::int) AS score
        FROM gr.book_ids JOIN gr.book_genres USING (gr_book_rid)
        GROUP BY work_id, genre
        ORDER BY work_id, genre
    '''

    with db.connect() as dbc:
        _log.info('reading work genres')
        genres = db.load_table(dbc, query)

    pq_fn = 'gr-work-genres.parquet'
    _log.info('writing parquet to %s', pq_fn)
    genres.to_parquet(pq_fn, index=False, compression='brotli')
    _log.info('writing CSV')
    genres.to_csv('gr-work-genres.csv.gz', index=False)
def export_book_ids():
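    """Export GoodReads book IDs with their work IDs and cluster assignments to CSV and Parquet."""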
    query = '''
        SELECT gr_book_rid, gr_book_id, gr_work_id, cluster
        FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id)
        ORDER BY gr_book_rid
    '''

    with db.connect() as dbc:
        _log.info('reading book IDs')
        books = db.load_table(dbc, query)

    csv_fn = 'gr-book-ids.csv.gz'
    pq_fn = 'gr-book-ids.parquet'
    _log.info('writing CSV to %s', csv_fn)
    books.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False)
def export_work_authors():
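    """Export each work's authors (those recorded with an empty author role) to Parquet and compressed CSV."""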
    query = f'''
        WITH
            pairs AS (SELECT DISTINCT gr_work_id AS work_id, gr_author_id
                      FROM gr.book_ids JOIN gr.book_authors USING (gr_book_rid)
                      WHERE author_role = '' AND gr_work_id IS NOT NULL)
        SELECT work_id, gr_author_id AS author_id, author_name
        FROM pairs JOIN gr.author_info USING (gr_author_id)
        ORDER BY work_id
    '''

    with db.connect() as dbc:
        _log.info('reading work authors')
        books = db.load_table(dbc, query)

    pq_fn = 'gr-work-authors.parquet'
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False, compression='brotli')
    _log.info('writing CSV')
    books.to_csv('gr-work-authors.csv.gz', index=False)
Example #8
    def __call__(self, ids):
        with db.connect() as dbc, dbc.cursor() as cur:
            if ids:
                return [self._id_rec(cur, r) for r in ids]
            else:
                return [self._top_rec(cur)]
Example #9
if tfile:
    tfile = Path(tfile)
else:
    tfile = script_file.with_suffix('.transcript')

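# the stage name defaults to the script file's base name unless -s was given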
stage = opts.get('-s', None)
if not stage:
    stage = script_file.stem

_log.info('reading %s', script_file)
script = db.SqlScript(script_file)
_log.info('%s has %d chunks', script_file, len(script.chunks))
if opts.get('--dry-run'):
    script.describe()
else:
    with tfile.open('w') as txf, db.connect() as dbc:
        key = hashlib.md5()
        with dbc, dbc.cursor() as cur:
            tracking.begin_stage(cur, stage)
            for dep in script.deps:
                dhs = tracking.record_dep(cur, stage, dep)
                # hash the dependency hashes
                for d, h in dhs:
                    key.update(h.encode('utf-8'))
            h = tracking.hash_and_record_file(cur, script_file, stage)
            # hash the source file
            key.update(h.encode('utf-8'))
        script.execute(dbc, transcript=txf)

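        # in a second transaction, record the tables this script produced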
        with dbc, dbc.cursor() as cur:
            for ns, tbl in script.tables:
                oid, kind = tracking.record_tbl(cur, stage, ns, tbl)
Example #10
in_chk = hashlib.sha1(data).hexdigest()

barr = np.frombuffer(data, dtype='u1')
# strip non-ASCII bytes (values >= 128)
barr = barr[barr < 128]
# drop carriage returns so line endings are plain LF
barr = barr[barr != ord('\r')]
# change the field delimiter from semicolon to comma
barr[barr == ord(';')] = ord(',')

# re-assemble the cleaned text and load it into the database
_log.info('importing BX to database')
data = bytes(barr)
rd = StringIO(data.decode('utf8'))

with db.connect() as dbc:
    print('IMPORT TO bx.raw_ratings', file=tx_file)
    print('READ', src_file, in_chk, file=tx_file)
    # we're going to hash the data we insert
    dh = hashlib.md5()
    # with dbc encapsulates a transaction
    with dbc, dbc.cursor() as cur:
        tracking.begin_stage(cur, 'bx-ratings')
        tracking.record_file(cur, src_file, in_chk, 'bx-ratings')
        tracking.record_dep(cur, 'bx-ratings', 'bx-schema')
        n = 0
        for row in tqdm(csv.DictReader(rd)):
            uid = row['User-ID']
            isbn = row['ISBN']
            rating = row['Book-Rating']
            cur.execute(