def _pop(cls, blocks): query("START TRANSACTION") for block in blocks: num = block['num'] date = block['date'] print("[FORK] popping block %d @ %s" % (num, date)) assert num == cls.head_num(), "can only pop head block" # get all affected post_ids in this block sql = "SELECT id FROM hive_posts WHERE created_at >= :date" post_ids = tuple(query_col(sql, date=date)) # remove all recent records query("DELETE FROM hive_posts_cache WHERE post_id IN :ids", ids=post_ids) query("DELETE FROM hive_feed_cache WHERE created_at >= :date", date=date) query("DELETE FROM hive_reblogs WHERE created_at >= :date", date=date) query("DELETE FROM hive_follows WHERE created_at >= :date", date=date) #* query("DELETE FROM hive_post_tags WHERE post_id IN :ids", ids=post_ids) query("DELETE FROM hive_posts WHERE id IN :ids", ids=post_ids) query("DELETE FROM hive_blocks WHERE num = :num", num=num) query("COMMIT") print("[FORK] recovery complete")
def pids_by_blog_by_index(account: str, start_index: int, limit: int = 20): """Get post_ids for an author's blog (w/ reblogs), paged by index/limit. Examples: (acct, 2) = returns blog entries 0 up to 2 (3 oldest) (acct, 0) = returns all blog entries (limit 0 means return all?) (acct, 2, 1) = returns 1 post starting at idx 2 (acct, 2, 3) = returns 3 posts: idxs (2,1,0) """ sql = """ SELECT post_id FROM hive_feed_cache WHERE account_id = :account_id ORDER BY created_at LIMIT :limit OFFSET :offset """ account_id = _get_account_id(account) offset = start_index - limit + 1 assert offset >= 0, 'start_index and limit combination is invalid' ids = query_col(sql, account_id=account_id, limit=limit, offset=offset) return list(reversed(ids))
def pids_by_blog(account: str, start_author: str = '', start_permlink: str = '', limit: int = 20): account_id = get_account_id(account) seek = '' if start_permlink: seek = """ AND created_at <= ( SELECT created_at FROM hive_feed_cache WHERE account_id = :account_id AND post_id = %d) """ % get_post_id(start_author, start_permlink) sql = """ SELECT post_id FROM hive_feed_cache WHERE account_id = :account_id %s ORDER BY created_at DESC LIMIT :limit """ % seek return query_col(sql, account_id=account_id, limit=limit)
async def get_followers(account: str, start: str, follow_type: str, limit: int): limit = _validate_limit(limit, 1000) state = _follow_type_to_int(follow_type) account_id = _get_account_id(account) seek = '' if start: seek = """ AND hf.created_at <= ( SELECT created_at FROM hive_follows WHERE following = :account_id AND follower = %d AND state = :state) """ % _get_account_id(start) sql = """ SELECT name FROM hive_follows hf JOIN hive_accounts ON hf.follower = id WHERE hf.following = :account_id AND state = :state %s ORDER BY hf.created_at DESC LIMIT :limit """ % seek res = query_col(sql, account_id=account_id, state=state, limit=int(limit)) return [ dict(follower=r, following=account, what=[follow_type]) for r in res ]
def _get_discussions(sort, start_author, start_permlink, limit, tag): limit = _validate_limit(limit, 20) col = '' where = [] if sort == 'trending': col = 'sc_trend' elif sort == 'hot': col = 'sc_hot' elif sort == 'created': col = 'post_id' where.append('depth = 0') elif sort == 'promoted': col = 'promoted' where.append("is_paidout = '0'") where.append('promoted > 0') else: raise Exception("unknown sort order {}".format(sort)) if tag: tagged_posts = "SELECT post_id FROM hive_post_tags WHERE tag = :tag" where.append("post_id IN (%s)" % tagged_posts) start_id = None if start_permlink: start_id = _get_post_id(start_author, start_permlink) sql = ("SELECT %s FROM hive_posts_cache %s ORDER BY %s DESC LIMIT 1" % (col, _where([*where, "post_id = :start_id"]), col)) where.append("%s <= (%s)" % (col, sql)) sql = ( "SELECT post_id FROM hive_posts_cache %s ORDER BY %s DESC LIMIT :limit" % (_where(where), col)) ids = query_col(sql, tag=tag, start_id=start_id, limit=limit) return _get_posts(ids)
def pids_by_blog(account: str, start_author: str = '', start_permlink: str = '', limit: int = 20): """Get a list of post_ids for an author's blog.""" account_id = _get_account_id(account) seek = '' if start_permlink: start_id = _get_post_id(start_author, start_permlink) if not start_id: return [] seek = """ AND created_at <= ( SELECT created_at FROM hive_feed_cache WHERE account_id = :account_id AND post_id = %d) """ % start_id sql = """ SELECT post_id FROM hive_feed_cache WHERE account_id = :account_id %s ORDER BY created_at DESC LIMIT :limit """ % seek return query_col(sql, account_id=account_id, limit=limit)
async def get_blog_feed(account: str, skip: int, limit: int, context: str = None): """Get a blog feed (posts and reblogs from the specified account)""" account_id = _get_account_id(account) sql = ("SELECT post_id FROM hive_feed_cache WHERE account_id = :account_id " "ORDER BY created_at DESC LIMIT :limit OFFSET :skip") post_ids = query_col(sql, account_id=account_id, skip=skip, limit=limit) return _get_posts(post_ids, context)
def pids_by_replies_to_account(start_author: str, start_permlink: str = '', limit: int = 20): """Get a list of post_ids representing replies to an author.""" seek = '' if start_permlink: sql = """ SELECT parent.author, child.created_at FROM hive_posts child JOIN hive_posts parent ON child.parent_id = parent.id WHERE child.author = :author AND child.permlink = :permlink """ account, start_date = query_row(sql, author=start_author, permlink=start_permlink) seek = "AND created_at <= '%s'" % start_date else: account = start_author sql = """ SELECT id FROM hive_posts WHERE parent_id IN (SELECT id FROM hive_posts WHERE author = :parent) %s ORDER BY created_at DESC LIMIT :limit """ % seek return query_col(sql, parent=account, limit=limit)
async def _get_top_trending_tags(): """Get top 50 trending tags among pending posts.""" sql = """ SELECT category FROM hive_posts_cache WHERE is_paidout = '0' GROUP BY category ORDER BY SUM(payout) DESC LIMIT 50 """ return query_col(sql)
async def get_replies_by_last_update(start_author: str, start_permlink: str = '', limit: int = 20): limit = _validate_limit(limit, 50) parent = start_author seek = '' if start_permlink: parent, start_date = query_row(""" SELECT p.author, c.created_at FROM hive_posts c JOIN hive_posts p ON c.parent_id = p.id WHERE c.author = :a AND c.permlink = :p """, a=start_author, p=start_permlink) seek = "AND created_at <= '%s'" % start_date sql = """ SELECT id FROM hive_posts WHERE parent_id IN (SELECT id FROM hive_posts WHERE author = :parent) %s ORDER BY created_at DESC LIMIT :limit """ % seek ids = query_col(sql, parent=parent, limit=limit) return _get_posts(ids)
async def get_followers(account: str, skip: int, limit: int): account_id = _get_account_id(account) sql = """ SELECT name FROM hive_follows hf JOIN hive_accounts ON hf.follower = id WHERE hf.following = :account_id AND state = 1 ORDER BY hf.created_at DESC LIMIT :limit OFFSET :skip """ return query_col(sql, account_id=account_id, skip=skip, limit=limit)
def _pop(cls, blocks): """Pop head blocks to navigate head to a point prior to fork. Without an undo database, there is a limit to how fully we can recover. If consistency is critical, run hive with TRAIL_BLOCKS=-1 to only index up to last irreversible. Otherwise use TRAIL_BLOCKS=2 to stay closer while avoiding the vast majority of microforks. As-is, there are a few caveats with the following strategy: - follow counts can get out of sync (hive needs to force-recount) - follow state could get out of sync (user-recoverable) For 1.5, also need to handle: - hive_communities - hive_members - hive_flags - hive_modlog """ query("START TRANSACTION") for block in blocks: num = block['num'] date = block['date'] log.warning("[FORK] popping block %d @ %s", num, date) assert num == cls.head_num(), "can only pop head block" # get all affected post_ids in this block sql = "SELECT id FROM hive_posts WHERE created_at >= :date" post_ids = tuple(query_col(sql, date=date)) # remove all recent records query("DELETE FROM hive_posts_cache WHERE post_id IN :ids", ids=post_ids) query("DELETE FROM hive_feed_cache WHERE created_at >= :date", date=date) query("DELETE FROM hive_reblogs WHERE created_at >= :date", date=date) query("DELETE FROM hive_follows WHERE created_at >= :date", date=date) #* query("DELETE FROM hive_post_tags WHERE post_id IN :ids", ids=post_ids) query("DELETE FROM hive_posts WHERE id IN :ids", ids=post_ids) query("DELETE FROM hive_payments WHERE block_num = :num", num=num) query("DELETE FROM hive_blocks WHERE num = :num", num=num) query("COMMIT") log.warning("[FORK] recovery complete")
async def get_top_trending_tags_summary(): """Get top 50 trending tags among pending posts.""" # Same results, more overhead: #return [tag['name'] for tag in await get_trending_tags('', 50)] sql = """ SELECT category FROM hive_posts_cache WHERE is_paidout = '0' GROUP BY category ORDER BY SUM(payout) DESC LIMIT 50 """ return query_col(sql)
async def get_related_posts(account: str, permlink: str): sql = """ SELECT p2.id FROM hive_posts p1 JOIN hive_posts p2 ON p1.category = p2.category JOIN hive_posts_cache pc ON p2.id = pc.post_id WHERE p1.author = :a AND p1.permlink = :p AND sc_trend > :t AND p1.id != p2.id ORDER BY sc_trend DESC LIMIT 5 """ thresh = time.time() / 480000 post_ids = query_col(sql, a=account, p=permlink, t=thresh) return _get_posts(post_ids)
def _load_posts_recursive(post_ids): posts = _get_posts(post_ids) out = {} for post, post_id in zip(posts, post_ids): out[post['author'] + '/' + post['permlink']] = post child_ids = query_col("SELECT id FROM hive_posts WHERE parent_id = %d" % post_id) if child_ids: children = _load_posts_recursive(child_ids) post['replies'] = list(children.keys()) out = {**out, **children} return out
def get_following(account: str, start: str, state: int, limit: int): """Get a list of accounts followed by a given account.""" account_id = _get_account_id(account) seek = "AND name >= :start" if start else '' sql = """ SELECT name FROM hive_follows hf LEFT JOIN hive_accounts ON hf.following = id WHERE hf.follower = :account_id AND state = :state %s ORDER BY name ASC LIMIT :limit """ % seek return query_col(sql, account_id=account_id, start=start, state=state, limit=limit)
def _get_posts(ids, context=None): sql = """ SELECT post_id, author, permlink, title, preview, img_url, payout, promoted, created_at, payout_at, is_nsfw, rshares, votes, json FROM hive_posts_cache WHERE post_id IN :ids """ reblogged_ids = [] if context: reblogged_ids = query_col( "SELECT post_id FROM hive_reblogs " "WHERE account = :a AND post_id IN :ids", a=context, ids=tuple(ids)) # key by id so we can return sorted by input order posts_by_id = {} for row in query_all(sql, ids=tuple(ids)): obj = dict(row) if context: voters = [csa.split(",")[0] for csa in obj['votes'].split("\n")] obj['user_state'] = { 'reblogged': row['post_id'] in reblogged_ids, 'voted': context in voters } # TODO: Object of type 'Decimal' is not JSON serializable obj['payout'] = float(obj['payout']) obj['promoted'] = float(obj['promoted']) # TODO: Object of type 'datetime' is not JSON serializable obj['created_at'] = str(obj['created_at']) obj['payout_at'] = str(obj['payout_at']) obj.pop('votes') # temp obj.pop('json') # temp posts_by_id[row['post_id']] = obj # in rare cases of cache inconsistency, recover and warn missed = set(ids) - posts_by_id.keys() if missed: print("WARNING: _get_posts do not exist in cache: {}".format(missed)) for _id in missed: ids.remove(_id) return [posts_by_id[_id] for _id in ids]
async def get_discussions_by_comments(start_author: str, start_permlink: str = '', limit: int = 20): limit = _validate_limit(limit, 20) seek = '' if start_permlink: seek = """ AND created_at <= (SELECT created_at FROM hive_posts WHERE id = %d) """ % _get_post_id(start_author, start_permlink) sql = """ SELECT id FROM hive_posts WHERE author = :account %s AND depth > 0 ORDER BY created_at DESC LIMIT :limit """ % seek ids = query_col(sql, account=start_author, limit=limit) return _get_posts(ids)
def pids_by_account_comments(account: str, start_permlink: str = '', limit: int = 20): seek = '' if start_permlink: seek = """ AND created_at <= (SELECT created_at FROM hive_posts WHERE id = %d) """ % get_post_id(account, start_permlink) sql = """ SELECT id FROM hive_posts WHERE author = :account %s AND depth > 0 ORDER BY created_at DESC LIMIT :limit """ % seek return query_col(sql, account=account, limit=limit)
def pids_by_query(sort, start_author, start_permlink, limit, tag): """Get a list of post_ids for a given posts query. `sort` can be trending, hot, new, promoted. """ assert sort in ['trending', 'hot', 'created', 'promoted'] col = '' where = [] if sort == 'trending': col = 'sc_trend' elif sort == 'hot': col = 'sc_hot' elif sort == 'created': col = 'post_id' where.append('depth = 0') elif sort == 'promoted': col = 'promoted' where.append("is_paidout = '0'") where.append('promoted > 0') if tag: tagged_pids = "SELECT post_id FROM hive_post_tags WHERE tag = :tag" where.append("post_id IN (%s)" % tagged_pids) def _where(conditions): return 'WHERE ' + ' AND '.join(conditions) if conditions else '' start_id = None if start_permlink: start_id = _get_post_id(start_author, start_permlink) if not start_id: return [] sql = ("SELECT %s FROM hive_posts_cache %s ORDER BY %s DESC LIMIT 1" % (col, _where([*where, "post_id = :start_id"]), col)) where.append("%s <= (%s)" % (col, sql)) sql = ( "SELECT post_id FROM hive_posts_cache %s ORDER BY %s DESC LIMIT :limit" % (_where(where), col)) return query_col(sql, tag=tag, start_id=start_id, limit=limit)
def pids_by_blog_without_reblog(account: str, start_permlink: str = '', limit: int = 20): """Get a list of post_ids for an author's blog without reblogs.""" seek = '' if start_permlink: start_id = _get_post_id(account, start_permlink) if not start_id: return [] seek = "AND id <= %d" % start_id sql = """ SELECT id FROM hive_posts WHERE author = :account %s AND is_deleted = '0' AND depth = 0 ORDER BY id DESC LIMIT :limit """ % seek return query_col(sql, account=account, limit=limit)
def cache_all_accounts(): accounts = query_col("SELECT name FROM hive_accounts") processed = 0 total = len(accounts) for i in range(0, total, 1000): batch = accounts[i:i + 1000] lap_0 = time.time() sqls = generate_cached_accounts_sql(batch) lap_1 = time.time() batch_queries(sqls) lap_2 = time.time() processed += len(batch) rem = total - processed rate = len(batch) / (lap_2 - lap_0) pct_db = int(100 * (lap_2 - lap_1) / (lap_2 - lap_0)) print(" -- {} of {} ({}/s, {}% db) -- {}m remaining".format( processed, total, round(rate, 1), pct_db, round(rem / rate / 60, 2)))
async def get_discussions_by_sort_and_tag(sort, tag, skip, limit, context=None): if skip > 5000: raise Exception("cannot skip {} results".format(skip)) if limit > 100: raise Exception("cannot limit {} results".format(limit)) order = '' where = [] if sort == 'trending': order = 'sc_trend DESC' elif sort == 'hot': order = 'sc_hot DESC' elif sort == 'new': order = 'post_id DESC' where.append('depth = 0') elif sort == 'promoted': order = 'promoted DESC' where.append('is_paidout = 0') where.append('promoted > 0') else: raise Exception("unknown sort order {}".format(sort)) if tag: where.append( 'post_id IN (SELECT post_id FROM hive_post_tags WHERE tag = :tag)') if where: where = 'WHERE ' + ' AND '.join(where) else: where = '' sql = "SELECT post_id FROM hive_posts_cache %s ORDER BY %s LIMIT :limit OFFSET :skip" % ( where, order) ids = query_col(sql, tag=tag, limit=limit, skip=skip) return _get_posts(ids, context)
def pids_by_replies_to_account(start_author: str, start_permlink: str = '', limit: int = 20): """Get a list of post_ids representing replies to an author. To get the first page of results, specify `start_author` as the account being replied to. For successive pages, provide the last loaded reply's author/permlink. """ seek = '' if start_permlink: sql = """ SELECT parent.author, child.created_at FROM hive_posts child JOIN hive_posts parent ON child.parent_id = parent.id WHERE child.author = :author AND child.permlink = :permlink """ row = query_row(sql, author=start_author, permlink=start_permlink) if not row: return [] parent_account = row[0] seek = "AND created_at <= '%s'" % row[1] else: parent_account = start_author sql = """ SELECT id FROM hive_posts WHERE parent_id IN (SELECT id FROM hive_posts WHERE author = :parent) %s AND is_deleted = '0' ORDER BY created_at DESC LIMIT :limit """ % seek return query_col(sql, parent=parent_account, limit=limit)
async def get_discussions_by_blog(tag: str, start_author: str = '', start_permlink: str = '', limit: int = 20): limit = _validate_limit(limit, 20) account_id = _get_account_id(tag) seek = '' if start_permlink: seek = """ AND created_at <= ( SELECT created_at FROM hive_feed_cache WHERE account_id = :account_id AND post_id = %d) """ % _get_post_id(start_author, start_permlink) sql = """ SELECT post_id FROM hive_feed_cache WHERE account_id = :account_id %s ORDER BY created_at DESC LIMIT :limit """ % seek ids = query_col(sql, account_id=account_id, limit=limit) return _get_posts(ids)
def pids_by_account_comments(account: str, start_permlink: str = '', limit: int = 20): """Get a list of post_ids representing comments by an author.""" seek = '' if start_permlink: start_id = _get_post_id(account, start_permlink) if not start_id: return [] seek = """ AND created_at <= (SELECT created_at FROM hive_posts WHERE id = %d) """ % start_id sql = """ SELECT id FROM hive_posts WHERE author = :account %s AND depth > 0 AND is_deleted = '0' ORDER BY created_at DESC LIMIT :limit """ % seek return query_col(sql, account=account, limit=limit)
def get_following(account: str, start: str, state: int, limit: int): account_id = get_account_id(account) seek = '' if start: seek = """ AND hf.created_at <= ( SELECT created_at FROM hive_follows WHERE follower = :account_id AND following = %d AND state = :state) """ % get_account_id(start) sql = """ SELECT name FROM hive_follows hf JOIN hive_accounts ON hf.following = id WHERE hf.follower = :account_id AND state = :state %s ORDER BY hf.created_at DESC LIMIT :limit """ % seek return query_col(sql, account_id=account_id, state=state, limit=limit)
def pids_by_query(sort, tag, start_author, start_permlink, limit): col = '' where = [] if sort == 'trending': col = 'sc_trend' elif sort == 'hot': col = 'sc_hot' elif sort == 'created': col = 'post_id' where.append('depth = 0') elif sort == 'promoted': col = 'promoted' where.append("is_paidout = '0'") where.append('promoted > 0') else: raise Exception("unknown sort order {}".format(sort)) if tag: tagged_pids = "SELECT post_id FROM hive_post_tags WHERE tag = :tag" where.append("post_id IN (%s)" % tagged_pids) def _where(conditions): return 'WHERE ' + ' AND '.join(conditions) if conditions else '' start_id = None if start_permlink: start_id = get_post_id(start_author, start_permlink) sql = ("SELECT %s FROM hive_posts_cache %s ORDER BY %s DESC LIMIT 1" % (col, _where([*where, "post_id = :start_id"]), col)) where.append("%s <= (%s)" % (col, sql)) sql = ( "SELECT post_id FROM hive_posts_cache %s ORDER BY %s DESC LIMIT :limit" % (_where(where), col)) return query_col(sql, tag=tag, start_id=start_id, limit=limit)
def dirty_oldest(cls, limit=50000): print("[HIVE] flagging %d oldest accounts for update" % limit) sql = "SELECT name FROM hive_accounts ORDER BY cached_at LIMIT :limit" return cls.dirty(query_col(sql, limit=limit))
async def get_content_replies(parent: str, parent_permlink: str): post_id = _get_post_id(parent, parent_permlink) post_ids = query_col("SELECT id FROM hive_posts WHERE parent_id = %d" % post_id) return _get_posts(post_ids)