def write_permacache_from_dir(dirname):
    # we want the whole list so that we can display accurate progress
    # information. If we're operating on more than tens of millions of
    # files, we should either bail out or tweak this to not need the
    # whole list at once
    allfiles = []
    for root, dirs, files in os.walk(dirname):
        for f in files:
            allfiles.append(os.path.join(root, f))

    for fname in progress(allfiles, persec=True):
        try:
            write_permacache_from_file(fname)
            os.unlink(fname)
        except:
            mr_tools.status("failed on %r" % fname)
            raise

    mr_tools.status("Removing empty directories")
    for root, dirs, files in os.walk(dirname, topdown=False):
        for d in dirs:
            dname = os.path.join(root, d)
            try:
                os.rmdir(dname)
            except OSError as e:
                if e.errno == errno.ENOTEMPTY:
                    mr_tools.status("%s not empty" % (dname,))
                else:
                    raise

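# A minimal sketch (not part of the original script) of the streaming
# alternative hinted at in the comment above: walk the tree lazily instead of
# materializing allfiles, trading an exact progress total for constant memory.
# It reuses the same progress() and write_permacache_from_file() helpers; the
# optional estimate argument is assumed to be accepted by progress() as in the
# other snippets in this listing.
def write_permacache_from_dir_streaming(dirname, estimate=None):
    def walk_files():
        for root, dirs, files in os.walk(dirname):
            for f in files:
                yield os.path.join(root, f)

    for fname in progress(walk_files(), persec=True, estimate=estimate):
        write_permacache_from_file(fname)
        os.unlink(fname)
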
def rebuild_index(start_at=None, sleeptime=1, cls=Link,
                  estimate=50000000, chunk_size=1000):
    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(start_at)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             _REBUILD_INDEX_CACHE_KEY)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        for x in range(5):
            try:
                inject(chunk)
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(_REBUILD_INDEX_CACHE_KEY, last_update._fullname)
        time.sleep(sleeptime)

def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)

def port_cassavotes():
    from r2.models import Vote, Account, Link, Comment
    from r2.models.vote import (CassandraVote, CassandraLinkVote,
                                CassandraCommentVote)
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, to36, progress

    ts = [(Vote.rel(Account, Link), CassandraLinkVote),
          (Vote.rel(Account, Comment), CassandraCommentVote)]

    dataattrs = set(['valid_user', 'valid_thing', 'ip', 'organic'])

    for prel, crel in ts:
        vq = prel._query(sort=desc('_date'),
                         data=True,
                         eager_load=False)
        vq = fetch_things2(vq)
        vq = progress(vq, persec=True)
        for v in vq:
            t1 = to36(v._thing1_id)
            t2 = to36(v._thing2_id)
            cv = crel(thing1_id=t1, thing2_id=t2, date=v._date, name=v._name)
            for dkey, dval in v._t.iteritems():
                if dkey in dataattrs:
                    setattr(cv, dkey, dval)
            cv._commit(write_consistency_level=CL.ONE)

def get_participated():
    users = {}
    q = Account._query(Account.c.f2p != "", sort=asc("_date"), data=True)
    for user in progress(fetch_things2(q)):
        users[user._fullname] = user.f2p
    return users

def write_permacache_from_dir(dirname):
    for fname in progress(os.listdir(dirname), persec=True):
        try:
            fpath = os.path.join(dirname, fname)
            write_permacache_from_file(fpath)
            os.unlink(fpath)
        except:
            mr_tools.status('failed on %r' % fname)
            raise

def port_cassahides():
    from r2.models import SaveHide, CassandraHide
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.db.operators import desc
    from r2.lib.utils import fetch_things2, timeago, progress

    q = SaveHide._query(SaveHide.c._date > timeago("1 week"),
                        SaveHide.c._name == "hide",
                        sort=desc("_date"))
    q = fetch_things2(q)
    q = progress(q, estimate=1953374)

    for sh in q:
        CassandraHide._hide(sh._thing1, sh._thing2,
                            write_consistency_level=CL.ONE)

def rebuild_index(after_id=None):
    cls = Link

    # don't pull spam/deleted
    q = cls._query(sort=desc('_date'), data=True)

    if after_id:
        q._after(cls._byID(after_id))

    q = fetch_things2(q)
    q = progress(q, verbosity=1000, estimate=10000000, persec=True)

    for chunk in in_chunks(q):
        inject(chunk)

def port_cassasaves(after_id=None, estimate=12489897):
    from r2.models import SaveHide, CassandraSave
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, to36, progress

    q = SaveHide._query(SaveHide.c._name == "save",
                        sort=desc("_date"),
                        data=False,
                        eager_load=False)

    if after_id is not None:
        q._after(SaveHide._byID(after_id))

    for sh in progress(fetch_things2(q), estimate=estimate):
        csh = CassandraSave(thing1_id=to36(sh._thing1_id),
                            thing2_id=to36(sh._thing2_id),
                            date=sh._date)
        csh._commit(write_consistency_level=CL.ONE)

def give_trophies(users):
    for fullnames in in_chunks(progress(users, verbosity=50), size=50):
        users = Account._by_fullname(fullnames, return_dict=False)

        for user in users:
            team = get_user_team(user)

            trophy = Award.give_if_needed(
                codename="f2p_orangered" if team == "red" else "f2p_periwinkle",
                user=user,
            )
            if trophy:
                trophy._commit()

        sleep(.5)

def port_deleted_links(after_id=None):
    from r2.models import Link
    from r2.lib.db.operators import desc
    from r2.models.query_cache import CachedQueryMutator
    from r2.lib.db.queries import get_deleted_links
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc("_date"), data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])

def backfill_deleted_accounts(resume_id=None):
    del_accts = Account._query(Account.c._deleted == True, sort=desc('_date'))
    if resume_id:
        del_accts._filter(Account.c._id < resume_id)

    for i, account in enumerate(progress(fetch_things2(del_accts))):
        # Don't kill the rabbit! Wait for the relevant queues to calm down.
        if i % 1000 == 0:
            del_len = get_queue_length('del_account_q')
            cs_len = get_queue_length('cloudsearch_changes')
            while (del_len > 1000 or cs_len > 10000):
                sys.stderr.write(("CS: %d, DEL: %d" % (cs_len, del_len)) + "\n")
                sys.stderr.flush()
                time.sleep(1)
                del_len = get_queue_length('del_account_q')
                cs_len = get_queue_length('cloudsearch_changes')

        amqp.add_item('account_deleted', account._fullname)

def rebuild_index(after_id=None, estimate=10000000):
    cls = Link

    # don't pull spam/deleted
    q = cls._query(sort=desc('_date'), data=True)

    if after_id:
        q._after(cls._byID(after_id))

    q = fetch_things2(q)

    def key(link):
        # we're going back in time, so this will give us a good idea
        # of how far we've gone
        return "%s/%s" % (link._id, link._date)

    q = progress(q, verbosity=1000, estimate=estimate, persec=True, key=key)

    for chunk in in_chunks(q):
        inject(chunk)

def _populate(after_id=None, estimate=54301242):
    from r2.models import desc
    from r2.lib.db import tdb_cassandra
    from r2.lib import utils

    # a larger chunk_size has a chance to decrease the number of
    # Cassandra writes, but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc("_date"))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, "link_id"), chunk)
        update_comment_votes(chunk)

def rebuild_link_index(
    start_at=None,
    sleeptime=1,
    cls=Link,
    uploader=LinkUploader,
    doc_api="CLOUDSEARCH_DOC_API",
    estimate=50000000,
    chunk_size=1000,
):
    cache_key = _REBUILD_INDEX_CACHE_KEY % uploader.__name__.lower()
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(cache_key)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             cache_key)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc("_date"), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(cache_key, last_update._fullname)
        time.sleep(sleeptime)

def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrlAndSubreddit
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q
         if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubreddit.add_link(l)

def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrl
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc("_date"), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q
         if getattr(l, "url", "self") != "self"
         and not getattr(l, "is_self", False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        with LinksByUrl._cf.batch(write_consistency_level=CL.ONE) as b:
            for l in chunk:
                k = LinksByUrl._key_from_url(l.url)
                if k:
                    b.insert(k, {l._id36: l._id36})

def _populate(after_id=None, estimate=54301242):
    from r2.models import desc
    from r2.lib.db import tdb_cassandra
    from r2.lib import utils

    # a larger chunk_size has a chance to decrease the number of
    # Cassandra writes, but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        add_comments(chunk)

def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    cache_key = _REBUILD_INDEX_CACHE_KEY % uploader.__name__.lower()
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(cache_key)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             cache_key)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(cache_key, last_update._fullname)
        time.sleep(sleeptime)

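# _progress_key is referenced by the rebuild_index / rebuild_link_index
# variants above but is not defined anywhere in this listing. A minimal
# sketch, assuming it mirrors the inline key() helper shown in the
# rebuild_index variant that defines one (id plus date, so the progress
# output shows how far back in time the walk has gone):
def _progress_key(item):
    return "%s/%s" % (item._id, item._date)
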
def port_cassavotes():
    from r2.models import Vote, Account, Link, Comment
    from r2.models.vote import (CassandraVote, CassandraLinkVote,
                                CassandraCommentVote)
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, to36, progress

    ts = [(Vote.rel(Account, Link), CassandraLinkVote),
          (Vote.rel(Account, Comment), CassandraCommentVote)]

    dataattrs = set(['valid_user', 'valid_thing', 'ip', 'organic'])

    for prel, crel in ts:
        vq = prel._query(sort=desc('_date'),
                         data=True,
                         eager_load=False)
        vq = fetch_things2(vq)
        vq = progress(vq, persec=True)
        for v in vq:
            t1 = to36(v._thing1_id)
            t2 = to36(v._thing2_id)
            cv = crel(thing1_id=t1, thing2_id=t2, date=v._date, name=v._name)
            for dkey, dval in v._t.iteritems():
                if dkey in dataattrs:
                    setattr(cv, dkey, dval)
            cv._commit(write_consistency_level=CL.ONE)

def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrl
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q
         if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        with LinksByUrl._cf.batch(write_consistency_level=CL.ONE) as b:
            for l in chunk:
                k = LinksByUrl._key_from_url(l.url)
                if k:
                    b.insert(k, {l._id36: l._id36})

    # don't show user their own unread stuff
    if msg.author_id == account._id:
        return False

    return True


resume_id = long(sys.argv[1]) if len(sys.argv) > 1 else None

msg_accounts = Account._query(sort=desc("_date"), data=True)
if resume_id:
    msg_accounts._filter(Account.c._id < resume_id)

for account in progress(fetch_things2(msg_accounts), estimate=resume_id):
    current_inbox_count = account.inbox_count
    unread_messages = list(queries.get_unread_inbox(account))

    if account._id % 100000 == 0:
        g.reset_caches()

    if not len(unread_messages):
        if current_inbox_count:
            account._incr('inbox_count', -current_inbox_count)
    else:
        msgs = Message._by_fullname(
            unread_messages,
            data=True,
            return_dict=False,
            ignore_missing=True,

#
# The Original Developer is the Initial Developer.  The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
# Inc. All Rights Reserved.
###############################################################################
"""Ensure modmsgtime is properly set on all accounts.

See the comment in Account.is_moderator_somewhere for possible values of this
attribute now.

"""

from r2.lib.db.operators import desc
from r2.lib.utils import fetch_things2, progress
from r2.models import Account, Subreddit


all_accounts = Account._query(sort=desc("_date"))
for account in progress(fetch_things2(all_accounts)):
    is_moderator_somewhere = bool(Subreddit.reverse_moderator_ids(account))
    if is_moderator_somewhere:
        if not account.modmsgtime:
            account.modmsgtime = False
        else:
            # the account already has a date for modmsgtime meaning unread mail
            pass
    else:
        account.modmsgtime = None
    account._commit()

        return False

    # don't show user their own unread stuff
    if msg.author_id == account._id:
        return False

    return True


resume_id = long(sys.argv[1]) if len(sys.argv) > 1 else None

msg_accounts = Account._query(sort=desc("_date"), data=True)
if resume_id:
    msg_accounts._filter(Account.c._id < resume_id)

for account in progress(fetch_things2(msg_accounts), estimate=resume_id):
    current_inbox_count = account.inbox_count
    unread_messages = list(queries.get_unread_inbox(account))

    if account._id % 100000 == 0:
        g.reset_caches()

    if not len(unread_messages):
        if current_inbox_count:
            account._incr('inbox_count', -current_inbox_count)
    else:
        msgs = Message._by_fullname(
            unread_messages,
            data=True,
            return_dict=False,
            ignore_missing=True,

def shorten_byurl_keys():
    """We changed by_url keys from a format like
           byurl_google.com...
    to:
           byurl(1d5920f4b44b27a802bd77c4f0536f5a, google.com...)
    so that they would fit in memcache's 251-char limit
    """

    from datetime import datetime
    from hashlib import md5
    from r2.models import Link
    from r2.lib.filters import _force_utf8
    from pylons import g
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.db.operators import desc
    from r2.lib.utils import base_url, progress

    # from link.py
    def old_by_url_key(url):
        prefix = 'byurl_'
        s = _force_utf8(base_url(url.lower()))
        return '%s%s' % (prefix, s)

    def new_by_url_key(url):
        maxlen = 250
        template = 'byurl(%s,%s)'
        keyurl = _force_utf8(base_url(url.lower()))
        hexdigest = md5(keyurl).hexdigest()
        usable_len = maxlen - len(template) - len(hexdigest)
        return template % (hexdigest, keyurl[:usable_len])

    verbosity = 1000

    l_q = Link._query(
        Link.c._spam == (True, False),
        data=True,
        sort=desc('_date'))
    for links in in_chunks(
            progress(
                fetch_things2(l_q, verbosity),
                key=lambda link: link._date,
                verbosity=verbosity,
                estimate=int(9.9e6),
                persec=True,
            ),
            verbosity):
        # only links with actual URLs
        links = filter(lambda link: (not getattr(link, 'is_self', False)
                                     and getattr(link, 'url', '')),
                       links)

        # old key -> new key
        translate = dict((old_by_url_key(link.url),
                          new_by_url_key(link.url))
                         for link in links)

        old = g.permacache.get_multi(translate.keys())
        new = dict((translate[old_key], value)
                   for (old_key, value) in old.iteritems())
        g.permacache.set_multi(new)

# The Original Developer is the Initial Developer.  The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2013 reddit
# Inc. All Rights Reserved.
###############################################################################
"""Ensure modmsgtime is properly set on all accounts.

See the comment in Account.is_moderator_somewhere for possible values of this
attribute now.

"""

from r2.lib.db.operators import desc
from r2.lib.utils import fetch_things2, progress
from r2.models import Account, Subreddit


all_accounts = Account._query(sort=desc("_date"))
for account in progress(fetch_things2(all_accounts)):
    is_moderator_somewhere = bool(Subreddit.reverse_moderator_ids(account))
    if is_moderator_somewhere:
        if not account.modmsgtime:
            account.modmsgtime = False
        else:
            # the account already has a date for modmsgtime meaning unread mail
            pass
    else:
        account.modmsgtime = None
    account._commit()
