def write_permacache_from_dir(dirname):
    # we want the whole list so that we can display accurate progress
    # information. If we're operating on more than tens of millions of
    # files, we should either bail out or tweak this to not need the
    # whole list at once
    allfiles = []
    for root, dirs, files in os.walk(dirname):
        for f in files:
            allfiles.append(os.path.join(root, f))

    for fname in progress(allfiles, persec=True):
        try:
            write_permacache_from_file(fname)
            os.unlink(fname)
        except:
            mr_tools.status('failed on %r' % fname)
            raise

    mr_tools.status('Removing empty directories')
    for root, dirs, files in os.walk(dirname, topdown=False):
        for d in dirs:
            dname = os.path.join(root, d)
            try:
                os.rmdir(dname)
            except OSError as e:
                if e.errno == errno.ENOTEMPTY:
                    mr_tools.status('%s not empty' % (dname,))
                else:
                    raise
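
# A minimal sketch (not part of the original script) of the tweak the comment
# above alludes to: stream filenames straight from os.walk instead of
# materializing `allfiles`, trading the accurate progress total for constant
# memory when a directory holds tens of millions of files. It reuses only
# names already used above (os.walk, progress, write_permacache_from_file).
def write_permacache_from_dir_streaming(dirname):
    def iter_files():
        for root, dirs, files in os.walk(dirname):
            for f in files:
                yield os.path.join(root, f)

    # progress() can still report a per-second rate with persec=True; it just
    # can't show a percentage without knowing the total up front.
    for fname in progress(iter_files(), persec=True):
        write_permacache_from_file(fname)
        os.unlink(fname)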
def backfill_deleted_accounts(resume_id=None):
    del_accts = Account._query(Account.c._deleted == True, sort=desc('_date'))
    if resume_id:
        del_accts._filter(Account.c._id < resume_id)

    for i, account in enumerate(progress(fetch_things2(del_accts))):
        # Don't kill the rabbit! Wait for the relevant queues to calm down.
        if i % 1000 == 0:
            del_len = get_queue_length('del_account_q')
            cs_len = get_queue_length('cloudsearch_changes')
            while del_len > 1000 or cs_len > 10000:
                sys.stderr.write("CS: %d, DEL: %d\n" % (cs_len, del_len))
                sys.stderr.flush()
                time.sleep(1)
                del_len = get_queue_length('del_account_q')
                cs_len = get_queue_length('cloudsearch_changes')
        amqp.add_item('account_deleted', account._fullname)
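
# A hypothetical refactoring sketch (not in the original): the queue
# backpressure check above, pulled into a helper so other backfills feeding
# del_account_q / cloudsearch_changes could reuse it. It assumes only
# get_queue_length, sys, and time, which the function above already relies on.
# Example: wait_for_queues({'del_account_q': 1000, 'cloudsearch_changes': 10000})
def wait_for_queues(limits, poll_interval=1):
    """Block until every queue named in `limits` is at or below its limit."""
    while True:
        lengths = dict((name, get_queue_length(name)) for name in limits)
        if all(lengths[name] <= limit for name, limit in limits.items()):
            return
        sys.stderr.write("queue backlog: %r\n" % (lengths,))
        sys.stderr.flush()
        time.sleep(poll_interval)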
def port_deleted_links(after_id=None):
    from v1.models import Link
    from v1.lib.db.operators import desc
    from v1.models.query_cache import CachedQueryMutator
    from v1.lib.db.queries import get_deleted_links
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
def port_cassaurls(after_id=None, estimate=15231317):
    from v1.models import Link, LinksByUrlAndSubverbify
    from v1.lib.db import tdb_cassandra
    from v1.lib.db.operators import desc
    from v1.lib.db.tdb_cassandra import CL
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    # skip self posts, which have no external URL to index by
    q = (l for l in q
         if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubverbify.add_link(l)
def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = v1utils.fetch_things2(q, chunk_size=chunk_size)
    q = v1utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in v1utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        # retry transient upload failures a few times, sleeping a little
        # longer before each attempt; give up after five tries
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
def _populate(after_id=None, estimate=54301242):
    from v1.models import desc
    from v1.lib.db import tdb_cassandra
    from v1.lib import utils

    # larger has a chance to decrease the number of Cassandra writes,
    # but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        add_comments(chunk)
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is verbify Inc.
#
# All portions of the code written by verbify are Copyright (c) 2006-2015 verbify
# Inc. All Rights Reserved.
###############################################################################
"""Ensure modmsgtime is properly set on all accounts.

See the comment in Account.is_moderator_somewhere for possible values of this
attribute now.
"""

from v1.lib.db.operators import desc
from v1.lib.utils import fetch_things2, progress
from v1.models import Account, Subverbify

all_accounts = Account._query(sort=desc("_date"))
for account in progress(fetch_things2(all_accounts)):
    is_moderator_somewhere = bool(Subverbify.reverse_moderator_ids(account))
    if is_moderator_somewhere:
        if not account.modmsgtime:
            account.modmsgtime = False
        else:
            # the account already has a date for modmsgtime, meaning they
            # have unread mod mail; leave it alone
            pass
    else:
        account.modmsgtime = None
    account._commit()
        return False

    # don't show user their own unread stuff
    if msg.author_id == account._id:
        return False

    return True


resume_id = long(sys.argv[1]) if len(sys.argv) > 1 else None

msg_accounts = Account._query(sort=desc("_date"), data=True)
if resume_id:
    msg_accounts._filter(Account.c._id < resume_id)

for account in progress(fetch_things2(msg_accounts), estimate=resume_id):
    current_inbox_count = account.inbox_count
    unread_messages = list(queries.get_unread_inbox(account))

    if account._id % 100000 == 0:
        g.reset_caches()

    if not len(unread_messages):
        if current_inbox_count:
            account._incr('inbox_count', -current_inbox_count)
    else:
        msgs = Message._by_fullname(
            unread_messages,
            data=True,
            return_dict=False,
            ignore_missing=True,