Example #1
def write_permacache_from_dir(dirname):
    # we want the whole list so that we can display accurate progress
    # information. If we're operating on more than tens of millions of
    # files, we should either bail out or tweak this to not need the
    # whole list at once
    allfiles = []
    for root, dirs, files in os.walk(dirname):
        for f in files:
            allfiles.append(os.path.join(root, f))

    for fname in progress(allfiles, persec=True):
        try:
            write_permacache_from_file(fname)
            os.unlink(fname)
        except:
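            # record which file failed, then re-raise so the run stops here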
            mr_tools.status('failed on %r' % fname)
            raise

    mr_tools.status('Removing empty directories')
    for root, dirs, files in os.walk(dirname, topdown=False):
        for d in dirs:
            dname = os.path.join(root, d)
            try:
                os.rmdir(dname)
            except OSError as e:
                if e.errno == errno.ENOTEMPTY:
                    mr_tools.status('%s not empty' % (dname, ))
                else:
                    raise
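
Several of these scripts wrap their iterables in progress() from v1.lib.utils. Its implementation isn't shown in these excerpts; a minimal sketch of the idea, covering the verbosity/estimate/persec/key parameters the later examples pass, might look like this:

import sys
import time

def progress(it, verbosity=100, key=repr, estimate=None, persec=False):
    # Sketch only: yield items unchanged, reporting every `verbosity`
    # items how far along we are and, optionally, the per-second rate.
    start = time.time()
    for seen, item in enumerate(it, 1):
        if seen % verbosity == 0:
            elapsed = time.time() - start
            msg = '%d' % seen
            if estimate:
                msg += '/%d (%.1f%%)' % (estimate, 100.0 * seen / estimate)
            if persec and elapsed > 0:
                msg += ' @ %.1f/s' % (seen / elapsed)
            sys.stderr.write('%s %s\n' % (msg, key(item)))
        yield item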
Example #2
def backfill_deleted_accounts(resume_id=None):
    del_accts = Account._query(Account.c._deleted == True, sort=desc('_date'))
    if resume_id:
        del_accts._filter(Account.c._id < resume_id)

    for i, account in enumerate(progress(fetch_things2(del_accts))):
        # Don't kill the rabbit! Wait for the relevant queues to calm down.
        if i % 1000 == 0:
            del_len = get_queue_length('del_account_q')
            cs_len = get_queue_length('cloudsearch_changes')
            while (del_len > 1000 or
                    cs_len > 10000):
                sys.stderr.write(("CS: %d, DEL: %d" % (cs_len, del_len)) + "\n")
                sys.stderr.flush()
                time.sleep(1)
                del_len = get_queue_length('del_account_q')
                cs_len = get_queue_length('cloudsearch_changes')
        amqp.add_item('account_deleted', account._fullname)
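
The inline queue check could be factored out. A hypothetical wait_for_queues helper (not in the source; it assumes only the get_queue_length call used above) that blocks until every queue is back under its limit:

import sys
import time

def wait_for_queues(limits, poll_interval=1):
    # Hypothetical helper: poll each named queue until all of them are
    # at or below their allowed backlog, then return.
    while True:
        lengths = dict((name, get_queue_length(name)) for name in limits)
        if all(lengths[name] <= limit for name, limit in limits.items()):
            return
        sys.stderr.write('waiting on queues: %r\n' % (lengths,))
        sys.stderr.flush()
        time.sleep(poll_interval)

backfill_deleted_accounts could then call wait_for_queues({'del_account_q': 1000, 'cloudsearch_changes': 10000}) every thousandth account instead of repeating the polling loop.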
Example #3
def port_deleted_links(after_id=None):
    from v1.models import Link
    from v1.lib.db.operators import desc
    from v1.models.query_cache import CachedQueryMutator
    from v1.lib.db.queries import get_deleted_links
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc('_date'),
                    data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
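
in_chunks appears in most of these scripts but is never defined here. A minimal sketch, assuming it takes any iterable plus a batch size:

def in_chunks(it, size=25):
    # Sketch only: yield successive lists of at most `size` items, so
    # callers can batch work (here, one CachedQueryMutator per batch).
    chunk = []
    for item in it:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk  # final partial batch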
Example #4
def port_cassaurls(after_id=None, estimate=15231317):
    from v1.models import Link, LinksByUrlAndSubverbify
    from v1.lib.db import tdb_cassandra
    from v1.lib.db.operators import desc
    from v1.lib.db.tdb_cassandra import CL
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'),
                    data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubverbify.add_link(l)
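
fetch_things2 is what lets these scans stream an entire table instead of materializing one giant result set. Its real implementation isn't shown; a rough sketch of the pagination idea, assuming the query object supports the _limit attribute and the _after call seen in these examples:

def fetch_things2(query, chunk_size=100):
    # Sketch only: repeatedly re-run the query `_after` the last row
    # seen, yielding rows one at a time until the table is exhausted.
    query._limit = chunk_size
    rows = list(query)
    while rows:
        for row in rows:
            yield row
        query._after(rows[-1])
        rows = list(query)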
Example #5
def rebuild_link_index(start_at=None,
                       sleeptime=1,
                       cls=Link,
                       uploader=LinkUploader,
                       doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000,
                       chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = v1utils.fetch_things2(q, chunk_size=chunk_size)
    q = v1utils.progress(q,
                         verbosity=1000,
                         estimate=estimate,
                         persec=True,
                         key=_progress_key)
    for chunk in v1utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
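            # all five attempts failed; err still refers to the last
            # HTTPException (Python 2 keeps it in scope), so re-raise it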
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
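
The for/else construct above is a common retry idiom: the else clause runs only if the loop never hit break. A generic version (not part of the original code) might look like:

import time

def retry(fn, attempts=5, exceptions=(Exception,)):
    # Sketch of the retry idiom above: linear backoff between attempts,
    # re-raising the last error once every attempt has failed.
    last_err = None
    for i in range(attempts):
        try:
            return fn()
        except exceptions as err:
            last_err = err
            time.sleep(i)
    raise last_err

With this helper, the inner loop above would reduce to retry(uploader.inject, exceptions=(httplib.HTTPException,)).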
Example #6
def _populate(after_id=None, estimate=54301242):
    from v1.models import Comment
    from v1.lib.db.operators import desc
    from v1.lib.db import tdb_cassandra
    from v1.lib import utils

    # a larger chunk size has a chance of decreasing the number of
    # Cassandra writes, but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        add_comments(chunk)
Example #7
"""Ensure modmsgtime is properly set on all accounts.

See the comment in Account.is_moderator_somewhere for possible values of this
attribute now.

"""

from v1.lib.db.operators import desc
from v1.lib.utils import fetch_things2, progress
from v1.models import Account, Subverbify

all_accounts = Account._query(sort=desc("_date"))
for account in progress(fetch_things2(all_accounts)):
    is_moderator_somewhere = bool(Subverbify.reverse_moderator_ids(account))
    if is_moderator_somewhere:
        if not account.modmsgtime:
            account.modmsgtime = False
        else:
            # the account already has a datetime in modmsgtime, meaning
            # unread modmail, so leave it as-is
            pass
    else:
        account.modmsgtime = None
    account._commit()
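
Reading the script together with its docstring, modmsgtime appears to end up in one of three states (my reading; the exact semantics live in Account.is_moderator_somewhere, which isn't shown):

# None     -> not a moderator anywhere
# False    -> a moderator with no unread modmail
# datetime -> a moderator with unread modmail since that time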
Example #8
        return False

    # don't show user their own unread stuff
    if msg.author_id == account._id:
        return False

    return True

resume_id = long(sys.argv[1]) if len(sys.argv) > 1 else None

msg_accounts = Account._query(sort=desc("_date"), data=True)

if resume_id:
    msg_accounts._filter(Account.c._id < resume_id)

for account in progress(fetch_things2(msg_accounts), estimate=resume_id):
    current_inbox_count = account.inbox_count
    unread_messages = list(queries.get_unread_inbox(account))

    if account._id % 100000 == 0:
        g.reset_caches()

    if not unread_messages:
        if current_inbox_count:
            account._incr('inbox_count', -current_inbox_count)
    else:
        msgs = Message._by_fullname(
            unread_messages,
            data=True,
            return_dict=False,
            ignore_missing=True,