def subverbify_sitemaps(subverbifys):
    """Generate sitemaps for the given subverbifys.

    Each sitemap holds up to 50000 links, the maximum number of links
    allowed by the sitemap standard.
    """
    for subverbify_chunk in in_chunks(subverbifys, LINKS_PER_SITEMAP):
        yield _subverbify_sitemap(subverbify_chunk)
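Every function in this section leans on the in_chunks helper from v1.lib.utils, which is not shown here. A minimal sketch of such a batching generator, assuming the conventional signature in_chunks(iterable, size) and a small default batch size (the real implementation in v1.lib.utils may differ):

def in_chunks(iterable, size=25):
    # Yield lists of at most `size` items from any iterable; the final list
    # may be shorter, and nothing is yielded for an empty iterable.
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk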
def simple_get_multi(self, keys):
    results = {}
    category_bundles = {}
    for key in keys:
        category, ids = self._split_key(key)
        category_bundles.setdefault(category, []).append(ids)
    for category in category_bundles:
        idses = category_bundles[category]
        chunks = in_chunks(idses, size=50)
        for chunk in chunks:
            new_results = self.backend.get_multi(category, chunk)
            results.update(new_results)
    return results
def port_deleted_links(after_id=None):
    from v1.models import Link
    from v1.lib.db.operators import desc
    from v1.models.query_cache import CachedQueryMutator
    from v1.lib.db.queries import get_deleted_links
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
def port_cassaurls(after_id=None, estimate=15231317):
    from v1.models import Link, LinksByUrlAndSubverbify
    from v1.lib.db import tdb_cassandra
    from v1.lib.db.operators import desc
    from v1.lib.db.tdb_cassandra import CL
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q
         if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))

    chunks = in_chunks(q, 500)
    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubverbify.add_link(l)
def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = v1utils.fetch_things2(q, chunk_size=chunk_size)
    q = v1utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in v1utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        # Retry the upload up to five times, sleeping a little longer after
        # each failure; if every attempt fails, re-raise the last error.
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
def _location_by_ips(ips):
    if not hasattr(g, 'geoip_location'):
        g.log.warning("g.geoip_location not set. skipping GeoIP lookup.")
        return {}

    ret = {}
    for batch in in_chunks(ips, MAX_IPS_PER_GROUP):
        ip_string = '+'.join(batch)
        url = os.path.join(g.geoip_location, 'geoip', ip_string)

        try:
            response = urllib2.urlopen(url=url, timeout=3)
            json_data = response.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as e:
            g.log.warning("Failed to fetch GeoIP information: %r" % e)
            continue

        try:
            ret.update(json.loads(json_data))
        except ValueError as e:
            g.log.warning("Invalid JSON response for GeoIP lookup: %r" % e)
            continue

    return ret
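A hypothetical call sketch, assuming the service behind g.geoip_location returns a JSON object keyed by IP address (the exact response shape depends on that service; the addresses below are illustrative only):

locations = _location_by_ips(["203.0.113.7", "198.51.100.23"])
# e.g. {"203.0.113.7": {"country_code": "US", ...}, ...} for successful
# lookups; batches that fail or return invalid JSON are skipped entirely.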
def _populate(after_id=None, estimate=54301242):
    from v1.lib.db.operators import desc
    from v1.lib.db import tdb_cassandra
    from v1.lib import utils

    # A larger chunk size has a chance of reducing the number of Cassandra
    # writes, but the probability is low.
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        add_comments(chunk)
def backfill_vote_details(cls):
    ninety_days = timedelta(days=90).total_seconds()
    for chunk in in_chunks(cls._all(), size=100):
        detail_chunk = defaultdict(dict)
        try:
            with VoterIPByThing._cf.batch(
                    write_consistency_level=cls._write_consistency_level) as b:
                for vote_list in chunk:
                    thing_id36 = vote_list._id
                    thing_fullname = vote_list.votee_fullname
                    details = vote_list.decode_details()
                    for detail in details:
                        voter_id36 = detail["voter_id"]
                        if "ip" in detail and detail["ip"]:
                            ip = detail["ip"]
                            redacted = dict(detail)
                            del redacted["ip"]
                            cast = detail["date"]
                            now = epoch_seconds(
                                datetime.utcnow().replace(tzinfo=g.tz))
                            # Keep the IP only for the remainder of the 90-day
                            # window measured from when the vote was cast.
                            ttl = ninety_days - (now - cast)
                            oneweek = ""
                            if ttl < 3600 * 24 * 7:
                                oneweek = "(<= one week left)"
                            print "Inserting %s with IP ttl %d %s" % (
                                redacted, ttl, oneweek)
                            detail_chunk[thing_id36][voter_id36] = \
                                json.dumps(redacted)
                            if ttl <= 0:
                                print "Skipping bogus ttl for %s: %d" % (
                                    redacted, ttl)
                                continue
                            b.insert(thing_fullname, {voter_id36: ip}, ttl=ttl)
        except Exception:
            # Getting some really weird spurious errors here; complaints about
            # negative TTLs even though they can't possibly be negative, and
            # errors from cass that have an explanation of "(why=')".
            # Just going to brute-force this through. We might lose 100 here
            # and there, but mostly it'll be intact.
            pass
        for votee_id36, valuedict in detail_chunk.iteritems():
            cls._set_values(votee_id36, valuedict)
def pushup_permacache(verbosity=1000):
    """When putting cassandra into the permacache chain, we need to push
       everything up into the rest of the chain, so this is everything that
       uses the permacache, as of that check-in."""
    from pylons import app_globals as g
    from v1.models import Link, Subverbify, Account
    from v1.lib.db.operators import desc
    from v1.lib.comment_tree import comments_key, messages_key
    from v1.lib.utils import fetch_things2, in_chunks
    from v1.lib.utils import last_modified_key
    from v1.lib.promote import promoted_memo_key
    from v1.lib.subverbify_search import load_all_verbifys
    from v1.lib.db import queries
    from v1.lib.cache import CassandraCacheChain

    authority = g.permacache.caches[-1]
    nonauthority = CassandraCacheChain(g.permacache.caches[1:-1])

    def populate(keys):
        vals = authority.simple_get_multi(keys)
        if vals:
            nonauthority.set_multi(vals)

    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_verbifys()

        yield queries.get_all_comments().iden

        l_q = Link._query(
            Link.c._spam == (True, False),
            Link.c._deleted == (True, False),
            sort=desc('_date'),
            data=True,
        )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        a_q = Account._query(
            Account.c._spam == (True, False),
            sort=desc('_date'),
        )
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, 'overview')
            yield last_modified_key(account, 'commented')
            yield last_modified_key(account, 'submitted')
            yield last_modified_key(account, 'liked')
            yield last_modified_key(account, 'disliked')
            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subverbify._query(
            Subverbify.c._spam == (True, False),
            sort=desc('_date'),
        )
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')
            yield queries.get_links(sr, 'hot', 'all').iden
            yield queries.get_links(sr, 'new', 'all').iden

            for sort in 'top', 'controversial':
                for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                    yield queries.get_links(sr, sort, time,
                                            merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subverbify_messages(sr).iden
            yield queries.get_unread_subverbify_messages(sr).iden

    done = 0
    for keys in in_chunks(gen_keys(), verbosity):
        g.reset_caches()
        done += len(keys)
        print 'Done %d: %r' % (done, keys[-1])
        populate(keys)