def init_cass_cache(self, cluster, caches,
                    cassandra_seeds,
                    memcached_kw=None, cassandra_kw=None):
    """Build a permacache-style chain, register it and return it.

    The chain always starts with a local cache (``SelfEmptyingCache``
    when running as a script, ``LocalCache`` otherwise).  A ``CMemcache``
    layer is appended when ``caches`` is truthy, and a ``CassandraCache``
    layer is appended when ``cassandra_seeds`` is truthy.

    ``cluster``
        Passed as the first two positional arguments of
        ``CassandraCache``.
    ``caches``
        Memcached server list; a falsy value skips the memcached layer.
    ``cassandra_seeds``
        Iterable of Cassandra seeds.  It is copied and the copy is
        shuffled, so the caller's sequence is left untouched.
    ``memcached_kw`` / ``cassandra_kw``
        Optional extra keyword arguments forwarded to ``CMemcache`` and
        ``CassandraCache`` respectively.  ``None`` means "no extras".

    Returns a ``CassandraCacheChain`` (negative results cached) when
    seeds were given, otherwise a ``MemcacheChain``.  The returned chain
    is also appended to ``self.cache_chains``.
    """
    # None sentinels instead of ``{}`` defaults: a dict default is created
    # once at definition time and shared between every call.
    if memcached_kw is None:
        memcached_kw = {}
    if cassandra_kw is None:
        cassandra_kw = {}

    localcache_cls = (SelfEmptyingCache if self.running_as_script
                      else LocalCache)

    pmc_chain = (localcache_cls(), )

    # if caches, append
    if caches:
        pmc_chain += (CMemcache(caches, num_clients=self.num_mc_clients,
                                **memcached_kw), )

    # if seeds, append
    if cassandra_seeds:
        # shuffle a private copy of the seed list
        cassandra_seeds = list(cassandra_seeds)
        random.shuffle(cassandra_seeds)
        # NOTE(review): ``cluster`` is passed twice on purpose in the
        # original code; confirm against CassandraCache's signature.
        pmc_chain += (CassandraCache(cluster, cluster, cassandra_seeds,
                                     **cassandra_kw), )
        mc = CassandraCacheChain(pmc_chain, cache_negative_results=True)
    else:
        mc = MemcacheChain(pmc_chain)

    self.cache_chains.append(mc)
    return mc
def init_cass_cache(self, keyspace, column_family, cassandra_client,
                    lock_factory,
                    memcache = None,
                    read_consistency_level = CL_ONE,
                    write_consistency_level = CL_ONE,
                    localcache_cls = LocalCache):
    """Return a ``CassandraCacheChain`` backed by one column family.

    A fresh ``localcache_cls()`` instance sits in front of a
    ``CassandraCache`` built from ``keyspace``, ``column_family`` and
    ``cassandra_client`` with the given consistency levels.  ``memcache``
    and ``lock_factory`` are handed through to the chain unchanged.
    """
    # The local layer is created before the Cassandra layer, matching the
    # argument evaluation order of the single-expression form.
    local_layer = localcache_cls()
    cassandra_layer = CassandraCache(
        keyspace,
        column_family,
        cassandra_client,
        read_consistency_level = read_consistency_level,
        write_consistency_level = write_consistency_level,
    )
    return CassandraCacheChain(
        local_layer,
        cassandra_layer,
        memcache = memcache,
        lock_factory = lock_factory,
    )
def setup(self):
    """Finish initialising the application globals.

    Runs, in order: queue declaration, media provider selection,
    configuration sanity checks and derived settings, logging setup,
    ZooKeeper (or ini-file) live config and secrets, privileged user
    lists, memcache pools, the Cassandra pool, Postgres parameters, the
    cache chains, and finally source revision recording.  Each stage is
    marked on ``self.startup_timer``.

    Raises ``ValueError`` when ``amqp_host`` or ``cassandra_seeds`` is
    not configured.
    """
    self.queues = queues.declare_queues(self)

    ################# PROVIDERS
    self.media_provider = select_provider(
        self.config,
        self.pkg_resources_working_set,
        "r2.provider.media",
        self.media_provider,
    )
    self.startup_timer.intermediate("providers")

    ################# CONFIGURATION
    # AMQP is required
    if not self.amqp_host:
        raise ValueError("amqp_host not set in the .ini")

    if not self.cassandra_seeds:
        raise ValueError("cassandra_seeds not set in the .ini")

    # heavy load mode is read only mode with a different infobar
    if self.heavy_load_mode:
        self.read_only_mode = True

    origin_prefix = self.domain_prefix + "." if self.domain_prefix else ""
    self.origin = "http://" + origin_prefix + self.domain

    self.trusted_domains = set([self.domain])
    if self.https_endpoint:
        https_url = urlparse(self.https_endpoint)
        self.trusted_domains.add(https_url.hostname)

    # load the unique hashed names of files under static
    static_files = os.path.join(self.paths.get('static_files'), 'static')
    names_file_path = os.path.join(static_files, 'names.json')
    if os.path.exists(names_file_path):
        with open(names_file_path) as handle:
            self.static_names = json.load(handle)
    else:
        self.static_names = {}

    # make python warnings go through the logging system
    logging.captureWarnings(capture=True)

    log = logging.getLogger('reddit')

    # when we're a script (paster run) just set up super simple logging
    if self.running_as_script:
        log.setLevel(logging.INFO)
        log.addHandler(logging.StreamHandler())

    # if in debug mode, override the logging level to DEBUG
    if self.debug:
        log.setLevel(logging.DEBUG)

    # attempt to figure out which pool we're in and add that to the
    # LogRecords.
    try:
        with open("/etc/ec2_asg", "r") as f:
            pool = f.read().strip()
        # clean up the pool name since we're putting stuff after "-"
        pool = pool.partition("-")[0]
    except IOError:
        pool = "reddit-app"
    self.log = logging.LoggerAdapter(log, {"pool": pool})

    # set locations
    locations = pkg_resources.resource_stream(__name__,
                                              "../data/locations.json")
    try:
        self.locations = json.loads(locations.read())
    finally:
        # the stream used to be left open; release it explicitly
        locations.close()

    if not self.media_domain:
        self.media_domain = self.domain
    if self.media_domain == self.domain:
        print >> sys.stderr, ("Warning: g.media_domain == g.domain. " +
               "This may give untrusted content access to user cookies")
    if self.oauth_domain == self.domain:
        print >> sys.stderr, ("Warning: g.oauth_domain == g.domain. "
               "CORS requests to g.domain will be allowed")

    # any "key=value" command line argument overrides the attribute of
    # the same name (the value stays a string)
    for arg in sys.argv:
        tokens = arg.split("=")
        if len(tokens) == 2:
            k, v = tokens
            self.log.debug("Overriding g.%s to %s" % (k, v))
            setattr(self, k, v)

    self.reddit_host = socket.gethostname()
    self.reddit_pid = os.getpid()

    if hasattr(signal, 'SIGUSR1'):
        # not all platforms have user signals
        signal.signal(signal.SIGUSR1, thread_dump)

    locale.setlocale(locale.LC_ALL, self.locale)

    # Pre-calculate ratelimit values
    self.RL_RESET_SECONDS = self.config["RL_RESET_MINUTES"] * 60
    self.RL_MAX_REQS = int(self.config["RL_AVG_REQ_PER_SEC"] *
                           self.RL_RESET_SECONDS)

    self.RL_OAUTH_RESET_SECONDS = self.config["RL_OAUTH_RESET_MINUTES"] * 60
    self.RL_OAUTH_MAX_REQS = int(self.config["RL_OAUTH_AVG_REQ_PER_SEC"] *
                                 self.RL_OAUTH_RESET_SECONDS)

    self.RL_LOGIN_MAX_REQS = int(self.config["RL_LOGIN_AVG_PER_SEC"] *
                                 self.RL_RESET_SECONDS)

    self.startup_timer.intermediate("configuration")

    ################# ZOOKEEPER
    # for now, zookeeper will be an optional part of the stack.
    # if it's not configured, we will grab the expected config from the
    # [live_config] section of the ini file
    zk_hosts = self.config.get("zookeeper_connection_string")
    if zk_hosts:
        from r2.lib.zookeeper import (connect_to_zookeeper,
                                      LiveConfig, LiveList)
        zk_username = self.config["zookeeper_username"]
        zk_password = self.config["zookeeper_password"]
        self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username,
                                                         zk_password))
        self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE)
        self.secrets = fetch_secrets(self.zookeeper)
        self.throttles = LiveList(self.zookeeper, "/throttles",
                                  map_fn=ipaddress.ip_network,
                                  reduce_fn=ipaddress.collapse_addresses)

        # close our zk connection when the app shuts down
        SHUTDOWN_CALLBACKS.append(self.zookeeper.stop)
    else:
        self.zookeeper = None
        parser = ConfigParser.RawConfigParser()
        # keep option names case-sensitive
        parser.optionxform = str
        parser.read([self.config["__file__"]])
        self.live_config = extract_live_config(parser, self.plugins)
        self.secrets = extract_secrets(parser)
        self.throttles = tuple()  # immutable since it's not real

    self.startup_timer.intermediate("zookeeper")

    ################# PRIVILEGED USERS
    self.admins = PermissionFilteredEmployeeList(
        self.live_config, type="admin")
    self.sponsors = PermissionFilteredEmployeeList(
        self.live_config, type="sponsor")
    self.employees = PermissionFilteredEmployeeList(
        self.live_config, type="employee")

    ################# MEMCACHE
    num_mc_clients = self.num_mc_clients

    # the main memcache pool. used for most everything.
    memcache = CMemcache(
        self.memcaches,
        min_compress_len=1400,
        num_clients=num_mc_clients,
        binary=True,
    )

    # a pool just used for @memoize results
    memoizecaches = CMemcache(
        self.memoizecaches,
        min_compress_len=50 * 1024,
        num_clients=num_mc_clients,
        binary=True,
    )

    # a pool just for srmember rels
    srmembercaches = CMemcache(
        self.srmembercaches,
        min_compress_len=96,
        num_clients=num_mc_clients,
        binary=True,
    )

    # a pool just for rels
    relcaches = CMemcache(
        self.relcaches,
        min_compress_len=96,
        num_clients=num_mc_clients,
        binary=True,
    )

    ratelimitcaches = CMemcache(
        self.ratelimitcaches,
        min_compress_len=96,
        num_clients=num_mc_clients,
    )

    # a smaller pool of caches used only for distributed locks.
    # TODO: move this to ZooKeeper
    self.lock_cache = CMemcache(self.lockcaches,
                                binary=True,
                                num_clients=num_mc_clients)
    self.make_lock = make_lock_factory(self.lock_cache, self.stats)

    # memcaches used in front of the permacache CF in cassandra.
    # XXX: this is a legacy thing; permacache was made when C* didn't have
    # a row cache.
    if self.permacache_memcaches:
        permacache_memcaches = CMemcache(self.permacache_memcaches,
                                         min_compress_len=50 * 1024,
                                         num_clients=num_mc_clients)
    else:
        permacache_memcaches = None

    # the stalecache is a memcached local to the current app server used
    # for data that's frequently fetched but doesn't need to be fresh.
    if self.stalecaches:
        stalecaches = CMemcache(self.stalecaches,
                                binary=True,
                                num_clients=num_mc_clients)
    else:
        stalecaches = None

    # rendercache holds rendered partial templates.
    rendercaches = CMemcache(
        self.rendercaches,
        noreply=True,
        no_block=True,
        num_clients=num_mc_clients,
        min_compress_len=480,
    )

    # pagecaches hold fully rendered pages
    pagecaches = CMemcache(
        self.pagecaches,
        noreply=True,
        no_block=True,
        num_clients=num_mc_clients,
        min_compress_len=1400,
    )

    self.startup_timer.intermediate("memcache")

    ################# CASSANDRA
    keyspace = "reddit"
    self.cassandra_pools = {
        "main":
            StatsCollectingConnectionPool(
                keyspace,
                stats=self.stats,
                logging_name="main",
                server_list=self.cassandra_seeds,
                pool_size=self.cassandra_pool_size,
                timeout=4,
                max_retries=3,
                prefill=False
            ),
    }

    permacache_cf = CassandraCache(
        'permacache',
        self.cassandra_pools[self.cassandra_default_pool],
        read_consistency_level=self.cassandra_rcl,
        write_consistency_level=self.cassandra_wcl
    )

    self.startup_timer.intermediate("cassandra")

    ################# POSTGRES
    self.dbm = self.load_db_params()
    self.startup_timer.intermediate("postgres")

    ################# CHAINS
    # initialize caches. Any cache-chains built here must be added
    # to cache_chains (closed around by reset_caches) so that they
    # can properly reset their local components
    cache_chains = {}
    localcache_cls = (SelfEmptyingCache if self.running_as_script
                      else LocalCache)

    def stale_or_plain_chain(backing_pool):
        # Put the stalecache between the local cache and the backing
        # pool when one is configured, otherwise chain them directly.
        if stalecaches:
            return StaleCacheChain(
                localcache_cls(),
                stalecaches,
                backing_pool,
            )
        return MemcacheChain((localcache_cls(), backing_pool))

    self.cache = stale_or_plain_chain(memcache)
    cache_chains.update(cache=self.cache)

    self.memoizecache = stale_or_plain_chain(memoizecaches)
    cache_chains.update(memoizecache=self.memoizecache)

    self.srmembercache = stale_or_plain_chain(srmembercaches)
    cache_chains.update(srmembercache=self.srmembercache)

    self.relcache = stale_or_plain_chain(relcaches)
    cache_chains.update(relcache=self.relcache)

    self.ratelimitcache = MemcacheChain(
        (localcache_cls(), ratelimitcaches))
    cache_chains.update(ratelimitcache=self.ratelimitcache)

    self.rendercache = MemcacheChain((
        localcache_cls(),
        rendercaches,
    ))
    cache_chains.update(rendercache=self.rendercache)

    self.pagecache = MemcacheChain((
        localcache_cls(),
        pagecaches,
    ))
    cache_chains.update(pagecache=self.pagecache)

    # the thing_cache is used in tdb_cassandra.
    self.thing_cache = CacheChain((localcache_cls(),))
    cache_chains.update(thing_cache=self.thing_cache)

    self.permacache = CassandraCacheChain(
        localcache_cls(),
        permacache_cf,
        memcache=permacache_memcaches,
        lock_factory=self.make_lock,
    )
    cache_chains.update(permacache=self.permacache)

    # hardcache is used for various things that tend to expire
    # TODO: replace hardcache w/ cassandra stuff
    self.hardcache = HardcacheChain(
        (localcache_cls(), memcache, HardCache(self)),
        cache_negative_results=True,
    )
    cache_chains.update(hardcache=self.hardcache)

    # I know this sucks, but we need non-request-threads to be
    # able to reset the caches, so we need them be able to close
    # around 'cache_chains' without being able to call getattr on
    # 'g'
    def reset_caches():
        for name, chain in cache_chains.iteritems():
            chain.reset()
            chain.stats = CacheStats(self.stats, name)
    self.cache_chains = cache_chains

    self.reset_caches = reset_caches
    self.reset_caches()

    self.startup_timer.intermediate("cache_chains")

    # try to set the source control revision numbers
    self.versions = {}
    r2_root = os.path.dirname(os.path.dirname(self.paths["root"]))
    r2_gitdir = os.path.join(r2_root, ".git")
    self.short_version = self.record_repo_version("r2", r2_gitdir)

    if I18N_PATH:
        i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git")
        self.record_repo_version("i18n", i18n_git_path)

    self.startup_timer.intermediate("revisions")
def setup(self):
    """Finish initialising the application globals.

    Stages, in order: queue declaration, configuration checks and
    derived settings, ZooKeeper (or ini-file) live config, memcache
    pools, the Cassandra pool, Postgres instrumentation and parameters,
    the cache chains, and source revision recording.  Each stage is
    marked on ``self.startup_timer``.

    Raises ``ValueError`` when ``amqp_host`` or ``cassandra_seeds`` is
    unset, and ``Exception`` when ``write_query_queue`` is enabled
    without ``use_query_cache``.
    """
    self.queues = queues.declare_queues(self)

    ################# CONFIGURATION
    # AMQP is required
    if not self.amqp_host:
        raise ValueError("amqp_host not set in the .ini")

    # This requirement doesn't *have* to be a requirement, but there are
    # bugs at the moment that will pop up if you violate it
    # XXX: get rid of these options. new query cache is always on.
    if self.write_query_queue and not self.use_query_cache:
        raise Exception("write_query_queue requires use_query_cache")

    if not self.cassandra_seeds:
        raise ValueError("cassandra_seeds not set in the .ini")

    # heavy load mode is read only mode with a different infobar
    if self.heavy_load_mode:
        self.read_only_mode = True

    if self.domain_prefix:
        prefix = self.domain_prefix + "."
    else:
        prefix = ""
    self.origin = "http://" + prefix + self.domain
    self.secure_domains = set([urlparse(self.payment_domain).netloc])

    self.trusted_domains = set([self.domain])
    self.trusted_domains.update(self.authorized_cnames)
    if self.https_endpoint:
        endpoint = urlparse(self.https_endpoint)
        self.secure_domains.add(endpoint.netloc)
        self.trusted_domains.add(endpoint.hostname)
    if getattr(self, 'oauth_domain', None):
        self.secure_domains.add(self.oauth_domain)

    # load the unique hashed names of files under static
    names_path = os.path.join(
        os.path.join(self.paths.get('static_files'), 'static'),
        'names.json')
    if os.path.exists(names_path):
        with open(names_path) as names_file:
            self.static_names = json.load(names_file)
    else:
        self.static_names = {}

    # setup the logger
    self.log = logging.getLogger('reddit')
    self.log.addHandler(logging.StreamHandler())
    self.log.setLevel(logging.DEBUG if self.debug else logging.INFO)

    # set log level for pycountry which is chatty
    logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

    if not self.media_domain:
        self.media_domain = self.domain
    if self.media_domain == self.domain:
        warning = ("Warning: g.media_domain == g.domain. " +
                   "This may give untrusted content access to user cookies")
        print(warning)

    # "key=value" command line arguments override same-named attributes
    for arg in sys.argv:
        pieces = arg.split("=")
        if len(pieces) != 2:
            continue
        key, value = pieces
        self.log.debug("Overriding g.%s to %s" % (key, value))
        setattr(self, key, value)

    self.reddit_host = socket.gethostname()
    self.reddit_pid = os.getpid()

    if hasattr(signal, 'SIGUSR1'):
        # not all platforms have user signals
        signal.signal(signal.SIGUSR1, thread_dump)

    self.startup_timer.intermediate("configuration")

    ################# ZOOKEEPER
    # for now, zookeeper will be an optional part of the stack.
    # if it's not configured, we will grab the expected config from the
    # [live_config] section of the ini file
    zk_connection = self.config.get("zookeeper_connection_string")
    if zk_connection:
        from r2.lib.zookeeper import (connect_to_zookeeper,
                                      LiveConfig, LiveList)
        zk_credentials = (self.config["zookeeper_username"],
                          self.config["zookeeper_password"])
        self.zookeeper = connect_to_zookeeper(zk_connection, zk_credentials)
        self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE)
        self.throttles = LiveList(self.zookeeper, "/throttles",
                                  map_fn=ipaddress.ip_network,
                                  reduce_fn=ipaddress.collapse_addresses)
    else:
        self.zookeeper = None
        ini_parser = ConfigParser.RawConfigParser()
        ini_parser.read([self.config["__file__"]])
        self.live_config = extract_live_config(ini_parser, self.plugins)
        self.throttles = tuple()  # immutable since it's not real

    self.startup_timer.intermediate("zookeeper")

    ################# MEMCACHE
    client_count = self.num_mc_clients

    # the main memcache pool. used for most everything.
    self.memcache = CMemcache(self.memcaches, num_clients=client_count)

    # a smaller pool of caches used only for distributed locks.
    # TODO: move this to ZooKeeper
    self.lock_cache = CMemcache(self.lockcaches, num_clients=client_count)
    self.make_lock = make_lock_factory(self.lock_cache, self.stats)

    # memcaches used in front of the permacache CF in cassandra.
    # XXX: this is a legacy thing; permacache was made when C* didn't have
    # a row cache.
    perma_mc = (CMemcache(self.permacache_memcaches,
                          num_clients=client_count)
                if self.permacache_memcaches else None)

    # the stalecache is a memcached local to the current app server used
    # for data that's frequently fetched but doesn't need to be fresh.
    stale_mc = (CMemcache(self.stalecaches, num_clients=client_count)
                if self.stalecaches else None)

    # rendercache holds rendered partial templates as well as fully
    # cached pages.
    render_mc = CMemcache(
        self.rendercaches,
        noreply=True,
        no_block=True,
        num_clients=client_count,
    )

    self.startup_timer.intermediate("memcache")

    ################# CASSANDRA
    main_pool = StatsCollectingConnectionPool(
        "reddit",
        stats=self.stats,
        logging_name="main",
        server_list=self.cassandra_seeds,
        pool_size=self.cassandra_pool_size,
        timeout=2,
        max_retries=3,
        prefill=False
    )
    self.cassandra_pools = {"main": main_pool}

    perma_cf = CassandraCache(
        'permacache',
        self.cassandra_pools[self.cassandra_default_pool],
        read_consistency_level=self.cassandra_rcl,
        write_consistency_level=self.cassandra_wcl
    )

    self.startup_timer.intermediate("cassandra")

    ################# POSTGRES
    on_before_execute = event.listens_for(engine.Engine,
                                          'before_cursor_execute')
    on_before_execute(self.stats.pg_before_cursor_execute)
    on_after_execute = event.listens_for(engine.Engine,
                                         'after_cursor_execute')
    on_after_execute(self.stats.pg_after_cursor_execute)

    self.dbm = self.load_db_params()
    self.startup_timer.intermediate("postgres")

    ################# CHAINS
    # initialize caches. Any cache-chains built here must be added
    # to cache_chains (closed around by reset_caches) so that they
    # can properly reset their local components
    self.cache_chains = {}
    if self.running_as_script:
        localcache_cls = SelfEmptyingCache
    else:
        localcache_cls = LocalCache

    if stale_mc:
        self.cache = StaleCacheChain(
            localcache_cls(),
            stale_mc,
            self.memcache,
        )
    else:
        self.cache = MemcacheChain((localcache_cls(), self.memcache))
    self.cache_chains["cache"] = self.cache

    self.rendercache = MemcacheChain((localcache_cls(), render_mc))
    self.cache_chains["rendercache"] = self.rendercache

    # the thing_cache is used in tdb_cassandra.
    self.thing_cache = CacheChain((localcache_cls(),))
    self.cache_chains["thing_cache"] = self.thing_cache

    self.permacache = CassandraCacheChain(
        localcache_cls(),
        perma_cf,
        memcache=perma_mc,
        lock_factory=self.make_lock,
    )
    self.cache_chains["permacache"] = self.permacache

    # hardcache is used for various things that tend to expire
    # TODO: replace hardcache w/ cassandra stuff
    self.hardcache = HardcacheChain(
        (localcache_cls(), self.memcache, HardCache(self)),
        cache_negative_results=True,
    )
    self.cache_chains["hardcache"] = self.hardcache

    # I know this sucks, but we need non-request-threads to be
    # able to reset the caches, so we need them be able to close
    # around 'cache_chains' without being able to call getattr on
    # 'g'
    chains_snapshot = dict(self.cache_chains)

    def reset_caches():
        for chain_name, chain in chains_snapshot.iteritems():
            chain.reset()
            chain.stats = CacheStats(self.stats, chain_name)

    self.reset_caches = reset_caches
    self.reset_caches()

    self.startup_timer.intermediate("cache_chains")

    # try to set the source control revision numbers
    self.versions = {}
    repo_root = os.path.dirname(os.path.dirname(self.paths["root"]))
    self.short_version = self.record_repo_version(
        "r2", os.path.join(repo_root, ".git"))

    if I18N_PATH:
        i18n_gitdir = os.path.join(os.path.dirname(I18N_PATH), ".git")
        self.record_repo_version("i18n", i18n_gitdir)

    self.startup_timer.intermediate("revisions")
def pushup_permacache(verbosity=1000):
    """Copy every permacache key from the last cache in the chain into
    the caches in front of it.

    Written for the check-in that put cassandra into the permacache
    chain: the key list below is everything that used the permacache as
    of that check-in.  Keys are processed in chunks of ``verbosity``,
    with a progress line printed per chunk.
    """
    from pylons import app_globals as g
    from r2.models import Link, Subreddit, Account
    from r2.lib.db.operators import desc
    from r2.lib.comment_tree import comments_key, messages_key
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.utils import last_modified_key
    from r2.lib.promote import promoted_memo_key
    from r2.lib.subreddit_search import load_all_reddits
    from r2.lib.db import queries
    from r2.lib.cache import CassandraCacheChain

    # the last cache is the source of truth; everything between the first
    # cache and the last one gets written to
    authority = g.permacache.caches[-1]
    nonauthority = CassandraCacheChain(g.permacache.caches[1:-1])

    def populate(keys):
        found = authority.simple_get_multi(keys)
        if found:
            nonauthority.set_multi(found)

    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_reddits()

        yield queries.get_all_comments().iden

        link_query = Link._query(
            Link.c._spam == (True, False),
            Link.c._deleted == (True, False),
            sort=desc('_date'),
            data=True,
        )
        for link in fetch_things2(link_query, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        account_query = Account._query(
            Account.c._spam == (True, False),
            sort=desc('_date'),
        )
        for account in fetch_things2(account_query, verbosity):
            yield messages_key(account._id)

            for listing in ('overview', 'commented', 'submitted',
                            'liked', 'disliked'):
                yield last_modified_key(account, listing)

            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden

            # single-argument account queries, looked up one at a time so
            # the order of attribute access matches the order of yields
            for query_name in ('get_liked', 'get_disliked', 'get_hidden',
                               'get_saved', 'get_inbox_messages',
                               'get_unread_messages', 'get_inbox_comments',
                               'get_unread_comments', 'get_inbox_selfreply',
                               'get_unread_selfreply', 'get_sent'):
                yield getattr(queries, query_name)(account).iden

        subreddit_query = Subreddit._query(
            Subreddit.c._spam == (True, False),
            sort=desc('_date'),
        )
        for sr in fetch_things2(subreddit_query, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')

            for ordering in ('hot', 'new'):
                yield queries.get_links(sr, ordering, 'all').iden

            for ordering in ('top', 'controversial'):
                for period in ('hour', 'day', 'week',
                               'month', 'year', 'all'):
                    yield queries.get_links(sr, ordering, period,
                                            merge_batched=False).iden

            for query_name in ('get_spam_links', 'get_spam_comments',
                               'get_reported_links',
                               'get_reported_comments',
                               'get_subreddit_messages',
                               'get_unread_subreddit_messages'):
                yield getattr(queries, query_name)(sr).iden

    processed = 0
    for chunk in in_chunks(gen_keys(), verbosity):
        g.reset_caches()
        processed += len(chunk)
        print('Done %d: %r' % (processed, chunk[-1]))
        populate(chunk)
def setup(self):
    """Finish initialising the application globals.

    Installs the SIGUSR1 handler, loads live config (ZooKeeper or ini
    file), builds the memcache and Cassandra pools and the cache chains,
    loads the database parameters, derives domain settings, configures
    logging, applies ``key=value`` command line overrides and records
    the source revisions.

    Raises ``ValueError`` when ``cassandra_seeds`` is unset, and
    ``Exception`` when ``write_query_queue`` is enabled without
    ``amqp_host`` or without ``use_query_cache``.
    """
    # heavy load mode is read only mode with a different infobar
    if self.heavy_load_mode:
        self.read_only_mode = True

    if hasattr(signal, 'SIGUSR1'):
        # not all platforms have user signals
        signal.signal(signal.SIGUSR1, thread_dump)

    # initialize caches. Any cache-chains built here must be added
    # to cache_chains (closed around by reset_caches) so that they
    # can properly reset their local components
    if self.running_as_script:
        localcache_cls = SelfEmptyingCache
    else:
        localcache_cls = LocalCache
    client_count = self.num_mc_clients

    self.cache_chains = {}

    # for now, zookeeper will be an optional part of the stack.
    # if it's not configured, we will grab the expected config from the
    # [live_config] section of the ini file
    zk_connection = self.config.get("zookeeper_connection_string")
    if zk_connection:
        from r2.lib.zookeeper import (connect_to_zookeeper,
                                      LiveConfig, LiveList)
        zk_credentials = (self.config["zookeeper_username"],
                          self.config["zookeeper_password"])
        self.zookeeper = connect_to_zookeeper(zk_connection, zk_credentials)
        self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE)
        self.throttles = LiveList(self.zookeeper, "/throttles",
                                  map_fn=ipaddress.ip_network,
                                  reduce_fn=ipaddress.collapse_addresses)
    else:
        self.zookeeper = None
        ini_parser = ConfigParser.RawConfigParser()
        ini_parser.read([self.config["__file__"]])
        self.live_config = extract_live_config(ini_parser, self.plugins)
        self.throttles = tuple()  # immutable since it's not real

    self.memcache = CMemcache(self.memcaches, num_clients=client_count)
    self.lock_cache = CMemcache(self.lockcaches, num_clients=client_count)

    self.stats = Stats(self.config.get('statsd_addr'),
                       self.config.get('statsd_sample_rate'))

    on_before_execute = event.listens_for(engine.Engine,
                                          'before_cursor_execute')
    on_before_execute(self.stats.pg_before_cursor_execute)
    on_after_execute = event.listens_for(engine.Engine,
                                         'after_cursor_execute')
    on_after_execute(self.stats.pg_after_cursor_execute)

    self.make_lock = make_lock_factory(self.lock_cache, self.stats)

    if not self.cassandra_seeds:
        raise ValueError("cassandra_seeds not set in the .ini")

    main_pool = StatsCollectingConnectionPool(
        "reddit",
        stats=self.stats,
        logging_name="main",
        server_list=self.cassandra_seeds,
        pool_size=self.cassandra_pool_size,
        timeout=2,
        max_retries=3,
        prefill=False)
    self.cassandra_pools = {"main": main_pool}

    if self.permacache_memcaches:
        perma_memcache = CMemcache(self.permacache_memcaches,
                                   num_clients=client_count)
    else:
        perma_memcache = None

    # local layer first, then the cassandra layer, as in the nested form
    perma_local = localcache_cls()
    perma_cf = CassandraCache(
        'permacache',
        self.cassandra_pools[self.cassandra_default_pool],
        read_consistency_level=self.cassandra_rcl,
        write_consistency_level=self.cassandra_wcl)
    self.permacache = CassandraCacheChain(
        perma_local,
        perma_cf,
        memcache=perma_memcache,
        lock_factory=self.make_lock)
    self.cache_chains["permacache"] = self.permacache

    # hardcache is done after the db info is loaded, and then the
    # chains are reset to use the appropriate initial entries

    if self.stalecaches:
        self.cache = StaleCacheChain(
            localcache_cls(),
            CMemcache(self.stalecaches, num_clients=client_count),
            self.memcache)
    else:
        self.cache = MemcacheChain((localcache_cls(), self.memcache))
    self.cache_chains["cache"] = self.cache

    render_layers = (
        localcache_cls(),
        CMemcache(self.rendercaches,
                  noreply=True,
                  no_block=True,
                  num_clients=client_count),
    )
    self.rendercache = MemcacheChain(render_layers)
    self.cache_chains["rendercache"] = self.rendercache

    self.thing_cache = CacheChain((localcache_cls(), ))
    self.cache_chains["thing_cache"] = self.thing_cache

    # load the database info
    self.dbm = self.load_db_params()

    # can't do this until load_db_params() has been called
    self.hardcache = HardcacheChain(
        (localcache_cls(), self.memcache, HardCache(self)),
        cache_negative_results=True)
    self.cache_chains["hardcache"] = self.hardcache

    # I know this sucks, but we need non-request-threads to be
    # able to reset the caches, so we need them be able to close
    # around 'cache_chains' without being able to call getattr on
    # 'g'
    chains_snapshot = dict(self.cache_chains)

    def reset_caches():
        for chain_name, chain in chains_snapshot.iteritems():
            chain.reset()
            chain.stats = CacheStats(self.stats, chain_name)

    self.reset_caches = reset_caches
    self.reset_caches()

    # set the modwindow
    self.MODWINDOW = timedelta(self.MODWINDOW)

    self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN'))

    if self.domain_prefix:
        prefix = self.domain_prefix + "."
    else:
        prefix = ""
    self.origin = "http://" + prefix + self.domain
    self.secure_domains = set([urlparse(self.payment_domain).netloc])

    self.trusted_domains = set([self.domain])
    self.trusted_domains.update(self.authorized_cnames)
    if self.https_endpoint:
        endpoint = urlparse(self.https_endpoint)
        self.secure_domains.add(endpoint.netloc)
        self.trusted_domains.add(endpoint.hostname)
    if getattr(self, 'oauth_domain', None):
        self.secure_domains.add(self.oauth_domain)

    # load the unique hashed names of files under static
    names_path = os.path.join(
        os.path.join(self.paths.get('static_files'), 'static'),
        'names.json')
    if os.path.exists(names_path):
        with open(names_path) as names_file:
            self.static_names = json.load(names_file)
    else:
        self.static_names = {}

    # setup the logger
    self.log = logging.getLogger('reddit')
    self.log.addHandler(logging.StreamHandler())
    self.log.setLevel(logging.DEBUG if self.debug else logging.INFO)

    # set log level for pycountry which is chatty
    logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

    if not self.media_domain:
        self.media_domain = self.domain
    if self.media_domain == self.domain:
        warning = ("Warning: g.media_domain == g.domain. " +
                   "This may give untrusted content access to user cookies")
        print(warning)

    self.reddit_host = socket.gethostname()
    self.reddit_pid = os.getpid()

    # "key=value" command line arguments override same-named attributes
    for arg in sys.argv:
        pieces = arg.split("=")
        if len(pieces) != 2:
            continue
        key, value = pieces
        self.log.debug("Overriding g.%s to %s" % (key, value))
        setattr(self, key, value)

    # if we're going to use the query_queue, we need amqp
    if self.write_query_queue and not self.amqp_host:
        raise Exception("amqp_host must be defined to use the query queue")

    # This requirement doesn't *have* to be a requirement, but there are
    # bugs at the moment that will pop up if you violate it
    if self.write_query_queue and not self.use_query_cache:
        raise Exception("write_query_queue requires use_query_cache")

    # try to set the source control revision numbers
    self.versions = {}
    repo_root = os.path.dirname(os.path.dirname(self.paths["root"]))
    self.short_version = self.record_repo_version(
        "r2", os.path.join(repo_root, ".git"))

    if I18N_PATH:
        i18n_gitdir = os.path.join(os.path.dirname(I18N_PATH), ".git")
        self.record_repo_version("i18n", i18n_gitdir)

    if self.log_start:
        started = (self.reddit_host, self.reddit_pid,
                   self.short_version, datetime.now())
        self.log.error("reddit app %s:%s started %s at %s" % started)
def __init__(self, global_conf, app_conf, paths, **extra):
    """
    Globals acts as a container for objects available throughout
    the life of the application.

    One instance of Globals is created by Pylons during
    application initialization and is available during requests
    via the 'g' variable.

    ``global_conf``
        The same variable used throughout ``config/middleware.py``
        namely, the variables from the ``[DEFAULT]`` section of the
        configuration file.

    ``app_conf``
        The same ``kw`` dictionary used throughout
        ``config/middleware.py`` namely, the variables from the
        section in the config file for your application.

    ``paths``
        Mapping of path names; ``paths.get('static_files')`` is used
        to locate the static md5 files and the default stylesheet.

    ``extra``
        The configuration returned from ``load_config`` in
        ``config/middleware.py`` which may be of use in the setup of
        your global variables.

    Raises ``ValueError`` for an unknown choice option or when
    ``cassandra_seeds`` is unset, and ``Exception`` when
    ``write_query_queue`` is enabled without ``amqp_host`` or without
    ``use_query_cache``.
    """

    # slop over all variables to start with
    for k, v in global_conf.iteritems():
        if not k.startswith("_") and not hasattr(self, k):
            if k in self.int_props:
                v = int(v)
            elif k in self.float_props:
                v = float(v)
            elif k in self.bool_props:
                v = self.to_bool(v)
            elif k in self.tuple_props:
                v = tuple(self.to_iter(v))
            elif k in self.choice_props:
                if v not in self.choice_props[k]:
                    raise ValueError(
                        "Unknown option for %r: %r not in %r"
                        % (k, v, self.choice_props[k]))
                v = self.choice_props[k][v]
            setattr(self, k, v)

    self.running_as_script = global_conf.get('running_as_script', False)

    if hasattr(signal, 'SIGUSR1'):
        # not all platforms have user signals
        signal.signal(signal.SIGUSR1, thread_dump)

    # initialize caches. Any cache-chains built here must be added
    # to cache_chains (closed around by reset_caches) so that they
    # can properly reset their local components

    localcache_cls = (SelfEmptyingCache if self.running_as_script
                      else LocalCache)
    num_mc_clients = self.num_mc_clients

    self.cache_chains = []

    self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients)
    self.make_lock = make_lock_factory(self.memcache)

    if not self.cassandra_seeds:
        raise ValueError("cassandra_seeds not set in the .ini")
    self.cassandra = PycassaConnectionPool(
        'reddit',
        server_list=self.cassandra_seeds,
        pool_size=len(self.cassandra_seeds),
        # TODO: .ini setting
        timeout=15, max_retries=3,
        prefill=False)
    perma_memcache = (CMemcache(self.permacache_memcaches,
                                num_clients=num_mc_clients)
                      if self.permacache_memcaches
                      else None)
    self.permacache = CassandraCacheChain(
        localcache_cls(),
        CassandraCache('permacache',
                       self.cassandra,
                       read_consistency_level=self.cassandra_rcl,
                       write_consistency_level=self.cassandra_wcl),
        memcache=perma_memcache,
        lock_factory=self.make_lock)

    self.cache_chains.append(self.permacache)

    # hardcache is done after the db info is loaded, and then the
    # chains are reset to use the appropriate initial entries

    if self.stalecaches:
        self.cache = StaleCacheChain(
            localcache_cls(),
            CMemcache(self.stalecaches, num_clients=num_mc_clients),
            self.memcache)
    else:
        self.cache = MemcacheChain((localcache_cls(), self.memcache))
    self.cache_chains.append(self.cache)

    self.rendercache = MemcacheChain(
        (localcache_cls(),
         CMemcache(self.rendercaches,
                   noreply=True, no_block=True,
                   num_clients=num_mc_clients)))
    self.cache_chains.append(self.rendercache)

    self.servicecache = MemcacheChain(
        (localcache_cls(),
         CMemcache(self.servicecaches, num_clients=num_mc_clients)))
    self.cache_chains.append(self.servicecache)

    self.thing_cache = CacheChain((localcache_cls(), ))
    self.cache_chains.append(self.thing_cache)

    # the display timezone falls back to the main timezone
    # NOTE(review): no default is applied when 'timezone' itself is
    # missing, so ``tz`` may be None here -- confirm the ini always sets it.
    tz = global_conf.get('timezone')
    dtz = global_conf.get('display_timezone', tz)

    self.tz = pytz.timezone(tz)
    self.display_tz = pytz.timezone(dtz)

    # load the database info
    self.dbm = self.load_db_params(global_conf)

    # can't do this until load_db_params() has been called
    self.hardcache = HardcacheChain(
        (localcache_cls(), self.memcache, HardCache(self)),
        cache_negative_results=True)
    self.cache_chains.append(self.hardcache)

    # I know this sucks, but we need non-request-threads to be
    # able to reset the caches, so we need them be able to close
    # around 'cache_chains' without being able to call getattr on
    # 'g'
    cache_chains = self.cache_chains[::]

    def reset_caches():
        for chain in cache_chains:
            chain.reset()

    self.reset_caches = reset_caches
    self.reset_caches()

    # make a query cache
    self.stats_collector = QueryStats()

    # set the modwindow
    self.MODWINDOW = timedelta(self.MODWINDOW)

    self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN'))

    # turn on for language support
    self.languages, self.lang_name = \
        get_active_langs(default_lang=self.lang)

    self.all_languages = sorted(self.lang_name.keys())

    self.paths = paths

    # load the md5 hashes of files under static
    static_files = os.path.join(paths.get('static_files'), 'static')
    self.static_md5 = {}
    if os.path.exists(static_files):
        for f in os.listdir(static_files):
            if f.endswith('.md5'):
                key = f[0:-4]
                f = os.path.join(static_files, f)
                with open(f, 'r') as handle:
                    md5 = handle.read().strip('\n')
                self.static_md5[key] = md5

    # set up the logging directory: create it if needed and remove any
    # files left over under this process' identifier
    log_path = self.log_path
    process_iden = global_conf.get('scgi_port', 'default')
    self.reddit_port = process_iden
    if log_path:
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        for fname in os.listdir(log_path):
            if fname.startswith(process_iden):
                full_name = os.path.join(log_path, fname)
                os.remove(full_name)

    # setup the logger
    self.log = logging.getLogger('reddit')
    self.log.addHandler(logging.StreamHandler())
    if self.debug:
        self.log.setLevel(logging.DEBUG)
    else:
        self.log.setLevel(logging.INFO)

    # set log level for pycountry which is chatty
    logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

    if not self.media_domain:
        self.media_domain = self.domain
    if self.media_domain == self.domain:
        print("Warning: g.media_domain == g.domain. " +
              "This may give untrusted content access to user cookies")

    # read in our CSS so that it can become a default for subreddit
    # stylesheets
    stylesheet_path = os.path.join(paths.get('static_files'),
                                   self.static_path.lstrip('/'),
                                   self.stylesheet)
    with open(stylesheet_path) as s:
        self.default_stylesheet = s.read()

    self.profanities = None
    if self.profanity_wordlist and os.path.exists(self.profanity_wordlist):
        with open(self.profanity_wordlist, 'r') as handle:
            stripped = (line.strip(' \n\r') for line in handle)
            # skip blank lines: an empty alternative in the pattern
            # below would match at every word boundary
            words = [word for word in stripped if word]
        if words:
            self.profanities = re.compile(
                r"\b(%s)\b" % '|'.join(words),
                re.I | re.U)

    self.reddit_host = socket.gethostname()
    self.reddit_pid = os.getpid()

    # any "key=value" command line argument overrides the attribute of
    # the same name (the value stays a string)
    for arg in sys.argv:
        tokens = arg.split("=")
        if len(tokens) == 2:
            k, v = tokens
            self.log.debug("Overriding g.%s to %s" % (k, v))
            setattr(self, k, v)

    # the shutdown toggle
    self.shutdown = False

    # if we're going to use the query_queue, we need amqp
    if self.write_query_queue and not self.amqp_host:
        raise Exception("amqp_host must be defined to use the query queue")

    # This requirement doesn't *have* to be a requirement, but there are
    # bugs at the moment that will pop up if you violate it
    if self.write_query_queue and not self.use_query_cache:
        raise Exception("write_query_queue requires use_query_cache")

    # try to set the source control revision number
    try:
        popen = subprocess.Popen(
            ["git", "log", "--date=short",
             "--pretty=format:%H %h", '-n1'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        resp, stderrdata = popen.communicate()
        resp = resp.strip().split(' ')
        self.version, self.short_version = resp
    except (OSError, ValueError) as e:
        # OSError: git could not be executed.  ValueError: the output
        # did not split into exactly "<hash> <short hash>".  The former
        # ``except object`` clause never matched a raised exception, so
        # either failure aborted start-up instead of falling back here.
        self.log.info("Couldn't read source revision (%r)" % e)
        self.version = self.short_version = '(unknown)'
def setup(self, global_conf):
    """Finish initializing the globals object once config values are set.

    In order, this wires up the SIGUSR1 handler (where available), the
    stats client, the Cassandra connection pools, every cache chain
    (registered by name in ``self.cache_chains``), the database
    parameters, the hardcache, ``self.reset_caches``, origin/trusted
    domain sets, static file names, logging, ``key=value`` overrides from
    ``sys.argv`` and finally the source revision.

    global_conf -- mapping of global config values; read here for
                   'statsd_addr' and 'statsd_sample_rate' and passed on
                   to ``self.load_db_params``.

    Raises ValueError if ``cassandra_seeds`` is empty, and Exception if
    ``write_query_queue`` is enabled without ``amqp_host`` or without
    ``use_query_cache``.
    """
    # heavy load mode is read only mode with a different infobar
    if self.heavy_load_mode:
        self.read_only_mode = True

    if hasattr(signal, 'SIGUSR1'):
        # not all platforms have user signals
        signal.signal(signal.SIGUSR1, thread_dump)

    # initialize caches. Any cache-chains built here must be added
    # to cache_chains (closed around by reset_caches) so that they
    # can properly reset their local components
    localcache_cls = (SelfEmptyingCache if self.running_as_script
                      else LocalCache)
    num_mc_clients = self.num_mc_clients

    self.cache_chains = {}

    self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients)
    self.make_lock = make_lock_factory(self.memcache)

    self.stats = Stats(global_conf.get('statsd_addr'),
                       global_conf.get('statsd_sample_rate'))

    if not self.cassandra_seeds:
        raise ValueError("cassandra_seeds not set in the .ini")

    keyspace = "reddit"
    self.cassandra_pools = {
        "main": StatsCollectingConnectionPool(
            keyspace,
            stats=self.stats,
            logging_name="main",
            server_list=self.cassandra_seeds,
            pool_size=len(self.cassandra_seeds),
            timeout=2,
            max_retries=3,
            prefill=False),
        "noretries": StatsCollectingConnectionPool(
            keyspace,
            stats=self.stats,
            logging_name="noretries",
            server_list=self.cassandra_seeds,
            pool_size=len(self.cassandra_seeds),
            timeout=.1,
            max_retries=0,
            prefill=False),
    }

    # the permacache only gets a memcache layer when one is configured
    perma_memcache = (CMemcache(self.permacache_memcaches,
                                num_clients=num_mc_clients)
                      if self.permacache_memcaches
                      else None)
    self.permacache = CassandraCacheChain(
        localcache_cls(),
        CassandraCache('permacache',
                       self.cassandra_pools[self.cassandra_default_pool],
                       read_consistency_level=self.cassandra_rcl,
                       write_consistency_level=self.cassandra_wcl),
        memcache=perma_memcache,
        lock_factory=self.make_lock)
    self.cache_chains.update(permacache=self.permacache)

    # hardcache is done after the db info is loaded, and then the
    # chains are reset to use the appropriate initial entries

    if self.stalecaches:
        self.cache = StaleCacheChain(
            localcache_cls(),
            CMemcache(self.stalecaches, num_clients=num_mc_clients),
            self.memcache)
    else:
        self.cache = MemcacheChain((localcache_cls(), self.memcache))
    self.cache_chains.update(cache=self.cache)

    self.rendercache = MemcacheChain(
        (localcache_cls(),
         CMemcache(self.rendercaches,
                   noreply=True,
                   no_block=True,
                   num_clients=num_mc_clients)))
    self.cache_chains.update(rendercache=self.rendercache)

    self.servicecache = MemcacheChain(
        (localcache_cls(),
         CMemcache(self.servicecaches, num_clients=num_mc_clients)))
    self.cache_chains.update(servicecache=self.servicecache)

    self.thing_cache = CacheChain((localcache_cls(), ))
    self.cache_chains.update(thing_cache=self.thing_cache)

    # load the database info
    self.dbm = self.load_db_params(global_conf)

    # can't do this until load_db_params() has been called
    self.hardcache = HardcacheChain(
        (localcache_cls(), self.memcache, HardCache(self)),
        cache_negative_results=True)
    self.cache_chains.update(hardcache=self.hardcache)

    # I know this sucks, but we need non-request-threads to be
    # able to reset the caches, so we need them be able to close
    # around 'cache_chains' without being able to call getattr on
    # 'g'
    cache_chains = self.cache_chains.copy()

    def reset_caches():
        # reset every registered chain and give it a fresh stats
        # collector labelled with the chain's registered name
        for name, chain in cache_chains.iteritems():
            chain.reset()
            chain.stats = CacheStats(self.stats, name)

    self.reset_caches = reset_caches
    self.reset_caches()

    # make a query cache
    self.stats_collector = QueryStats()

    # set the modwindow
    self.MODWINDOW = timedelta(self.MODWINDOW)

    self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN'))

    origin_prefix = self.domain_prefix + "." if self.domain_prefix else ""
    self.origin = "http://" + origin_prefix + self.domain
    self.secure_domains = set([urlparse(self.payment_domain).netloc])

    self.trusted_domains = set([self.domain])
    self.trusted_domains.update(self.authorized_cnames)
    if self.https_endpoint:
        https_url = urlparse(self.https_endpoint)
        self.secure_domains.add(https_url.netloc)
        self.trusted_domains.add(https_url.hostname)

    # load the unique hashed names of files under static
    static_files = os.path.join(self.paths.get('static_files'), 'static')
    names_file_path = os.path.join(static_files, 'names.json')
    if os.path.exists(names_file_path):
        with open(names_file_path) as handle:
            self.static_names = json.load(handle)
    else:
        self.static_names = {}

    # setup the logger
    self.log = logging.getLogger('reddit')
    self.log.addHandler(logging.StreamHandler())
    if self.debug:
        self.log.setLevel(logging.DEBUG)
    else:
        self.log.setLevel(logging.INFO)

    # set log level for pycountry which is chatty
    logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

    if not self.media_domain:
        self.media_domain = self.domain
    if self.media_domain == self.domain:
        print("Warning: g.media_domain == g.domain. " +
              "This may give untrusted content access to user cookies")

    self.reddit_host = socket.gethostname()
    self.reddit_pid = os.getpid()

    # any command line argument of the exact form key=value overrides
    # the attribute of the same name (the value stays a string)
    for arg in sys.argv:
        tokens = arg.split("=")
        if len(tokens) == 2:
            k, v = tokens
            self.log.debug("Overriding g.%s to %s" % (k, v))
            setattr(self, k, v)

    # the shutdown toggle
    self.shutdown = False

    # if we're going to use the query_queue, we need amqp
    if self.write_query_queue and not self.amqp_host:
        raise Exception("amqp_host must be defined to use the query queue")

    # This requirement doesn't *have* to be a requirement, but there are
    # bugs at the moment that will pop up if you violate it
    if self.write_query_queue and not self.use_query_cache:
        raise Exception("write_query_queue requires use_query_cache")

    # try to set the source control revision number. This is best
    # effort: a failing git command (CalledProcessError) or a missing
    # git executable (OSError) must not prevent startup.
    try:
        revision = subprocess.check_output(["git", "rev-parse", "HEAD"])
    except (subprocess.CalledProcessError, OSError) as e:
        self.log.info("Couldn't read source revision (%r)" % e)
        self.version = self.short_version = '(unknown)'
    else:
        # check_output includes git's trailing newline; drop it, and
        # always define short_version so both attributes exist on
        # success as well as on failure
        self.version = revision.strip()
        self.short_version = self.version[:7]