def s3_list_downloads(raven_client):
    files = {"full": [], "diff1": [], "diff2": []}

    if not settings("asset_bucket"):
        return files

    asset_url = settings("asset_url")
    if not asset_url.endswith("/"):
        asset_url = asset_url + "/"

    diff = []
    full = []
    try:
        s3 = boto3.resource("s3")
        bucket = s3.Bucket(settings("asset_bucket"))
        for obj in bucket.objects.filter(Prefix="export/"):
            name = obj.key.split("/")[-1]
            path = urlparse.urljoin(asset_url, obj.key)
            # round to kilobyte
            size = int(round(obj.size / 1024.0, 0))
            file = dict(name=name, path=path, size=size)
            if "diff-" in name:
                diff.append(file)
            elif "full-" in name:
                full.append(file)
    except (Boto3Error, BotoCoreError):
        raven_client.captureException()
        return files

    half = len(diff) // 2 + len(diff) % 2
    diff = list(sorted(diff, key=itemgetter("name"), reverse=True))
    files["diff1"] = diff[:half]
    files["diff2"] = diff[half:]
    files["full"] = list(sorted(full, key=itemgetter("name"), reverse=True))
    return files
def map_view(self):
    map_tiles_url = get_map_tiles_url(settings("asset_url"))
    return {
        "page_title": "Map",
        "map_enabled": self.is_map_enabled(),
        "map_tiles_url": map_tiles_url,
        "map_token": settings("mapbox_token"),
    }
def homepage_view(self):
    map_tiles_url = get_map_tiles_url(settings("asset_url"))
    image_base_url = HOMEPAGE_MAP_IMAGE.format(token=settings("mapbox_token"))
    image_url = map_tiles_url.format(z=0, x=0, y="0@2x")
    return {
        "page_title": "Overview",
        "map_enabled": self.is_map_enabled(),
        "map_image_base_url": image_base_url,
        "map_image_url": image_url,
    }
def configure_redis(cache_url=None, _client=None):
    """
    Configure and return a :class:`~ichnaea.cache.RedisClient` instance.

    :param _client: Test-only hook to provide a pre-configured client.
    """
    cache_url = settings("redis_uri") if cache_url is None else cache_url

    if _client is not None:
        return _client

    url = urlparse(cache_url)
    netloc = url.netloc.split(":")
    host = netloc[0]
    if len(netloc) > 1:
        port = int(netloc[1])
    else:
        port = 6379
    if len(url.path) > 1:
        db = int(url.path[1:])
    else:
        db = 0
    pool = redis.ConnectionPool(
        max_connections=20,
        host=host,
        port=port,
        db=db,
        socket_timeout=30.0,
        socket_connect_timeout=60.0,
        socket_keepalive=True,
    )
    return RedisClient(connection_pool=pool)
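
# Illustrative sketch, not part of the original module: how the URI parsing in
# configure_redis() above splits a Redis URI into host, port and database
# number, with fallbacks of port 6379 and db 0. The helper name
# _parse_redis_uri is hypothetical.
from urllib.parse import urlparse


def _parse_redis_uri(cache_url):
    url = urlparse(cache_url)
    netloc = url.netloc.split(":")
    host = netloc[0]
    port = int(netloc[1]) if len(netloc) > 1 else 6379
    db = int(url.path[1:]) if len(url.path) > 1 else 0
    return host, port, db


assert _parse_redis_uri("redis://localhost:6379/0") == ("localhost", 6379, 0)
assert _parse_redis_uri("redis://cache") == ("cache", 6379, 0)
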
def __call__(self, hourly=True, _bucket=None):
    if _bucket is None:
        bucket = settings("asset_bucket")
    else:
        bucket = _bucket

    if not bucket:
        return

    now = util.utcnow()
    today = now.date()

    start_time = None
    end_time = None

    if hourly:
        end_time = now.replace(minute=0, second=0)
        file_time = end_time
        file_type = "diff"
        start_time = end_time - timedelta(hours=1)
    else:
        file_time = now.replace(hour=0, minute=0, second=0)
        file_type = "full"

    filename = "MLS-%s-cell-export-" % file_type
    filename = filename + file_time.strftime("%Y-%m-%dT%H0000.csv.gz")

    with util.selfdestruct_tempdir() as temp_dir:
        path = os.path.join(temp_dir, filename)
        with self.task.db_session(commit=False) as session:
            write_stations_to_csv(
                session, path, today, start_time=start_time, end_time=end_time
            )
        self.write_stations_to_s3(path, bucket)
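
# Illustrative sketch, not part of the original module: how the export
# filename above is derived for the hourly ("diff") and daily ("full")
# cases. The helper name _export_filename and the example timestamps are
# hypothetical.
from datetime import datetime


def _export_filename(now, hourly=True):
    if hourly:
        file_time = now.replace(minute=0, second=0)
        file_type = "diff"
    else:
        file_time = now.replace(hour=0, minute=0, second=0)
        file_type = "full"
    return "MLS-%s-cell-export-" % file_type + file_time.strftime(
        "%Y-%m-%dT%H0000.csv.gz"
    )


assert _export_filename(datetime(2019, 3, 7, 14, 25), hourly=True) == (
    "MLS-diff-cell-export-2019-03-07T140000.csv.gz"
)
assert _export_filename(datetime(2019, 3, 7, 14, 25), hourly=False) == (
    "MLS-full-cell-export-2019-03-07T000000.csv.gz"
)
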
def main(argv, _db=None):
    parser = argparse.ArgumentParser(
        prog=argv[0],
        description=(
            "Import from public cell data into a local dev environment. "
            "See https://location.services.mozilla.com/downloads"
        ),
    )
    parser.add_argument("filename", help="Path to the csv.gz import file.")

    args = parser.parse_args(argv[1:])

    if not settings("local_dev_env"):
        print("This script can only be run in a local dev environment.")
        print("Set LOCAL_DEV_ENV=True in your environment.")
        return 1

    filename = os.path.abspath(os.path.expanduser(args.filename))
    if not os.path.isfile(filename):
        print("File %s not found." % filename)
        return 1

    configure_logging()

    celery_app = get_eager_celery_app()
    init_worker(celery_app)
    cellarea_queue = celery_app.data_queues["update_cellarea"]

    with db_worker_session(celery_app.db, commit=False) as session:
        with gzip_open(filename, "r") as file_handle:
            read_stations_from_csv(
                session, file_handle, celery_app.redis_client, cellarea_queue
            )

    return 0
def is_map_enabled(self):
    """Return whether maps are enabled.

    Maps are enabled if and only if a Mapbox token is configured;
    otherwise they are disabled.
    """
    return bool(settings("mapbox_token"))
def main(argv, _raven_client=None, _bucketname=None):
    # run for example via:
    # bin/location_map --create --upload \
    #   --output=ichnaea/content/static/tiles/

    parser = argparse.ArgumentParser(
        prog=argv[0], description="Generate and upload datamap tiles."
    )
    parser.add_argument("--create", action="store_true", help="Create tiles?")
    parser.add_argument("--upload", action="store_true", help="Upload tiles to S3?")
    parser.add_argument(
        "--concurrency", default=2, help="How many concurrent processes to use?"
    )
    parser.add_argument("--output", help="Optional directory for output files.")

    args = parser.parse_args(argv[1:])

    if args.create:
        raven_client = configure_raven(
            transport="sync", tags={"app": "datamap"}, _client=_raven_client
        )

        configure_stats()

        bucketname = _bucketname
        if not _bucketname:
            bucketname = settings("asset_bucket")
        if bucketname:
            bucketname = bucketname.strip("/")

        upload = False
        if args.upload:
            upload = bool(args.upload)

        concurrency = billiard.cpu_count()
        if args.concurrency:
            concurrency = int(args.concurrency)

        output = None
        if args.output:
            output = os.path.abspath(args.output)

        try:
            with METRICS.timer("datamaps", tags=["func:main"]):
                generate(
                    bucketname,
                    raven_client,
                    upload=upload,
                    concurrency=concurrency,
                    output=output,
                )
        except Exception:
            raven_client.captureException()
            raise
    else:
        parser.print_help()
def security_headers(event):
    response = event.response
    # Headers for all responses.
    response.headers.add(
        "Strict-Transport-Security", "max-age=31536000; includeSubDomains"
    )
    response.headers.add("X-Content-Type-Options", "nosniff")
    # Headers for HTML responses.
    if response.content_type == "text/html":
        response.headers.add(
            "Content-Security-Policy", get_csp_policy(settings("asset_url"))
        )
        response.headers.add("X-Frame-Options", "DENY")
        response.headers.add("X-XSS-Protection", "1; mode=block")
def configure_stats():
    """Configure Markus for metrics."""
    local_dev_env = settings("local_dev_env")
    if local_dev_env:
        markus.configure(
            backends=[{"class": "markus.backends.logging.LoggingMetrics"}]
        )
        return

    if settings("statsd_host"):
        markus.configure(
            backends=[
                {
                    "class": "markus.backends.datadog.DatadogMetrics",
                    "options": {
                        "statsd_host": settings("statsd_host"),
                        "statsd_port": settings("statsd_port"),
                        "statsd_namespace": "location",
                    },
                }
            ]
        )
    else:
        logging.getLogger(__name__).warning(
            "STATSD_HOST not set; no statsd configured"
        )
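
# Illustrative usage sketch, not part of the original module: once
# configure_stats() has run, any module can request a markus metrics client
# and emit counters and timers; with LOCAL_DEV_ENV set they are simply
# written to the log via LoggingMetrics. The metric names below are examples
# only.
import markus

metrics = markus.get_metrics(__name__)
metrics.incr("example.request", tags=["method:get"])
metrics.timing("example.timing", 12)
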
def generate_signature(reason, *parts):
    """
    Generate a salted signature for a set of strings.

    :arg reason: A short "why" string used to salt the hash
    :arg parts: A list of strings to add to the signature
    """
    siggen = sha512()
    for part in parts:
        if part:
            siggen.update(part.encode())
    siggen.update(reason.encode())
    siggen.update(settings("secret_key").encode())
    return siggen.hexdigest()
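
# Illustrative sketch, not part of the original module: generate_signature()
# hashes each non-empty part, then the "reason" salt, then the configured
# secret key. The helper below mirrors that logic with an explicit secret so
# it can run stand-alone; _signature_with_secret and the example values are
# hypothetical.
from hashlib import sha512


def _signature_with_secret(secret, reason, *parts):
    siggen = sha512()
    for part in parts:
        if part:
            siggen.update(part.encode())
    siggen.update(reason.encode())
    siggen.update(secret.encode())
    return siggen.hexdigest()


# The same parts salted with a different reason yield a different digest.
assert _signature_with_secret("s3cret", "fallback", "203.0.113.1") != (
    _signature_with_secret("s3cret", "submit", "203.0.113.1")
)
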
def __call__(self, *args, **kw):
    """
    Execute the task, capture a statsd timer for the task duration and
    automatically report exceptions into Sentry.
    """
    with METRICS.timer("task", tags=["task:" + self.shortname()]):
        try:
            result = super(BaseTask, self).__call__(*args, **kw)
        except Exception as exc:
            self.raven_client.captureException()
            if self._auto_retry and not settings("testing"):
                raise self.retry(exc=exc)
            raise
    return result
def configure_geoip(filename=None, mode=MODE_AUTO, raven_client=None, _client=None):
    """
    Configure and return a :class:`~ichnaea.geoip.GeoIPWrapper` instance.

    If no geoip database file of the correct type can be found, return
    a :class:`~ichnaea.geoip.GeoIPNull` dummy implementation instead.

    :param raven_client: A configured raven/sentry client.
    :type raven_client: :class:`raven.base.Client`

    :param _client: Test-only hook to provide a pre-configured client.
    """
    filename = settings("geoip_path") if filename is None else filename

    if _client is not None:
        return _client

    if not filename:
        # No DB file specified in the config
        if raven_client is not None:
            try:
                raise OSError("No geoip filename specified.")
            except OSError:
                raven_client.captureException()
        LOGGER.info("Returning GeoIPNull.")
        return GeoIPNull()

    try:
        db = GeoIPWrapper(filename, mode=mode)
        if not db.check_extension() and raven_client is not None:
            try:
                raise RuntimeError("Maxmind C extension not installed.")
            except RuntimeError:
                raven_client.captureException()
        # Actually initialize the memory cache, by doing one fake look-up
        db.lookup("127.0.0.1")
    except (InvalidDatabaseError, IOError, OSError, ValueError):
        # Error opening the database file, maybe it doesn't exist
        if raven_client is not None:
            raven_client.captureException()
        LOGGER.info("Returning GeoIPNull.")
        return GeoIPNull()

    LOGGER.info("GeoIP configured.")
    return db
def cmd_clitest(ctx):
    """Run Sentry test through cli."""
    sentry_dsn = settings("sentry_dsn")

    if not sentry_dsn:
        click.echo(
            click.style(
                "SENTRY_DSN is not configured so this will use DebugRavenClient.",
                fg="green",
            )
        )

    msg = "Testing Sentry configuration via cli (%s)" % str(datetime.datetime.now())
    click.echo(click.style("Using message: %s" % msg, fg="green"))

    click.echo(click.style("Building Raven client...", fg="green"))
    client = configure_raven(transport="sync", tags={"app": "sentry_test"})

    click.echo(click.style("Sending message...", fg="green"))
    client.captureMessage(msg)
def apply(self, *args, **kw):
    """
    This method is only used when calling tasks directly and blocking
    on them. It's also used if always_eager is set, like in tests.

    If always_eager is set, we feed the task arguments through the
    de/serialization process to make sure the arguments can indeed
    be serialized into JSON.
    """
    if settings("testing"):
        # We do the extra check to make sure this was really used from
        # inside tests
        serializer = self.app.conf.task_serializer
        content_type, encoding, data = kombu_dumps(args, serializer)
        args = kombu_loads(data, content_type, encoding)

    return super(BaseTask, self).apply(*args, **kw)
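
# Illustrative sketch, not part of the original module: the eager-mode check
# in apply() round-trips the task arguments through kombu's serialization so
# that arguments which are not JSON-serializable fail loudly in tests. The
# example payload below is made up.
from kombu.serialization import dumps as kombu_dumps, loads as kombu_loads

content_type, encoding, data = kombu_dumps(({"mcc": 262, "mnc": 1},), "json")
args = kombu_loads(data, content_type, encoding)
assert args == [{"mcc": 262, "mnc": 1}]
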
def configure_raven(transport=None, _client=None):
    """Configure and return a :class:`raven.Client` instance.

    :param transport: The transport to use, one of the
        :data:`RAVEN_TRANSPORTS` keys.
    :param _client: Test-only hook to provide a pre-configured client.
    """
    if _client is not None:
        return _client

    transport = RAVEN_TRANSPORTS.get(transport)
    if not transport:
        raise ValueError("No valid raven transport was configured.")

    dsn = settings("sentry_dsn")
    klass = DebugRavenClient if not dsn else RavenClient
    info = version_info()
    release = info.get("version") or info.get("commit") or "unknown"
    client = klass(dsn=dsn, transport=transport, release=release)
    return client
def map_json(self):
    map_tiles_url = get_map_tiles_url(settings("asset_url"))
    offset = map_tiles_url.find(TILES_PATTERN)
    base_url = map_tiles_url[:offset]
    return {"tiles_url": base_url}
def main(_argv=None, _raven_client=None, _bucket_name=None): """ Command-line entry point. :param _argv: Simulated sys.argv[1:] arguments for testing :param _raven_client: override Raven client for testing :param _bucket_name: override S3 bucket name for testing :return: A system exit code :rtype: int """ # Parse the command line parser = get_parser() args = parser.parse_args(_argv) create = args.create upload = args.upload concurrency = args.concurrency verbose = args.verbose # Setup basic services if verbose: configure_logging(local_dev_env=True, logging_level="DEBUG") else: configure_logging() raven_client = configure_raven( transport="sync", tags={"app": "datamap"}, _client=_raven_client ) # Check consistent output_dir, create, upload exit_early = 0 output_dir = None if args.output: output_dir = os.path.abspath(args.output) tiles_dir = os.path.join(output_dir, "tiles") if not create and not os.path.isdir(tiles_dir): LOG.error( "The tiles subfolder of the --output directory should already" " exist when calling --upload without --create, to avoid" " deleting files from the S3 bucket.", tiles_dir=tiles_dir, ) exit_early = 1 else: if create and not upload: LOG.error( "The --output argument is required with --create but without" " --upload, since the temporary folder is removed at exit." ) exit_early = 1 if upload and not create: LOG.error( "The --output argument is required with --upload but without" " --create, to avoid deleting all tiles in the S3 bucket." ) exit_early = 1 # Exit early with help message if error or nothing to do if exit_early or not (create or upload): parser.print_help() return exit_early # Determine the S3 bucket name bucket_name = _bucket_name if not _bucket_name: bucket_name = settings("asset_bucket") if bucket_name: bucket_name = bucket_name.strip("/") # Check that the implied credentials are authorized to use the bucket if upload: if not bucket_name: LOG.error("Unable to determine upload bucket_name.") return 1 else: works, fail_msg = check_bucket(bucket_name) if not works: LOG.error( f"Bucket {bucket_name} can not be used for uploads: {fail_msg}" ) return 1 # Generate and upload the tiles success = True interrupted = False result = {} try: with Timer() as timer: if output_dir: result = generate( output_dir, bucket_name, raven_client, create=create, upload=upload, concurrency=concurrency, ) else: with util.selfdestruct_tempdir() as temp_dir: result = generate( temp_dir, bucket_name, raven_client, create=create, upload=upload, concurrency=concurrency, ) except KeyboardInterrupt: interrupted = True success = False except Exception: raven_client.captureException() success = False raise finally: if create and upload: task = "generation and upload" elif create: task = "generation" else: task = "upload" if interrupted: complete = "interrupted" elif success: complete = "complete" else: complete = "failed" final_log = structlog.get_logger("canonical-log-line") final_log.info( f"Datamap tile {task} {complete} in {timer.duration_s:0.1f} seconds.", success=success, duration_s=timer.duration_s, script_name="ichnaea.scripts.datamap", create=create, upload=upload, concurrency=concurrency, bucket_name=bucket_name, **result, ) return 0
def _map_content_enabled():
    return bool(settings("mapbox_token"))
def configure_logging(): """Configure Python logging.""" local_dev_env = settings("local_dev_env") logging_level = settings("logging_level") if local_dev_env: handlers = ["dev"] # Prepare structlog logs for local dev ProcessorFormatter structlog_fmt_prep = structlog.stdlib.ProcessorFormatter.wrap_for_formatter structlog_dev_processors = [ structlog.stdlib.add_logger_name, structlog.stdlib.add_log_level, structlog.processors.TimeStamper(fmt="iso"), ] else: handlers = ["mozlog"] # Prepare structlog logs for JsonLogFormatter structlog_fmt_prep = structlog.stdlib.render_to_log_kwargs structlog_dev_processors = [] # Processors used for logs generated by structlog and stdlib's logging logging_config = { "version": 1, "disable_existing_loggers": True, "formatters": { "structlog_dev_console": { "()": structlog.stdlib.ProcessorFormatter, "processor": structlog.dev.ConsoleRenderer(colors=True), "foreign_pre_chain": structlog_dev_processors, }, "mozlog_json": { "()": "dockerflow.logging.JsonLogFormatter", "logger_name": "ichnaea", }, }, "handlers": { "dev": { "class": "logging.StreamHandler", "formatter": "structlog_dev_console", "level": "DEBUG", }, "mozlog": { "class": "logging.StreamHandler", "formatter": "mozlog_json", "level": "DEBUG", }, }, "loggers": { "alembic": { "propagate": False, "handlers": handlers, "level": logging_level, }, "celery": { "propagate": False, "handlers": handlers, "level": logging_level, }, "ichnaea": { "propagate": False, "handlers": handlers, "level": logging_level, }, "markus": { "propagate": False, "handlers": handlers, "level": logging_level, }, # https://stripe.com/blog/canonical-log-lines "canonical-log-line": { "propagate": False, "handlers": handlers, "level": logging_level, }, }, "root": { "handlers": handlers, "level": "WARNING" }, } logging.config.dictConfig(logging_config) structlog_processors = ([ structlog.threadlocal.merge_threadlocal, structlog.stdlib.filter_by_level ] + structlog_dev_processors + [ structlog.stdlib.PositionalArgumentsFormatter(), structlog.processors.StackInfoRenderer(), structlog.processors.format_exc_info, structlog.processors.UnicodeDecoder(), structlog_fmt_prep, ]) structlog.configure( context_class=structlog.threadlocal.wrap_dict(dict), processors=structlog_processors, logger_factory=structlog.stdlib.LoggerFactory( ignore_frame_names=["venusian", "pyramid.config"]), wrapper_class=structlog.stdlib.BoundLogger, cache_logger_on_first_use=True, )
def log_tween_factory(handler, registry):
    """A logging tween, handling collection of stats, exceptions, and a request log."""

    local_dev_env = settings("local_dev_env")

    def log_tween(request):
        """Time a request, emit metrics and log results, with exception handling."""
        start = time.time()
        structlog.threadlocal.clear_threadlocal()
        structlog.threadlocal.bind_threadlocal(
            http_method=request.method, http_path=request.path
        )

        # Skip detailed logging and capturing for static assets, either in
        # /static or paths like /robots.txt
        is_static_content = (
            request.path in registry.skip_logging
            or request.path.startswith("/static")
        )

        def record_response(status_code):
            """Time request, (maybe) emit metrics, and (maybe) log this request.

            For static assets, metrics are skipped, and logs are skipped
            unless we're in the development environment.
            """
            duration = time.time() - start

            if not is_static_content:
                # Emit a request.timing and a request metric
                duration_ms = round(duration * 1000)
                # Convert a URI to a statsd-acceptable metric name
                stats_path = (
                    request.path.replace("/", ".").lstrip(".").replace("@", "-")
                )
                # Use generate_tag to lowercase, truncate to 200 characters
                statsd_tags = [
                    # Homepage is ".homepage", would otherwise be empty string / True
                    generate_tag("path", stats_path or ".homepage"),
                    generate_tag("method", request.method),  # GET -> get, POST -> post
                ]
                METRICS.timing("request.timing", duration_ms, tags=statsd_tags)
                METRICS.incr(
                    "request",
                    tags=statsd_tags + [generate_tag("status", str(status_code))],
                )

            if local_dev_env or not is_static_content:
                # Emit a canonical-log-line
                duration_s = round(duration, 3)
                logger = structlog.get_logger("canonical-log-line")
                logger.info(
                    f"{request.method} {request.path} - {status_code}",
                    http_status=status_code,
                    duration_s=duration_s,
                )

        try:
            response = handler(request)
            record_response(response.status_code)
            return response
        except (BaseClientError, HTTPRedirection) as exc:
            # BaseClientError: 4xx errors raised by the Ichnaea API and other Ichnaea code
            # HTTPRedirection: 3xx redirects from Pyramid
            # Log, but do not send these exceptions to Sentry
            record_response(exc.status_code)
            raise
        except HTTPClientError:
            # HTTPClientError: 4xx errors from Pyramid
            # Do not log or send to Sentry
            raise
        except HTTPException as exc:
            # HTTPException: remaining 5xx (or maybe 2xx) errors from Pyramid
            # Log and send to Sentry
            record_response(exc.status_code)
            registry.raven_client.captureException()
            raise
        except Exception:
            # Any other exception: treat as a 500 Internal Server Error,
            # log and send to Sentry
            record_response(500)
            registry.raven_client.captureException()
            raise

    return log_tween
import backoff import markus from pymysql.constants.CLIENT import MULTI_STATEMENTS from pymysql.constants.ER import LOCK_WAIT_TIMEOUT, LOCK_DEADLOCK from pymysql.err import DatabaseError, MySQLError from sqlalchemy import create_engine from sqlalchemy.engine.url import make_url from sqlalchemy.exc import OperationalError, StatementError from sqlalchemy.orm import scoped_session, sessionmaker from sqlalchemy.pool import NullPool, QueuePool from sqlalchemy.sql import func, select from ichnaea.conf import settings DB_TYPE = { "ro": settings("db_readonly_uri"), "rw": settings("db_readwrite_uri") } METRICS = markus.get_metrics() class SqlAlchemyUrlNotSpecified(Exception): """Raised when SQLALCHEMY_URL is not specified in environment.""" def __init__(self, *args, **kwargs): super().__init__("SQLALCHEMY_URL is not specified in the environment") def get_sqlalchemy_url(): """Returns the ``SQLALCHEMY_URL`` environment value. :returns: the sqlalchemy url to be used for alembic migrations
def _cell_export_enabled():
    return bool(settings("asset_bucket"))
def configure_logging(): """Configure Python logging.""" local_dev_env = settings("local_dev_env") logging_level = settings("logging_level") if local_dev_env: handlers = ["console"] else: handlers = ["mozlog"] logging_config = { "version": 1, "disable_existing_loggers": True, "formatters": { "app": { "format": "%(asctime)s %(levelname)-5s [%(name)s] - %(message)s" }, "json": { "()": "dockerflow.logging.JsonLogFormatter", "logger_name": "ichnaea", }, }, "handlers": { "console": { "class": "logging.StreamHandler", "formatter": "app", "level": "DEBUG", }, "mozlog": { "class": "logging.StreamHandler", "formatter": "json", "level": "DEBUG", }, }, "loggers": { "alembic": { "propagate": False, "handlers": handlers, "level": logging_level, }, "celery": { "propagate": False, "handlers": handlers, "level": logging_level, }, "ichnaea": { "propagate": False, "handlers": handlers, "level": logging_level, }, "markus": { "propagate": False, "handlers": handlers, "level": logging_level, }, }, "root": { "handlers": handlers, "level": "WARNING" }, } logging.config.dictConfig(logging_config)