def setUpClass(cls):
    """Set up the database once for the test run."""
    cls.sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
def setup_module():
    global aggregates
    global sc

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings))
    submit_aggregates(aggregates)
def setup_module():
    global aggregates
    global sc

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
def setUpClass(cls):
    """Set up the database once for the test run."""
    clear_db()
    cls.sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
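# Hedged sketch, not part of the original suite: a class that creates its own
# SparkContext in setUpClass typically releases it when the run ends. A matching
# tearDownClass might look like the following; the stop() call is the only
# assumption beyond what setUpClass above already defines.
@classmethod
def tearDownClass(cls):
    """Stop the SparkContext once the test run is complete."""
    cls.sc.stop()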
def test_submit():
    # Multiple submissions should not alter the aggregates in the db
    submit_aggregates(aggregates)
    build_id_count, submission_date_count = submit_aggregates(aggregates)

    n_submission_dates = len(ping_dimensions["submission_date"])
    n_channels = len(ping_dimensions["channel"])
    n_versions = len(ping_dimensions["version"])
    n_build_ids = len(ping_dimensions["build_id"])
    assert build_id_count == n_submission_dates * n_channels * n_versions * n_build_ids
    assert submission_date_count == n_submission_dates * n_channels * n_versions
def test_null_label_character_submit():
    metric_info = ("SIMPLE_MEASURES_NULL_METRIC_LABEL", u"\u0001\u0000\u0000\u0000\u7000\ub82c", False)
    payload = {"sum": 4, "count": 2, "histogram": {2: 2}}
    key = ('20161111', 'nightly', '52', '20161111', 'Firefox', 'arch', 'linux', '42', 'false')
    aggregate = (key, {metric_info: payload})
    aggregates = [sc.parallelize([aggregate]), sc.parallelize([aggregate])]
    build_id_count, submission_date_count = submit_aggregates(aggregates)

    assert build_id_count == 0, "Build id count should be 0, was {}".format(build_id_count)
    assert submission_date_count == 0, "Submission date count should be 0, was {}".format(submission_date_count)
def test_null_arch_character_submit():
    metric_info = ("SIMPLE_MEASURES_NULL_ARCHITECTURE", "", False)
    payload = {"sum": 4, "count": 2, "histogram": {2: 2}}
    key = ('20161111', 'nightly', '52', '20161111', '', "Firefox", u"\x00", 'Windows', '2.4.21')
    aggregate = (key, {metric_info: payload})
    aggregates = [sc.parallelize([aggregate]), sc.parallelize([aggregate])]
    build_id_count, submission_date_count = submit_aggregates(aggregates)

    assert build_id_count == 0, "Build id count should be 0, was {}".format(build_id_count)
    assert submission_date_count == 0, "Submission date count should be 0, was {}".format(submission_date_count)
#!/mnt/anaconda2/bin/ipython

import logging
from os import environ

from pyspark import SparkContext, SparkConf

from mozaggregator.aggregator import aggregate_metrics
from mozaggregator.db import submit_aggregates

conf = SparkConf().setAppName('telemetry-aggregates')
sc = SparkContext(conf=conf)

date = environ['date']
print "Running job for {}".format(date)

aggregates = aggregate_metrics(sc, ("nightly", "beta", "release"), date)
print "Number of build-id aggregates: {}".format(aggregates[0].count())
print "Number of submission date aggregates: {}".format(aggregates[1].count())

submit_aggregates(aggregates)
sc.stop()
def aggregates(sc):
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
    return aggregates
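# Hedged sketch, not from the original module: the aggregates(sc) helper above reads
# like a pytest fixture that depends on a SparkContext fixture named `sc`. One
# plausible definition of that companion fixture, assuming pytest and a local master,
# is shown below.
import pytest
import pyspark


@pytest.fixture(scope="session")
def sc():
    # Create one local SparkContext for the whole test session and stop it afterwards.
    context = pyspark.SparkContext(master="local[*]")
    yield context
    context.stop()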
.get_bucket("telemetry-spark-emr-2") .get_key("aggregator_database_envvars.json") .get_contents_as_string() ) for k, v in creds.items(): environ[k] = v # Attempt a database connection now so we can fail fast if credentials are broken. _preparedb() # Send jobs to the spark workers. package_file = sys.argv[1] print "Adding dependency " + package_file conf = SparkConf().setAppName('telemetry-aggregates') sc = SparkContext(conf=conf) sc.addPyFile(package_file) date = environ['date'] channels = [c.strip() for c in environ['channels'].split(',')] print "Running job for {}".format(date) aggregates = aggregate_metrics(sc, channels, date) print "Number of build-id aggregates: {}".format(aggregates[0].count()) print "Number of submission date aggregates: {}".format(aggregates[1].count()) # Store the results in Postgres. submit_aggregates(aggregates) sc.stop()
def run_aggregator(
    date,
    channels,
    credentials_protocol,
    credentials_bucket,
    credentials_prefix,
    postgres_db,
    postgres_user,
    postgres_pass,
    postgres_host,
    postgres_ro_host,
    num_partitions,
    source,
    project_id,
    dataset_id,
    avro_prefix,
):
    spark = SparkSession.builder.getOrCreate()

    # Mozaggregator expects a series of POSTGRES_* variables in order to connect
    # to a db instance; we may pull them into the environment now by reading an
    # object from a file system.
    def create_path(protocol, bucket, prefix):
        mapping = {"file": "file", "s3": "s3a", "gcs": "gs"}
        return f"{mapping[protocol]}://{bucket}/{prefix}"

    # priority of reading credentials is options > credentials file > environment
    option_credentials = {
        "POSTGRES_DB": postgres_db,
        "POSTGRES_USER": postgres_user,
        "POSTGRES_PASS": postgres_pass,
        "POSTGRES_HOST": postgres_host,
        "POSTGRES_RO_HOST": postgres_ro_host,
    }

    if all(option_credentials.values()):
        print("reading credentials from options")
        environ.update(option_credentials)
    elif credentials_bucket and credentials_prefix:
        path = create_path(credentials_protocol, credentials_bucket, credentials_prefix)
        print(f"reading credentials from {path}")
        creds = spark.read.json(path, multiLine=True).first().asDict()
        environ.update(creds)
    else:
        print("assuming credentials from the environment")

    # Attempt a database connection now so we can fail fast if credentials are broken.
    db._preparedb()

    channels = [channel.strip() for channel in channels.split(",")]
    print(f"Running job for {date}")
    aggregates = aggregator.aggregate_metrics(
        spark.sparkContext,
        channels,
        date,
        num_reducers=num_partitions,
        source=source,
        project_id=project_id,
        dataset_id=dataset_id,
        avro_prefix=avro_prefix,
    )
    aggregates[0].cache()
    aggregates[1].cache()
    print(f"Number of build-id aggregates: {aggregates[0].count()}")
    print(f"Number of submission date aggregates: {aggregates[1].count()}")

    # Store the results in Postgres.
    db.submit_aggregates(aggregates)
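# Hedged sketch: run_aggregator above is written to be driven by a command-line
# wrapper. The click-based wrapper below is an assumption, not the project's actual
# CLI; the option names simply mirror the function's parameters, and the defaults
# shown here are illustrative only.
import click


@click.command()
@click.option("--date", required=True)
@click.option("--channels", default="nightly")
@click.option("--credentials-protocol", default="s3")
@click.option("--credentials-bucket", default=None)
@click.option("--credentials-prefix", default=None)
@click.option("--postgres-db", default=None)
@click.option("--postgres-user", default=None)
@click.option("--postgres-pass", default=None)
@click.option("--postgres-host", default=None)
@click.option("--postgres-ro-host", default=None)
@click.option("--num-partitions", default=10000, type=int)
@click.option("--source", default="moztelemetry")
@click.option("--project-id", default=None)
@click.option("--dataset-id", default=None)
@click.option("--avro-prefix", default=None)
def main(**kwargs):
    # click passes each option as a keyword argument whose name matches a
    # run_aggregator parameter, so the call can forward them directly.
    run_aggregator(**kwargs)


if __name__ == "__main__":
    main()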