def update_published_v2_files(sdb, from_submission_date=None,
                              to_submission_date=None, limit=None):
    # Index v2 published files into SimpleDB. `sdb` is expected to contain one
    # entry per YYYYMM month prefix of the submission dates we want to index.
    s3 = S3Connection()
    bucket_name = "telemetry-published-v2"
    bucket = s3.get_bucket(bucket_name)
    schema_key = bucket.get_key("telemetry_schema.json")
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))

    # Allow a clean shutdown on Ctrl-C: finish the current key, then stop.
    termination_requested = [False]

    def keyboard_interrupt_handler(signum, frame):
        termination_requested[0] = True
    signal.signal(signal.SIGINT, keyboard_interrupt_handler)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    while not done:
        try:
            for key in bucket.list(marker=last_key):
                last_key = key.name
                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".format(
                        total_count, delta_sec(start_time), added_count))
                dims = schema.get_dimension_map(
                    schema.get_dimensions(".", key.name))
                if (from_submission_date is None or dims["submission_date"] >= from_submission_date) and \
                   (to_submission_date is None or dims["submission_date"] <= to_submission_date) and \
                   dims["submission_date"][:-2] in sdb and \
                   dims["reason"] != "idle_daily":
                    attributes = {
                        "reason":           dims.get("reason"),
                        "appName":          dims.get("appName"),
                        "appUpdateChannel": dims.get("appUpdateChannel"),
                        "appVersion":       dims.get("appVersion"),
                        "appBuildID":       dims.get("appBuildID"),
                        "submissionDate":   dims.get("submission_date"),
                    }
                    batch.put(dims["submission_date"][:-2], key.name, attributes)
                    added_count += 1
                total_count += 1
                if total_count == limit or termination_requested[0]:
                    done = True
                    break
        except Exception as e:
            # S3 listings occasionally fail mid-way; log and resume from the
            # last key we saw instead of starting over.
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue
        break
    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))
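# --- Hedged usage sketch (not part of the original module) -------------------
# update_published_v2_files() only requires that `sdb` behave like a mapping
# whose keys are YYYYMM month prefixes (see the `dims["submission_date"][:-2]
# in sdb` check above); BatchPut presumably resolves those same keys to
# SimpleDB domains. The helper below builds such a mapping with boto; the
# "telemetry_v2_" domain-name prefix is an illustrative assumption, not
# necessarily the name used in production.
import boto.sdb

def build_month_domain_map(months, region="us-west-2", name_prefix="telemetry_v2_"):
    conn = boto.sdb.connect_to_region(region)
    # CreateDomain is idempotent, so this is safe if the domains already exist.
    return dict((month, conn.create_domain(name_prefix + month)) for month in months)

# Example (hypothetical dates and limit):
#   sdb = build_month_domain_map(["201403", "201404"])
#   update_published_v2_files(sdb, from_submission_date="20140301",
#                             to_submission_date="20140430", limit=100000)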
def update_published_v4_files(sdb, bucket, bucket_prefix, submission_date, limit=None):
    s3 = S3Connection()
    metadata = s3.get_bucket(METADATA_BUCKET)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = s3.get_bucket(bucket)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(bucket_prefix, submission_date) if submission_date else bucket_prefix
    print "Bucket: {} - Prefix: {} - Date: {}".format(bucket.name, bucket_prefix, submission_date)
    while not done:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                last_key = key.name
                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".format(
                        total_count, delta_sec(start_time), added_count))
                dims = schema.get_dimension_map(
                    schema.get_dimensions(".", key.name[len(bucket_prefix) + 1:], dirs_only=True))
                if (dims["submissionDate"] == submission_date) and dims["submissionDate"][:-2] in sdb:
                    batch.put(dims["submissionDate"][:-2], key.name, dims)
                    added_count += 1
                total_count += 1
                if total_count == limit:
                    done = True
                    break
        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue
        break
    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))
# Variant of update_published_v4_files that connects through an explicit S3
# endpoint and skips bucket validation. If both definitions live in the same
# module, this later one shadows the definition above.
def update_published_v4_files(sdb, bucket, bucket_prefix, submission_date, limit=None):
    conn = boto.connect_s3(host=S3_DEFAULT_ENDPOINT)
    metadata = conn.get_bucket(METADATA_BUCKET, validate=False)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = conn.get_bucket(bucket, validate=False)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(bucket_prefix, submission_date) if submission_date else bucket_prefix
    print "Bucket: {} - Prefix: {} - Date: {}".format(bucket.name, bucket_prefix, submission_date)
    while not done:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                last_key = key.name
                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".format(
                        total_count, delta_sec(start_time), added_count))
                dims = schema.get_dimension_map(
                    schema.get_dimensions(".", key.name[len(bucket_prefix) + 1:], dirs_only=True))
                if (dims["submissionDate"] == submission_date) and dims["submissionDate"][:-2] in sdb:
                    batch.put(dims["submissionDate"][:-2], key.name, dims)
                    added_count += 1
                total_count += 1
                if total_count == limit:
                    done = True
                    break
        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue
        break
    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))
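# --- Hedged usage sketch (not part of the original module) -------------------
# Shows the call shape of the v4 updater above: list keys under
# "<bucket_prefix>/<submission_date>" and index each one into the SimpleDB
# domain for that month. The bucket and prefix names are placeholders, and
# build_month_domain_map() is the illustrative helper sketched earlier.
def example_update_v4_for_day(submission_date="20150401"):
    sdb = build_month_domain_map([submission_date[:-2]],
                                 name_prefix="telemetry_v4_")
    update_published_v4_files(sdb,
                              "example-telemetry-data-bucket",
                              "example-prefix",
                              submission_date,
                              limit=100000)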
class AnalysisJob:
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None
        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None
        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit
        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"
        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"
        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue

    def get_filtered_files(self):
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        # date_limit is a hack that makes it easy to launch everything before
        # a given date, e.g. to back-process everything already in the bucket.
        if self.date_limit is not None:
            print "Launching limiting to before " + self.date_limit
            for k, s in self.list_partitions(bucket):
                if k.split('/')[-1].split('.')[1] < self.date_limit:
                    yield (k, s)
        else:
            for k, s in self.list_partitions(bucket):
                yield (k, s)

    def get_filtered_files_old(self):
        """ Get tuples of name and size for all input files """
        # Set up some auxiliary functions
        allowed_values = self.input_filter.sanitize_allowed_values()
        nb_dims = len(allowed_values)

        def filter_includes(level, value):
            return self.input_filter.is_allowed(value, allowed_values[level])

        # Iterate over all files in the bucket. This is very slow; we should
        # be able to do something much smarter using prefix listing and
        # ordering to cut the listing short.
        count = 0
        selected = 0
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        for f in bucket.list():
            count += 1
            dims = self.input_filter.get_dimensions(".", f.key)
            include = True
            for i in xrange(nb_dims):
                if not filter_includes(i, dims[i]):
                    include = False
                    break
            if include:
                selected += 1
                yield (f.key, f.size)
            if count % 5000 == 0:
                print "%i files listed with %i selected" % (count, selected)
        conn.close()

    def list_partitions(self, bucket, prefix='', level=0):
        if self.input_filter:
            #print "Listing...", prefix, level
            allowed_values = self.input_filter.sanitize_allowed_values()
            delimiter = '/'
            if level > 3:
                delimiter = '.'
            for k in bucket.list(prefix=prefix, delimiter=delimiter):
                partitions = k.name.split("/")
                if level > 3:
                    # Split the last couple of partition components by "."
                    # instead of "/"
                    partitions.extend(partitions.pop().split(".", 2))
                if self.input_filter.is_allowed(partitions[level], allowed_values[level]):
                    if level >= 5:
                        for f in bucket.list(prefix=k.name):
                            yield (f.key, f.size)
                    else:
                        for k, s in self.list_partitions(bucket, k.name, level + 1):
                            yield (k, s)
        elif self.input_list:
            print "Using input list..."
            for line in self.input_list:
                key_name = line.strip()
                k = bucket.get_key(key_name)
                yield (k.key, k.size)
        else:
            print "Don't know how to list partitions without a filter or list :("
            raise ValueError("Missing both input_filter and input_list")

    def generate_tasks(self):
        """ Generate SQS tasks; small files are batched into a single task """
        taskid = 1
        taskfiles = []
        tasksize = 0
        total_size_of_all = 0
        for key, size in self.get_filtered_files():
            # If the task has reached the desired size we yield it.
            # Note: SQS message bodies are limited to roughly 65 KiB (boto
            # only uses signature version 4), so for simplicity we also cap
            # each task at 200 filenames.
            if 0 < len(taskfiles) and (tasksize + size > self.task_size_limit or
                                       len(taskfiles) > 200):
                # Reduce to only filenames, sorted by size, smallest first;
                # they are faster to download when handling the job.
                taskfiles = [f for f, s in sorted(taskfiles, key=lambda (f, s): s)]
                yield {
                    'id': self.job_id + "/" + str(taskid),
                    'name': self.job_name,
                    'owner': self.job_owner,
                    'code': self.s3_code_path,
                    'target-queue': self.target_queue,
                    'files': taskfiles,
                    'size': tasksize
                }
                total_size_of_all += tasksize
                print "%i tasks created acc. size: %s" % (taskid, total_size_of_all)
                taskid += 1
                taskfiles = []
                tasksize = 0
            tasksize += size
            taskfiles.append((key, size))
        if len(taskfiles) > 0:
            taskfiles = [f for f, s in sorted(taskfiles, key=lambda (f, s): s)]
            yield {
                'id': self.job_id + "/" + str(taskid),
                'name': self.job_name,
                'owner': self.job_owner,
                'code': self.s3_code_path,
                'target-queue': self.target_queue,
                'files': taskfiles,
                'size': tasksize
            }
        print "Finished:"
        print "%i tasks created total size: %s" % (taskid, total_size_of_all + tasksize)

    def put_sqs_tasks(self):
        """ Create SQS tasks for this analysis job """
        print "Populate SQS input queue with tasks"
        # Connect to SQS in the desired region
        conn = sqs.connect_to_region(self.aws_region,
                                     aws_access_key_id=self.aws_key,
                                     aws_secret_access_key=self.aws_secret_key)
        # Fetch the input queue and use JSON-encoded messages
        queue = conn.get_queue(self.sqs_input_name)
        queue.set_message_class(JSONMessage)
        # Populate the queue with tasks
        for task in self.generate_tasks():
            #print "enqueueing", task["id"], "size:", task["size"]
            msg = queue.new_message(body=task)
            queue.write(msg)
        conn.close()

    def setup(self):
        self.upload_job_bundle()
        self.put_sqs_tasks()
        print "Uploaded with job_id: %s" % self.job_id

    def upload_job_bundle(self):
        """ Upload the job bundle to S3 """
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.analysis_bucket)
        k = Key(bucket)
        k.key = self.s3_code_path
        k.set_contents_from_filename(self.job_bundle)
        conn.close()
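# --- Hedged usage sketch (not part of the original module) -------------------
# AnalysisJob only reads attributes off `cfg` (job_bundle, input_filter,
# input_list_file, target_queue, aws_key, aws_secret_key, name, owner,
# date_limit, sqs_queue), so any object exposing those will do. The argparse
# wiring below illustrates that contract; it is an assumption, not the repo's
# actual command-line interface.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Queue a telemetry analysis job")
    parser.add_argument("job_bundle", help="tar.gz bundle with the job code")
    parser.add_argument("--input-filter", dest="input_filter", default=None,
                        help="path to a TelemetrySchema JSON filter")
    parser.add_argument("--input-list-file", dest="input_list_file", default=None,
                        type=argparse.FileType("r"),
                        help="file with one input key name per line")
    parser.add_argument("--target-queue", dest="target_queue", required=True)
    parser.add_argument("--aws-key", dest="aws_key", required=True)
    parser.add_argument("--aws-secret-key", dest="aws_secret_key", required=True)
    parser.add_argument("--name", required=True)
    parser.add_argument("--owner", required=True)
    parser.add_argument("--date-limit", dest="date_limit", default=None)
    parser.add_argument("--sqs-queue", dest="sqs_queue", required=True)
    return parser.parse_args()

if __name__ == "__main__":
    job = AnalysisJob(parse_args())
    job.setup()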