def export_batch(self, data_dir, conn, bucket, files):
    print self.label, "Uploading", ",".join(files)
    if self.dry_run:
        return 0

    # Time the s3funnel call:
    start = datetime.now()
    result = subprocess.call(self.s3f_cmd + files, cwd=data_dir)
    sec = timer.delta_sec(start)

    total_size = 0
    if result == 0:
        # Success! Verify each file's checksum, then truncate it.
        for f in files:
            # Verify checksum and track cumulative size so we can figure out MB/s
            full_filename = os.path.join(data_dir, f)
            md5, size = fileutil.md5file(full_filename)
            total_size += size
            # f is the key name - it does not include the full path to the
            # data dir.
            key = bucket.get_key(f)
            # Strip quotes from md5
            remote_md5 = key.etag[1:-1]
            if md5 != remote_md5:
                # TODO: add it to a "failed" queue.
                print "ERROR: %s failed checksum verification: Local=%s, Remote=%s" % (f, md5, remote_md5)
                self.bad_records += 1
                result = -1
            # TODO: else add it to a "succeeded" queue and remove it locally.
    else:
        print "Failed to upload one or more files in the current batch. Error code was", result

    total_mb = float(total_size) / 1024.0 / 1024.0
    print "Transferred %.2fMB in %.2fs (%.2fMB/s)" % (total_mb, sec, total_mb / sec)
    return result
def list_files(bucket_name, output_file, output_func=s3obj_to_string, prefix=''):
    s3 = S3Connection()
    bucket = s3.get_bucket(bucket_name)
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    while not done:
        try:
            for k in bucket.list(prefix=prefix, marker=last_key):
                last_key = k.name
                total_count += 1
                if total_count % 5000 == 0:
                    print "Looked at", total_count, "total records in", timer.delta_sec(start_time), "seconds. Last key was", last_key
                try:
                    output_file.write(str(output_func(k)) + "\n")
                except Exception, e:
                    print "Error writing key", k.name, ":", e
                    traceback.print_exc()
            done = True
        except socket.error, e:
            print "Error listing keys:", e
            traceback.print_exc()
            print "Continuing from last seen key:", last_key
def fetch_remotes(self, remotes):
    # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
    remote_names = [r["name"] for r in remotes if r["type"] == "remote"]

    # TODO: check cache first.
    result = 0
    if len(remote_names) == 0:
        return result

    fetch_cwd = os.path.join(self._work_dir, "cache")
    if not os.path.isdir(fetch_cwd):
        os.makedirs(fetch_cwd)
    loader = s3util.Loader(fetch_cwd, self._bucket_name,
                           aws_key=self._aws_key,
                           aws_secret_key=self._aws_secret_key)
    start = datetime.now()
    downloaded_bytes = 0
    for local, remote, err in loader.get_list(remote_names):
        if err is None:
            print "Downloaded", remote
            downloaded_bytes += os.path.getsize(local)
        else:
            print "Failed to download", remote
            result += 1
    duration_sec = timer.delta_sec(start)
    downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
    print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
    return result
def run_mr(filter, output_file, local_only, streaming):
    args = {
        "job_script": "../bucketless_uitour.py",
        "input_filter": filter,
        "num_mappers": 16,
        "num_reducers": 4,
        "data_dir": "../work/cache",
        "work_dir": "../work",
        "output": output_file,
        "bucket": "telemetry-published-v2",
        "local_only": local_only,
        "delete_data": streaming
    }

    if not args["local_only"]:
        if not BOTO_AVAILABLE:
            print "ERROR: The 'boto' library is required except in 'local-only' mode."
            print " You can install it using `sudo pip install boto`"
            parser.print_help()
            return -2

    job = Job(args)
    start = datetime.now()
    exit_code = 0
    try:
        job.mapreduce()
    except:
        traceback.print_exc(file=sys.stderr)
        exit_code = 2
    duration = timer.delta_sec(start)
    print "All done in %.2fs" % (duration)
    return (exit_code, output_file)
def export(self, uploadables):
    if len(uploadables) == 0:
        print "Nothing to do!"
        return 0
    print "Found", len(uploadables), "files"

    fail_count = 0
    start = datetime.now()
    total_size = 0
    for local, remote, err in self.s3loader.put_list(uploadables):
        if err is None:
            # Great Success! Delete it locally.
            total_size += os.path.getsize(local)
            if self.keep_backups:
                # Keep a copy of the original, just in case.
                os.rename(local, local + ".uploaded")
            else:
                os.remove(local)
            # Send a message to SQS
            # TODO: verify that it succeeded.
            self.enqueue_incoming(remote)
        else:
            fail_count += 1
            print "Failed to upload '{0}' to bucket {1} as '{2}':".format(local, self.bucket, remote), err
    sec = timer.delta_sec(start)
    total_mb = float(total_size) / 1024.0 / 1024.0
    print "Transferred %.2fMB in %.2fs (%.2fMB/s)" % (total_mb, sec, total_mb / sec)
    # TODO: log the transfer stats properly.

    # Return zero for overall success or the number of failures.
    return fail_count
def main():
    parser = argparse.ArgumentParser(description='Run a MapReduce Job.', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("job_script", help="The MapReduce script to run")
    parser.add_argument("-l", "--local-only", help="Only process local files (exclude S3 data)", action="store_true")
    parser.add_argument("-m", "--num-mappers", metavar="N", help="Start N mapper processes", type=int, default=4)
    parser.add_argument("-r", "--num-reducers", metavar="N", help="Start N reducer processes", type=int, default=1)
    parser.add_argument("-d", "--data-dir", help="Base data directory", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key", default=None)
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", default=None)
    parser.add_argument("-w", "--work-dir", help="Location to put temporary work files", default="/tmp/telemetry_mr")
    parser.add_argument("-o", "--output", help="Filename to use for final job output", required=True)
    # TODO: make the input filter optional, default to "everything valid" and generate dims intelligently.
    parser.add_argument("-f", "--input-filter", help="File containing filter spec", required=True)
    parser.add_argument("-v", "--verbose", help="Print verbose output", action="store_true")
    args = parser.parse_args()

    if not args.local_only:
        if not BOTO_AVAILABLE:
            print "ERROR: The 'boto' library is required except in 'local-only' mode."
            print " You can install it using `sudo pip install boto`"
            parser.print_help()
            sys.exit(-2)
        # If we want to process remote data, some more arguments are required.
        for remote_req in ["bucket"]:
            if not hasattr(args, remote_req) or getattr(args, remote_req) is None:
                print "ERROR:", remote_req, "is a required option"
                parser.print_help()
                sys.exit(-1)

    job = Job(args)
    start = datetime.now()
    job.mapreduce()
    duration = timer.delta_sec(start)
    print "All done in %.2fs" % (duration)
def get_filtered_files_s3(self):
    out_files = []
    if not self._local_only:
        print "Fetching file list from S3..."
        # Plain boto should be fast enough to list bucket contents.
        if self._aws_key is not None:
            conn = S3Connection(self._aws_key, self._aws_secret_key)
        else:
            conn = S3Connection()
        bucket = conn.get_bucket(self._bucket_name)
        start = datetime.now()
        count = 0
        # Filter input files by partition. If the filter is reasonably
        # selective, this can be much faster than listing all files in the
        # bucket.
        for f in s3util.list_partitions(bucket, schema=self._input_filter, include_keys=True):
            count += 1
            out_files.append(f)
            if count == 1 or count % 1000 == 0:
                print "Listed", count, "so far"
        conn.close()
        duration = timer.delta_sec(start)
        print "Listed", len(out_files), "files in", duration, "seconds"
    return out_files
class ExportCompressedStep(PipeStep):
    def __init__(self, num, name, q_in, log_file, stats_file, base_dir, config, dry_run):
        self.dry_run = dry_run
        self.base_dir = base_dir
        self.aws_key = config.get("aws_key", None)
        self.aws_secret_key = config.get("aws_secret_key", None)
        self.aws_bucket_name = config["publish_bucket"]
        PipeStep.__init__(self, num, name, q_in, log_file=log_file, stats_file=stats_file)

    def setup(self):
        if self.dry_run:
            self.conn = None
            self.bucket = None
            return
        self.conn = S3Connection(self.aws_key, self.aws_secret_key)
        self.bucket = self.conn.get_bucket(self.aws_bucket_name)

    def strip_data_dir(self, data_dir, full_file):
        if full_file.startswith(data_dir):
            chopped = full_file[len(data_dir):]
            if chopped[0] == "/":
                chopped = chopped[1:]
            return chopped
        else:
            raise ValueError("Invalid full filename: " + str(full_file))

    def handle(self, record):
        try:
            # Remove the output dir prefix from filenames
            stripped_name = self.strip_data_dir(self.base_dir, record)
        except ValueError, e:
            self.log("Warning: couldn't strip base dir from '{0}' " \
                     "{1}".format(record, e))
            stripped_name = record

        self.log("Uploading {0}".format(stripped_name))
        start = now()
        if self.dry_run:
            local_filename = record
            remote_filename = stripped_name
            err = None
        else:
            local_filename, remote_filename, err = s3util.upload_one([self.base_dir, self.bucket, stripped_name])
        sec = timer.delta_sec(start, now())

        current_size = os.path.getsize(record)
        self.stats.increment(records_read=1, bytes_read=current_size)
        if err is None:
            # Everything went well.
            self.stats.increment(records_written=1, bytes_written=current_size)
            # Delete local files once they've been uploaded successfully.
            if not self.dry_run:
                try:
                    os.remove(record)
                    self.log("Removed uploaded file {0}".format(record))
                except Exception, e:
                    self.log("Failed to remove uploaded file {0}: " \
                             "{1}".format(record, e))
def run_mr(filter, output_file, local_only):
    args = {
        "job_script": "../uitour.py",
        "input_filter": filter,
        "num_mappers": 16,
        "num_reducers": 4,
        "data_dir": "../work/cache",
        "work_dir": "../work",
        "output": output_file,
        "bucket": "telemetry-published-v2",
        "local_only": local_only
    }

    if not args["local_only"]:
        if not BOTO_AVAILABLE:
            print "ERROR: The 'boto' library is required except in 'local-only' mode."
            print " You can install it using `sudo pip install boto`"
            parser.print_help()
            return -2

    job = Job(args)
    start = datetime.now()
    exit_code = 0
    try:
        job.mapreduce()
    except:
        traceback.print_exc(file=sys.stderr)
        exit_code = 2
    duration = timer.delta_sec(start)
    print "All done in %.2fs" % (duration)
    return (exit_code, output_file)
def run_mapper(self, mapper_id, inputs, work_dir, module, partition_count, delete_files, aws_key, aws_secret_key, s3_bucket):
    self.work_dir = work_dir

    print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete."

    bytes_total = sum([f.size for f in inputs])
    bytes_completed = 0
    next_notice_pct = 10
    start = datetime.now()

    loader = None
    output_file = os.path.join(work_dir, "mapper_" + str(mapper_id))
    mapfunc = getattr(module, 'map', None)
    context = Context(output_file, partition_count)
    if not callable(mapfunc):
        print "No map function!!!"
        sys.exit(1)

    # TODO: Stream/decompress the files directly.
    for input_file in inputs:
        if input_file.remote:
            # TODO: check if the file already exists locally.
            # Lazy load the loader (so we don't do it on "local only" jobs).
            if loader is None:
                loader = s3util.Loader(os.path.join(self.work_dir, "cache"), s3_bucket, aws_key=aws_key, aws_secret_key=aws_secret_key, poolsize=1)
            for local, remote, err in loader.get_list([input_file.name]):
                if err is not None:
                    print "Failed to download", remote, ":", err
        try:
            handle = self.open_input_file(input_file)
        except:
            print "Error opening", input_file.name, "(skipping)"
            traceback.print_exc(file=sys.stderr)
            continue

        line_num = 0
        for line in handle:
            line_num += 1
            try:
                # Remove the trailing EOL character(s) before passing to
                # the map function.
                key, value = line.rstrip('\r\n').split("\t", 1)
                mapfunc(key, input_file.dimensions, value, context)
            except ValueError, e:
                # TODO: increment "bad line" metrics.
                print "Bad line:", input_file.name, ":", line_num, e
        handle.close()
        if delete_files:
            print "Removing", input_file.name
            os.remove(handle.filename)

        bytes_completed += input_file.size
        completed_pct = (float(bytes_completed) / bytes_total) * 100
        if completed_pct >= next_notice_pct:
            next_notice_pct += 10
            duration_sec = timer.delta_sec(start)
            completed_mb = float(bytes_completed) / 1024.0 / 1024.0
            print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % (mapper_id, completed_pct, completed_mb, duration_sec, completed_mb / duration_sec)
def dump_stats(self):
    duration = timer.delta_sec(self.start_time, self.end_time)
    read_rate = self.records_read / duration
    mb_read = self.bytes_read / 1024.0 / 1024.0
    mb_read_rate = mb_read / duration
    write_rate = self.records_written / duration
    mb_written = self.bytes_written / 1024.0 / 1024.0
    mb_write_rate = mb_written / duration
    print "%s: Read %d records or %.2fMB (%.2fr/s, %.2fMB/s), wrote %d or %.2f MB (%.2fr/s, %.2fMB/s). Found %d bad records" % (self.label, self.records_read, mb_read, read_rate, mb_read_rate, self.records_written, mb_written, write_rate, mb_write_rate, self.bad_records)
def handle(self, record):
    filename = record
    base_ends = filename.find(".log") + 4
    if base_ends < 4:
        self.log("Bad filename encountered, skipping: " + filename)
        self.stats.increment(records_read=1, bad_records=1,
                             bad_record_type="bad_filename")
        return
    basename = filename[0:base_ends]
    # Get a unique name for the compressed file:
    comp_name = basename + "." + uuid.uuid4().hex + StorageLayout.COMPRESSED_SUFFIX

    # reserve it!
    f_comp = open(comp_name, "wb")
    # TODO: open f_comp with same buffer size as below?

    # Rename uncompressed file to a temp name
    tmp_name = comp_name + ".compressing"
    os.rename(filename, tmp_name)

    # Read input file as text (line-buffered)
    f_raw = open(tmp_name, "r", 1)
    start = now()

    # Now set up our processing pipe:
    # - read from f_raw, compress, write to comp_name
    p_compress = Popen(self.compress_cmd, bufsize=65536, stdin=f_raw,
                       stdout=f_comp, stderr=sys.stderr)

    # Note: it looks like p_compress.wait() is what we want, but the docs
    # warn of a deadlock, so we use communicate() instead.
    p_compress.communicate()

    raw_bytes = f_raw.tell()
    comp_bytes = f_comp.tell()
    raw_mb = float(raw_bytes) / 1024.0 / 1024.0
    comp_mb = float(comp_bytes) / 1024.0 / 1024.0
    f_raw.close()
    f_comp.close()

    self.stats.increment(records_read=1, records_written=1,
                         bytes_read=raw_bytes, bytes_written=comp_bytes)

    # Remove raw file
    os.remove(tmp_name)
    sec = timer.delta_sec(start, now())
    self.log("Compressed %s as %s in %.2fs. Size before: %.2fMB, after:" \
             " %.2fMB (r: %.2fMB/s, w: %.2fMB/s)" % (filename, comp_name, sec,
                raw_mb, comp_mb, (raw_mb/sec), (comp_mb/sec)))
def import_files(self, input_directory):
    begin = datetime.now()

    processes = []
    self._enqueue_process(partial(self._master, input_directory), processes)

    for worker in range(0, self._n_workers):
        self._enqueue_process(partial(self._worker), processes)

    for p in processes:
        p.join()

    print("Files imported in", timer.delta_sec(begin), "seconds.")
def fetch_s3_files(incoming_files, fetch_cwd, bucket, aws_key, aws_secret_key):
    result = 0
    if len(incoming_files) > 0:
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)

        files = []
        for f in incoming_files:
            full_filename = os.path.join(fetch_cwd, f)
            if os.path.isfile(full_filename):
                md5, size = fileutil.md5file(full_filename)
                # f is the key name - it does not include the full path to the
                # data dir.
                key = bucket.get_key(f)
                # Strip quotes from md5
                remote_md5 = key.etag[1:-1]
                if md5 != remote_md5:
                    files.append(f)
                else:
                    print "Already downloaded", f
            else:
                files.append(f)

        fetch_cmd = [S3FUNNEL_PATH]
        fetch_cmd.append(bucket.name)
        fetch_cmd.append("get")
        fetch_cmd.append("-a")
        fetch_cmd.append(aws_key)
        fetch_cmd.append("-s")
        fetch_cmd.append(aws_secret_key)
        fetch_cmd.append("-t")
        fetch_cmd.append("8")
        # Fetch in batches of 8 files at a time
        while len(files) > 0:
            current_files = files[0:8]
            files = files[8:]
            start = datetime.now()
            result = subprocess.call(fetch_cmd + current_files, cwd=fetch_cwd)
            duration_sec = timer.delta_sec(start)
            # TODO: verify MD5s
            downloaded_bytes = sum([os.path.getsize(os.path.join(fetch_cwd, f)) for f in current_files])
            downloaded_mb = downloaded_bytes / 1024.0 / 1024.0
            print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
            if result != 0:
                break
    return result
def get_summary(self):
    duration = timer.delta_sec(self.start_time, self.end_time)
    read_rate = self.overall["records_read"] / duration
    mb_read = self.overall["bytes_read"] / 1024.0 / 1024.0
    mb_read_rate = mb_read / duration
    write_rate = self.overall["records_written"] / duration
    mb_written = self.overall["bytes_written"] / 1024.0 / 1024.0
    mb_write_rate = mb_written / duration
    summary = "Read %d records or %.2fMB (%.2fr/s, %.2fMB/s), " \
              "wrote %d or %.2f MB (%.2fr/s, %.2fMB/s). " \
              "Found %d bad records" % (self.overall["records_read"], mb_read,
                read_rate, mb_read_rate, self.overall["records_written"],
                mb_written, write_rate, mb_write_rate,
                self.overall["bad_records"])
    return summary
def save_map(self, channel_name, chan_stats):
    if self.stats_file is None:
        return
    chan_stats["task"] = self.task
    chan_stats["channel"] = channel_name
    chan_stats["start_time"] = datetime_to_json(self.start_time)
    chan_stats["end_time"] = datetime_to_json(self.end_time)
    chan_stats["duration"] = timer.delta_sec(self.start_time, self.end_time)
    try:
        with io.open(self.stats_file, "a") as fout:
            fout.write(unicode(json.dumps(chan_stats) + u"\n"))
    except:
        self.logger.log("Error writing '{}' stats".format(channel_name))
        self.logger.log(traceback.format_exc())
def run_mapper(self, mapper_id, inputs, work_dir, module, partition_count, delete_files, aws_key, aws_secret_key, s3_bucket):
    self.work_dir = work_dir

    print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete."

    bytes_total = sum([f.size for f in inputs])
    bytes_completed = 0
    next_notice_pct = 5
    start = datetime.now()

    loader = None
    output_file = os.path.join(work_dir, "mapper_" + str(mapper_id))
    mapfunc = getattr(module, 'map', None)
    context = Context(output_file, partition_count)
    if not callable(mapfunc):
        print "No map function!!!"
        sys.exit(1)

    for input_file in inputs:
        if input_file.remote:
            # Lazy load the loader (so we don't do it on "local only" jobs).
            if loader is None:
                loader = s3util.Loader(os.path.join(self.work_dir, "cache"), s3_bucket, aws_key=aws_key, aws_secret_key=aws_secret_key, poolsize=1)
            for local, remote, err in loader.get_list([input_file.name]):
                if err is not None:
                    print "Failed to download", remote, ":", err

        line_num = 0
        full_filename = os.path.join(self.work_dir, "cache", input_file.name)
        for r, _ in heka_message.unpack_file(full_filename):
            msg = heka_message_parser.parse_heka_record(r)
            line_num += 1
            try:
                mapfunc(msg["meta"]["documentId"], msg, context)
            except ValueError, e:
                # TODO: increment "bad line" metrics.
                print "Bad record:", input_file.name, ":", line_num, e
        if delete_files:
            os.remove(full_filename)

        bytes_completed += input_file.size
        completed_pct = (float(bytes_completed) / bytes_total) * 100
        if completed_pct >= next_notice_pct:
            next_notice_pct += 5
            duration_sec = timer.delta_sec(start)
            completed_mb = float(bytes_completed) / 1024.0 / 1024.0
            print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % (mapper_id, completed_pct, completed_mb, duration_sec, completed_mb / duration_sec)
def handle(self, record):
    filename = record
    base_ends = filename.find(".log") + 4
    if base_ends < 4:
        self.log("Bad filename encountered, skipping: " + filename)
        self.stats.increment(records_read=1, bad_records=1,
                             bad_record_type="bad_filename")
        return
    basename = filename[0:base_ends]
    # Get a unique name for the compressed file:
    comp_name = basename + "." + uuid.uuid4().hex + StorageLayout.COMPRESSED_SUFFIX
    comp_file = CompressedFile(comp_name, mode="w", open_now=True, compression_level=1)

    # Rename uncompressed file to a temp name
    tmp_name = comp_name + ".compressing"
    os.rename(filename, tmp_name)

    start = now()
    try:
        comp_file.compress_from(tmp_name, remove_original=False)
        comp_file.close()
    except Exception as e:
        self.stats.increment(records_read=1, bad_records=1,
                             bad_record_type="compression_error")
        self.log("Error compressing file {0}: {1}".format(filename, e))
        return

    raw_bytes = os.stat(tmp_name).st_size
    comp_bytes = os.stat(comp_name).st_size
    raw_mb = float(raw_bytes) / 1024.0 / 1024.0
    comp_mb = float(comp_bytes) / 1024.0 / 1024.0

    self.stats.increment(records_read=1, records_written=1,
                         bytes_read=raw_bytes, bytes_written=comp_bytes)

    # Remove raw file
    os.remove(tmp_name)
    sec = timer.delta_sec(start, now())
    self.log("Compressed %s as %s in %.2fs. Size before: %.2fMB, after:" \
             " %.2fMB (r: %.2fMB/s, w: %.2fMB/s)" % (filename, comp_name, sec,
                raw_mb, comp_mb, (raw_mb/sec), (comp_mb/sec)))
def work(self):
    print self.label, "Starting up"
    while True:
        try:
            raw = self.q_in.get()
            if raw == PipeStep.SENTINEL:
                break
            self.handle(raw)
            self.records_read += 1
            if self.print_stats:
                this_update = datetime.now()
                if timer.delta_sec(self.last_update, this_update) > 10.0:
                    self.last_update = this_update
                    self.dump_stats()
            self.end_time = datetime.now()
        except Q.Empty:
            break
    print self.label, "Received stop message... all done"
def main():
    parser = argparse.ArgumentParser(description='Run a MapReduce Job.', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("job_script", help="The MapReduce script to run")
    parser.add_argument("-l", "--local-only", help="Only process local files (exclude S3 data)", action="store_true")
    parser.add_argument("-m", "--num-mappers", metavar="N", help="Start N mapper processes", type=int, default=4)
    parser.add_argument("-r", "--num-reducers", metavar="N", help="Start N reducer processes", type=int, default=1)
    parser.add_argument("-d", "--data-dir", help="Base data directory", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key", default=None)
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", default=None)
    parser.add_argument("-w", "--work-dir", help="Location to put temporary work files", default="/tmp/telemetry_mr")
    parser.add_argument("-o", "--output", help="Filename to use for final job output", required=True)
    # TODO: make the input filter optional, default to "everything valid" and generate dims intelligently.
    parser.add_argument("-f", "--input-filter", help="File containing filter spec", required=True)
    parser.add_argument("-v", "--verbose", help="Print verbose output", action="store_true")
    parser.add_argument("-X", "--delete-data", help="Delete raw data files after mapping", action="store_true")
    parser.add_argument("-p", "--profile", help="Profile mappers and reducers using cProfile", action="store_true")
    args = parser.parse_args()

    if not args.local_only:
        if not BOTO_AVAILABLE:
            print "ERROR: The 'boto' library is required except in 'local-only' mode."
            print " You can install it using `sudo pip install boto`"
            parser.print_help()
            return -2
        # If we want to process remote data, some more arguments are required.
        for remote_req in ["bucket"]:
            if not hasattr(args, remote_req) or getattr(args, remote_req) is None:
                print "ERROR:", remote_req, "is a required option"
                parser.print_help()
                return -1

    args = args.__dict__
    job = Job(args)
    start = datetime.now()
    exit_code = 0
    try:
        job.mapreduce()
    except:
        traceback.print_exc(file=sys.stderr)
        exit_code = 2
    duration = timer.delta_sec(start)
    print "All done in %.2fs" % (duration)
    return exit_code
def work(self):
    self.log("Starting up")
    while True:
        try:
            raw = self.q_in.get()
            if raw == PipeStep.SENTINEL:
                break
            self.stats.reset()
            self.handle(raw)
            self.stats.update_end_time()
            self.stats.save()
            if self.print_stats:
                this_update = now()
                if timer.delta_sec(self.last_update, this_update) > 10.0:
                    self.last_update = this_update
                    self.log(self.stats.get_summary())
        except Q.Empty:
            break
    self.log("Received stop message... work done")
def get_filtered_files_s3(self):
    if not self._local_only:
        print "Fetching file list from S3..."
        # Plain boto should be fast enough to list bucket contents.
        if self._aws_key is not None:
            conn = S3Connection(self._aws_key, self._aws_secret_key)
        else:
            conn = S3Connection()
        bucket = conn.get_bucket(self._bucket_name)
        start = datetime.now()
        count = 0
        # Filter input files by partition. If the filter is reasonably
        # selective, this can be much faster than listing all files in the
        # bucket.
        for f in s3util.list_partitions(bucket, schema=self._input_filter, include_keys=True):
            count += 1
            if count == 1 or count % 1000 == 0:
                print "Listed", count, "so far"
            yield f
        conn.close()
        duration = timer.delta_sec(start)
        print "Listed", count, "files in", duration, "seconds"
def fetch_s3_files(files, fetch_cwd, bucket_name, aws_key, aws_secret_key):
    result = 0
    if len(files) > 0:
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        fetch_cmd = ["/usr/local/bin/s3funnel"]
        fetch_cmd.append(bucket_name)
        fetch_cmd.append("get")
        fetch_cmd.append("-a")
        fetch_cmd.append(aws_key)
        fetch_cmd.append("-s")
        fetch_cmd.append(aws_secret_key)
        fetch_cmd.append("-t")
        fetch_cmd.append("8")
        start = datetime.now()
        result = subprocess.call(fetch_cmd + files, cwd=fetch_cwd)
        duration_sec = timer.delta_sec(start)
        # TODO: verify MD5s
        downloaded_bytes = sum([os.path.getsize(os.path.join(fetch_cwd, f)) for f in files])
        downloaded_mb = downloaded_bytes / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
    return result
class ReadRawStep(PipeStep):
    UUID_ONLY_PATH = re.compile('^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$')

    def __init__(self, num, name, raw_files, completed_files, log_file, stats_file, schema, converter, storage, bad_filename):
        self.schema = schema
        self.converter = converter
        self.storage = storage
        self.bad_filename = bad_filename
        PipeStep.__init__(self, num, name, raw_files, completed_files, log_file, stats_file)

    def setup(self):
        self.expected_dim_count = len(self.schema._dimensions)

    def handle(self, raw_file):
        self.log("Reading " + raw_file)
        try:
            record_count = 0
            bytes_read = 0
            start = now()
            file_version = fileutil.detect_file_version(raw_file, simple_detection=True)
            self.log("Detected version {0} for file {1}".format(file_version, raw_file))
            for unpacked in fileutil.unpack(raw_file, file_version=file_version):
                record_count += 1
                common_bytes = unpacked.len_path + fileutil.RECORD_PREAMBLE_LENGTH[file_version]
                current_bytes = common_bytes + unpacked.len_data
                current_bytes_uncompressed = common_bytes + len(unpacked.data)
                bytes_read += current_bytes
                if unpacked.error:
                    self.log("ERROR: Found corrupted data for record {0} in " \
                             "{1} path: {2} Error: {3}".format(record_count,
                                raw_file, unpacked.path, unpacked.error))
                    self.stats.increment(records_read=1,
                                         bytes_read=current_bytes,
                                         bytes_uncompressed=current_bytes_uncompressed,
                                         bad_records=1,
                                         bad_record_type="corrupted_data")
                    continue
                if len(unpacked.data) == 0:
                    self.log("WARN: Found empty data for record {0} in " \
                             "{1} path: {2}".format(record_count, raw_file,
                                unpacked.path))
                    self.stats.increment(records_read=1,
                                         bytes_read=current_bytes,
                                         bytes_uncompressed=current_bytes_uncompressed,
                                         bad_records=1,
                                         bad_record_type="empty_data")
                    continue

                submission_date = ts_to_yyyymmdd(unpacked.timestamp)
                path = unicode(unpacked.path, errors="replace")

                if unpacked.data[0] != "{":
                    # Data looks weird, should be JSON.
                    self.log("Warning: Found unexpected data for record {0}" \
                             " in {1} path: {2} data:\n{3}".format(record_count,
                                raw_file, path, unpacked.data))
                else:
                    # Raw JSON, make sure we treat it as unicode.
                    unpacked.data = unicode(unpacked.data, errors="replace")

                path_components = path.split("/")
                if len(path_components) != self.expected_dim_count:
                    # We're going to pop the ID off, but we'll also add the
                    # submission date, so it evens out.
                    bad_record_type = "invalid_path"
                    if ReadRawStep.UUID_ONLY_PATH.match(path):
                        bad_record_type = "uuid_only_path"
                    else:
                        self.log("Found an invalid path in record {0}: " \
                                 "{1}".format(record_count, path))
                    self.stats.increment(records_read=1,
                                         bytes_read=current_bytes,
                                         bytes_uncompressed=current_bytes_uncompressed,
                                         bad_records=1,
                                         bad_record_type=bad_record_type)
                    continue

                key = path_components.pop(0)
                info = {}
                info["reason"] = path_components.pop(0)
                info["appName"] = path_components.pop(0)
                info["appVersion"] = path_components.pop(0)
                info["appUpdateChannel"] = path_components.pop(0)
                info["appBuildID"] = path_components.pop(0)
                dims = self.schema.dimensions_from(info, submission_date)
                channel = self.schema.get_field(dims, "appUpdateChannel", True, True)

                self.stats.increment(channel=channel, records_read=1,
                                     bytes_read=current_bytes,
                                     bytes_uncompressed=current_bytes_uncompressed)
                try:
                    # Convert data:
                    if self.converter is None:
                        serialized_data = unpacked.data
                        # TODO: Converter.VERSION_UNCONVERTED
                        data_version = 1
                    else:
                        parsed_data, parsed_dims = self.converter.convert_json(unpacked.data, dims[-1], unpacked.ip)
                        # TODO: take this out if it's too slow
                        for i in range(len(dims)):
                            if dims[i] != parsed_dims[i]:
                                self.log("Record {0} mismatched dimension " \
                                         "{1}: '{2}' != '{3}'".format(record_count,
                                            i, dims[i], parsed_dims[i]))
                        serialized_data = self.converter.serialize(parsed_data)
                        dims = parsed_dims
                        # TODO: Converter.VERSION_CONVERTED
                        data_version = 2
                    try:
                        # Write to persistent storage
                        n = self.storage.write(key, serialized_data, dims, data_version)
                        self.stats.increment(channel=channel, records_written=1,
                                             bytes_written=len(key) + len(serialized_data) + 2)
                        # Compress rotated files as we generate them
                        if n.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX):
                            self.q_out.put(n)
                    except Exception, e:
                        self.write_bad_record(key, dims, serialized_data, str(e),
                                              "ERROR Writing to output file:", "write_failed")
                except BadPayloadError, e:
                    self.write_bad_record(key, dims, unpacked.data, e.msg,
                                          "Bad Payload:", "bad_payload")
                except Exception, e:
                    err_message = str(e)
                    if err_message == "Missing in payload: info.revision":
                        # We don't need to write these bad records out - we know
                        # why they are being skipped.
                        self.stats.increment(channel=channel, bad_records=1,
                                             bad_record_type="missing_revision")
                    elif err_message == "Invalid revision URL: /rev/":
                        # We do want to log these payloads, but we don't want
                        # the full stack trace.
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                              "Conversion Error", "missing_revision_repo")
                    # Don't split this long string - we want to be able to find it in the code
                    elif err_message.startswith("JSONDecodeError: Invalid control character"):
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                              "Conversion Error", "invalid_control_char")
                    else:
                        # TODO: Recognize other common failure modes and handle
                        # them gracefully.
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                              "Conversion Error", "conversion_error")
                        self.log(traceback.format_exc())

                if self.print_stats:
                    this_update = now()
                    sec = timer.delta_sec(self.last_update, this_update)
                    if sec > 10.0:
                        self.last_update = this_update
                        self.log(self.stats.get_summary())
def main():
    parser = argparse.ArgumentParser(description='Process incoming Telemetry data', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket", help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket", help="The S3 bucket to save processed files")
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", required=True)
    parser.add_argument("-r", "--aws-region", help="AWS Region", default="us-west-2")
    parser.add_argument("-w", "--work-dir", help="Location to cache downloaded files", required=True)
    parser.add_argument("-o", "--output-dir", help="Base dir to store processed data", required=True)
    parser.add_argument("-i", "--input-files", help="File containing a list of keys to process", type=file)
    parser.add_argument("-b", "--bad-data-log", help="Save bad records to this file")
    parser.add_argument("-q", "--queue", help="SQS Queue name to poll for incoming data")
    parser.add_argument("-c", "--histogram-cache-path", help="Path to store a local cache of histograms", default="./histogram_cache")
    parser.add_argument("-t", "--telemetry-schema", help="Location of the desired telemetry schema", required=True)
    parser.add_argument("-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000)
    parser.add_argument("-D", "--dry-run", help="Don't modify remote files", action="store_true")
    parser.add_argument("-C", "--skip-conversion", help="Skip validation/conversion of payloads", action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(S3FUNNEL_PATH):
        print "ERROR: s3funnel not found at", S3FUNNEL_PATH
        print "You can get it from github: https://github.com/sstoiana/s3funnel"
        return -1

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    if args.skip_conversion:
        converter = None
    else:
        converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    num_cpus = multiprocessing.cpu_count()

    start = datetime.now()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    incoming_queue_messages = []

    if not args.dry_run:
        conn = S3Connection(args.aws_key, args.aws_secret_key)
        incoming_bucket = conn.get_bucket(args.incoming_bucket)

    incoming_filenames = []
    if args.queue is not None:
        print "Fetching file list from queue", args.queue
        if args.dry_run:
            print "Dry run mode... can't read from the queue without messing things up..."
        else:
            q_conn = boto.sqs.connect_to_region(args.aws_region,
                    aws_access_key_id=args.aws_key,
                    aws_secret_access_key=args.aws_secret_key)
            incoming_queue = q_conn.get_queue(args.queue)
            if incoming_queue is None:
                print "Error: could not get queue", args.queue
                return -2
            # Sometimes we don't get all the messages, even if more are
            # available, so keep trying until we have enough (or there aren't
            # any left)
            for i in range(num_cpus):
                messages = incoming_queue.get_messages(num_cpus - len(incoming_filenames))
                for m in messages:
                    # TODO: Make sure this file exists in S3 first?
                    possible_filename = m.get_body()
                    key = incoming_bucket.get_key(possible_filename)
                    if key is None:
                        print "Could not find queued filename in bucket", args.incoming_bucket, ":", possible_filename
                        # try to delete it:
                        incoming_queue.delete_message(m)
                    else:
                        incoming_filenames.append(possible_filename)
                        incoming_queue_messages.append(m)
                if len(messages) == 0 or len(incoming_filenames) >= num_cpus:
                    break
    elif args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [l.strip() for l in args.input_files.readlines()]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    if len(incoming_filenames) == 0:
        print "Nothing to do!"
        return 0

    for f in incoming_filenames:
        print " ", f

    print "Verifying that we can write to", args.publish_bucket
    if args.dry_run:
        print "Dry run mode: don't care!"
    else:
        try:
            publish_bucket = conn.get_bucket(args.publish_bucket)
            print "Looks good!"
        except S3ResponseError:
            print "Bucket", args.publish_bucket, "not found. Attempting to create it."
            publish_bucket = conn.create_bucket(args.publish_bucket)

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    if args.dry_run:
        print "Dry run mode: skipping download from S3"
    else:
        result = fetch_s3_files(incoming_filenames, args.work_dir,
                                incoming_bucket, args.aws_key,
                                args.aws_secret_key)
    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    after_download = datetime.now()

    local_filenames = [os.path.join(args.work_dir, f) for f in incoming_filenames]

    # TODO: try a SimpleQueue
    raw_files = Queue()
    for l in local_filenames:
        raw_files.put(l)

    completed_files = Queue()
    compressed_files = Queue()

    # Begin reading raw input
    raw_readers = start_workers(num_cpus, "Reader", ReadRawStep, raw_files,
                                (completed_files, schema, converter, storage,
                                 args.bad_data_log))

    # Tell readers when to stop:
    for i in range(num_cpus):
        raw_files.put(PipeStep.SENTINEL)

    # Compress completed files.
    compressors = start_workers(num_cpus, "Compressor", CompressCompletedStep,
                                completed_files, (compressed_files,))

    # Export compressed files to S3.
    exporters = start_workers(num_cpus, "Exporter", ExportCompressedStep,
                              compressed_files, (args.output_dir, args.aws_key,
                               args.aws_secret_key, args.publish_bucket,
                               args.dry_run))

    wait_for(raw_readers, "Raw Readers")

    # `find <out_dir> -type f -not -name ".compressme"`
    # Add them to completed_files
    for root, dirs, files in os.walk(args.output_dir):
        for f in files:
            if f.endswith(".log"):
                completed_files.put(os.path.join(root, f))

    for i in range(num_cpus):
        completed_files.put(PipeStep.SENTINEL)

    wait_for(compressors, "Compressors")
    for i in range(num_cpus):
        compressed_files.put(PipeStep.SENTINEL)

    wait_for(exporters, "Exporters")

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        if args.dry_run:
            print "  Dry run, so not really deleting", f
        else:
            print "  Deleting", f
            incoming_bucket.delete_key(f)
            # Delete file locally too.
            os.remove(os.path.join(args.work_dir, f))
    print "Done"

    if len(incoming_queue_messages) > 0:
        print "Removing processed messages from SQS..."
        for m in incoming_queue_messages:
            if args.dry_run:
                print "  Dry run, so not really deleting", m.get_body()
            else:
                print "  Deleting", m.get_body()
                if incoming_queue.delete_message(m):
                    print "  Message deleted successfully"
                else:
                    print "  Failed to delete message :("
        print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs (%.2fs excluding download time)" % (duration, timer.delta_sec(after_download))
    return 0
def main():
    parser = argparse.ArgumentParser(description="Split raw logs into partitioned files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000)
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file):
        record_count += 1
        if err:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(path, errors="replace")
        data = unicode(data, errors="replace")
        bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH
        # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        # print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
                total_count += 1
                if total_count % 5000 == 0:
                    print "Looked at", total_count, "total records in", timer.delta_sec(start_time), "seconds. Last key was", last_key
                try:
                    output_file.write(str(output_func(k)) + "\n")
                except Exception, e:
                    print "Error writing key", k.name, ":", e
                    traceback.print_exc()
            done = True
        except socket.error, e:
            print "Error listing keys:", e
            traceback.print_exc()
            print "Continuing from last seen key:", last_key
    output_file.close()
    print "Overall, listed", total_count, "in", timer.delta_sec(start_time), "seconds"


def main():
    parser = argparse.ArgumentParser(description="List S3 contents (with retry) to a file")
    parser.add_argument("--output-file", type=argparse.FileType('w'))
    parser.add_argument("--bucket", default="telemetry-published-v2")
    parser.add_argument("--prefix", default="")
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    if args.debug:
        boto.set_stream_logger('boto')

    list_files(args.bucket, args.output_file, prefix=args.prefix)
                                              "Conversion Error", "invalid_control_char")
                    else:
                        # TODO: Recognize other common failure modes and handle
                        # them gracefully.
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                              "Conversion Error", "conversion_error")
                        self.log(traceback.format_exc())

                if self.print_stats:
                    this_update = now()
                    sec = timer.delta_sec(self.last_update, this_update)
                    if sec > 10.0:
                        self.last_update = this_update
                        self.log(self.stats.get_summary())

            duration = timer.delta_sec(start, now())
            mb_read = bytes_read / 1024.0 / 1024.0
            # Stats for the current file:
            self.log("Read %d records %.2fMB in %.2fs (%.2fMB/s)" % (record_count, mb_read, duration, mb_read / duration))
        except Exception, e:
            # Corrupted data, let's skip this record.
            self.log("Error reading raw data from {0} {1}\n{2}".format(raw_file, e, traceback.format_exc()))

    def write_bad_record(self, key, dims, data, error, message=None, bad_record_type=None):
        try:
            channel = self.schema.get_field(dims, "appUpdateChannel", True, True)
                    continue
                exp_count += 1
                total_bytes += k.size
                last_key = k.name
                if total_count % 100 == 0:
                    logger.debug("Expired {} of {} total files in {}s. Last key was {}".format(
                        exp_count, total_count, timer.delta_sec(start_time), last_key))
                logger.info("Deleting {} from S3 bucket".format(k.name))
                sql_update = "DELETE FROM published_files WHERE file_name = '{0}';".format(k.name)
                if should_run(args.dry_run, logger, "Deleting from S3 bucket"):
                    k.delete()
                if should_run(args.dry_run, logger, "Notifying coordinator"):
                    db_cursor.execute(sql_update)
                    db_conn.commit()
                    logger.debug("Coordinator notified")
            done = True
        except socket.error, e:
            logger.error("Error listing keys: {}".format(e))
            logger.error(traceback.format_exc())
            logger.debug("Continuing from last seen key: {}".format(last_key))

    if db_conn is not None:
        db_conn.close()

    total_mb = round(total_bytes / 1024.0 / 1024.0, 2)
    logger.info("Overall, expired {} of {} files ({} MB) in {} seconds.".format(
        exp_count, total_count, total_mb, timer.delta_sec(start_time)))
    return 0


if __name__ == "__main__":
    sys.exit(main())
def main():
    parser = argparse.ArgumentParser(description='Split raw logs into partitioned files.', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000)
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-f", "--file-version", help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()

    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(r.timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")
        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[file_version]
        # print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        # print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
                    # We don't need to write these bad records out - we know
                    # why they are being skipped.
                    if err_message != "Missing in payload: info.revision":
                        # TODO: recognize other common failure modes and handle them gracefully.
                        self.write_bad_record(key, dims, data, err_message, "Conversion Error:")
                        traceback.print_exc()

                if self.print_stats:
                    this_update = datetime.now()
                    sec = timer.delta_sec(self.last_update, this_update)
                    if sec > 10.0:
                        self.last_update = this_update
                        self.end_time = datetime.now()
                        self.dump_stats()

            duration = timer.delta_sec(start)
            mb_read = bytes_read / 1024.0 / 1024.0
            # Stats for the current file:
            print self.label, "- Read %d records %.2fMB in %.2fs (%.2fMB/s)" % (record_count, mb_read, duration, mb_read / duration)
        except Exception, e:
            # Corrupted data, let's skip this record.
            print self.label, "- Error reading raw data from ", raw_file, e
            traceback.print_exc()

    def write_bad_record(self, key, dims, data, error, message=None):
        self.bad_records += 1
        if message is not None:
            print self.label, message, error
        if self.bad_filename is not None:
            try:
                path = u"/".join([key] + dims)
def run_mapper( self, mapper_id, inputs, work_dir, module, partition_count, delete_files, aws_key, aws_secret_key, s3_bucket ): self.work_dir = work_dir print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete." bytes_total = sum([f.size for f in inputs]) bytes_completed = 0 next_notice_pct = 5 start = datetime.now() loader = None output_file = os.path.join(work_dir, "mapper_" + str(mapper_id)) mapfunc = getattr(module, "map", None) context = Context(output_file, partition_count) if not callable(mapfunc): print "No map function!!!" sys.exit(1) for input_file in inputs: if input_file.remote: # Lazy load the loader (so we don't do it on "local only" jobs). if loader is None: loader = s3util.Loader( os.path.join(self.work_dir, "cache"), s3_bucket, aws_key=aws_key, aws_secret_key=aws_secret_key, poolsize=1, ) for local, remote, err in loader.get_list([input_file.name]): if err is not None: print "Failed to download", remote, ":", err line_num = 0 full_filename = os.path.join(self.work_dir, "cache", input_file.name) for r, _ in heka_message.unpack_file(full_filename): msg = heka_message_parser.parse_heka_record(r) line_num += 1 try: mapfunc(msg["meta"]["documentId"], msg, context) except ValueError, e: # TODO: increment "bad line" metrics. print "Bad record:", input_file.name, ":", line_num, e if delete_files: os.remove(full_filename) bytes_completed += input_file.size completed_pct = (float(bytes_completed) / bytes_total) * 100 if completed_pct >= next_notice_pct: next_notice_pct += 5 duration_sec = timer.delta_sec(start) completed_mb = float(bytes_completed) / 1024.0 / 1024.0 print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % ( mapper_id, completed_pct, completed_mb, duration_sec, completed_mb / duration_sec, )
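# Hedged sketch of the job-module interface run_mapper expects: a module-level
# "map" function invoked as map(documentId, parsed_heka_message, context). The
# context.write(key, value) call and the appUpdateChannel field are assumptions
# for illustration; only the map() call signature is visible above.
def map(key, value, context):
    # Count submissions per update channel (field name is illustrative).
    channel = value.get("meta", {}).get("appUpdateChannel", "UNKNOWN")
    context.write(channel, 1)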
if should_run(args.dry_run, logger, "Deleting from source bucket"): k.delete() else: logger.info( "Not deleting source: either non-empty or same bucket: {}" .format(k.name)) if sql_update is None: logger.error("Missing sql_update :(") else: logger.info(sql_update) if should_run(args.dry_run, logger, "Notifying coordinator"): #TODO logger.debug("Should be actually notifying coordinator") done = True except socket.error, e: logger.error("Error listing keys: {}".format(e)) logger.error(traceback.format_exc()) logger.info("Continuing from last seen key: {}".format(last_key)) total_mb = round(total_bytes / 1024.0 / 1024.0, 2) logger.info("Total bytes: {}".format(total_bytes)) logger.info("Overall, listed {} files ({} MB) in {} seconds.".format( total_count, total_mb, timer.delta_sec(start_time))) return 0 if __name__ == "__main__": sys.exit(main())
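# Hedged guess at the should_run(dry_run, logger, action) helper used above:
# judging by its call sites, it returns True when the action should actually be
# performed and logs what would have happened otherwise. The exact log wording
# here is an assumption.
def should_run(dry_run, logger, action):
    if dry_run:
        logger.info("Dry run: skipping '{0}'".format(action))
        return False
    logger.info(action)
    return True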
def main(): signal.signal(signal.SIGINT, handle_sigint) parser = argparse.ArgumentParser( description='Process incoming Telemetry data', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-c", "--config", required=True, type=file, help="AWS Configuration file (json)") parser.add_argument("-w", "--work-dir", required=True, help="Location to cache downloaded files") parser.add_argument("-o", "--output-dir", required=True, help="Base dir to store processed data") parser.add_argument("-i", "--input-files", type=file, help="File containing a list of keys to process") parser.add_argument("-b", "--bad-data-log", help="Save bad records to this file") parser.add_argument("-l", "--log-file", help="Log output to this file") parser.add_argument("-s", "--stats-file", help="Log statistics to this file") parser.add_argument("--histogram-cache-path", default="./histogram_cache", help="Path to store a local cache of histograms") parser.add_argument("-t", "--telemetry-schema", required=True, help="Location of the desired telemetry schema") parser.add_argument("-m", "--max-output-size", metavar="N", type=int, default=500000000, help="Rotate output files after N bytes") parser.add_argument("-D", "--dry-run", action="store_true", help="Don't modify remote files") parser.add_argument("-v", "--verbose", action="store_true", help="Print more detailed output") args = parser.parse_args() if args.verbose: # Turn on mp logging multiprocessing.log_to_stderr(logging.DEBUG) config = json.load(args.config) # TODO: allow commandline args to override config values. if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir) schema_data = open(args.telemetry_schema) schema = TelemetrySchema(json.load(schema_data)) schema_data.close() cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org") converter = Converter(cache, schema) storage = StorageLayout(schema, args.output_dir, args.max_output_size) logger = Log(args.log_file, "Master") num_cpus = multiprocessing.cpu_count() conn = None incoming_bucket = None incoming_queue = None s3downloader = None raw_readers = None compressors = None exporters = None done = False if not args.dry_run: # Set up AWS connections conn = S3Connection(config.get("aws_key", None), config.get( "aws_secret_key", None)) incoming_bucket = conn.get_bucket(config["incoming_bucket"]) q_conn = boto.sqs.connect_to_region(config.get("aws_region", None), aws_access_key_id=config.get("aws_key", None), aws_secret_access_key=config.get("aws_secret_key", None)) incoming_queue = q_conn.get_queue(config["incoming_queue"]) if incoming_queue is None: logger.log("Error: could not get queue " + config["incoming_queue"]) return -2 logger.log("Verifying that we can write to " + config["publish_bucket"]) try: publish_bucket = conn.get_bucket(config["publish_bucket"]) logger.log("Looks good!") except S3ResponseError: logger.log("Bucket {0} not found. Attempting to create it.".format( config["publish_bucket"])) publish_bucket = conn.create_bucket(config["publish_bucket"]) s3downloader = s3util.Loader(args.work_dir, config["incoming_bucket"], poolsize=num_cpus, aws_key=config.get("aws_key", None), aws_secret_key=config.get("aws_secret_key", None)) while not done: if args.dry_run: done = True try: start = now() incoming_filenames = [] incoming_queue_messages = [] logger.log("Fetching file list from queue " + config["incoming_queue"]) if args.dry_run: logger.log("Dry run mode... 
can't read from the queue " \ "without messing things up...") if args.input_files: logger.log("Fetching file list from file {}".format( args.input_files)) incoming_filenames = [ l.strip() for l in args.input_files.readlines() ] else: # Sometimes we don't get all the messages, even if more are # available, so keep trying until we have enough (or there # aren't any left) for i in range(num_cpus): messages = incoming_queue.get_messages(num_cpus - len(incoming_filenames)) for m in messages: # Make sure this file exists in S3 first possible_filename = m.get_body() key = incoming_bucket.get_key(possible_filename) if key is None: logger.log("Could not find queued filename in" \ " bucket {0}: {1}".format( config["incoming_bucket"], possible_filename)) # try to delete it: incoming_queue.delete_message(m) else: incoming_filenames.append(possible_filename) incoming_queue_messages.append(m) if len(messages) == 0 or len(incoming_filenames) >= num_cpus: break logger.log("Done") if len(incoming_filenames) == 0: logger.log("Nothing to do! Sleeping...") time.sleep(5) continue for f in incoming_filenames: logger.log(" " + f) before_download = now() logger.log("Downloading {0} files...".format(len(incoming_filenames))) local_filenames = [] download_stats = Stats("Downloader", args.stats_file, logger) if args.dry_run: logger.log("Dry run mode: skipping download from S3") local_filenames = [ os.path.join(args.work_dir, f) for f in incoming_filenames ] else: for local_filename, remote_filename, err in s3downloader.get_list(incoming_filenames): if err is None: local_filenames.append(local_filename) else: # s3downloader already retries 3 times. logger.log("Error downloading {0} Error: {1}".format( local_filename, err)) download_stats.increment( records_read=len(incoming_filenames), records_written=len(local_filenames), bad_records=1) download_stats.save() return 2 downloaded_bytes = sum([os.path.getsize(f) for f in local_filenames]) download_stats.increment(records_read=len(incoming_filenames), records_written=len(local_filenames), bytes_read=downloaded_bytes, bytes_written=downloaded_bytes) logger.log(download_stats.get_summary()) download_stats.save() after_download = now() raw_files = Queue() for l in local_filenames: raw_files.put(l) completed_files = Queue() # Begin reading raw input raw_readers = start_workers(logger, num_cpus, "Reader", ReadRawStep, raw_files, (completed_files, args.log_file, args.stats_file, schema, converter, storage, args.bad_data_log)) # Tell readers to stop when they get to the end: finish_queue(raw_files, num_cpus) # Compress completed files. compressors = start_workers(logger, num_cpus, "Compressor", CompressCompletedStep, completed_files, (None, args.log_file, args.stats_file)) wait_for(logger, raw_readers, "Raw Readers") # `find <out_dir> -type f -not -name ".compressme"` # Add them to completed_files for root, dirs, files in os.walk(args.output_dir): for f in files: if f.endswith(".log"): completed_files.put(os.path.join(root, f)) # Tell compressors to stop: finish_queue(completed_files, num_cpus) wait_for(logger, compressors, "Compressors") shutdown_requested = False try: # Export compressed files to S3. 
compressed_files = Queue() exporters = start_workers(logger, num_cpus, "Exporter", ExportCompressedStep, compressed_files, (args.log_file, args.stats_file, args.output_dir, config, args.dry_run)) for root, dirs, files in os.walk(args.output_dir): for f in files: if f.endswith(StorageLayout.COMPRESSED_SUFFIX): compressed_files.put(os.path.join(root, f)) finish_queue(compressed_files, num_cpus) wait_for(logger, exporters, "Exporters") except InterruptProcessingError, e: logger.log("Received shutdown request... waiting for " \ "exporters to finish") shutdown_requested = True shutdown_stats = Stats("ShutdownDuringExport", args.stats_file, logger) shutdown_stats.increment(records_read=1) shutdown_stats.save() done = True wait_for(logger, exporters, "Exporters") logger.log("OK, cleaning up") logger.log("Removing processed logs from S3...") for f in incoming_filenames: if args.dry_run: logger.log(" Dry run, so not really deleting " + f) else: logger.log(" Deleting " + f) incoming_bucket.delete_key(f) # Delete file locally too. os.remove(os.path.join(args.work_dir, f)) logger.log("Done") if len(incoming_queue_messages) > 0: logger.log("Removing processed messages from SQS...") for m in incoming_queue_messages: if args.dry_run: logger.log(" Dry run, so not really deleting " \ "{0}".format(m.get_body())) else: logger.log(" Deleting {0}".format(m.get_body())) if incoming_queue.delete_message(m): logger.log(" Message deleted successfully") else: logger.log(" Failed to delete message :(") logger.log("Done") if shutdown_requested: shutdown_stats.increment(records_written=1) shutdown_stats.save() all_done = now() duration = timer.delta_sec(start, all_done) logger.log("All done in %.2fs (%.2fs excluding download time)" % ( duration, timer.delta_sec(after_download, all_done))) except InterruptProcessingError, e: logger.log("Received normal shutdown request... quittin' time!") if raw_readers is not None: terminate(logger, raw_readers, "Readers") if compressors is not None: terminate(logger, compressors, "Compressors") if exporters is not None: terminate(logger, exporters, "Exporters") done = True
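# Hedged sketch of how handle_sigint and InterruptProcessingError (both
# referenced but not defined in this excerpt) likely fit together: SIGINT is
# turned into an exception so the main loop can finish the batch in flight and
# shut down cleanly. The details below are assumptions.
class InterruptProcessingError(Exception):
    pass

def handle_sigint(signum, frame):
    raise InterruptProcessingError("Received signal {0}".format(signum))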
class ReadRawStep(PipeStep): def __init__(self, num, name, raw_files, completed_files, schema, converter, storage, bad_filename): self.schema = schema self.converter = converter self.storage = storage self.bad_filename = bad_filename PipeStep.__init__(self, num, name, raw_files, completed_files) def setup(self): self.expected_dim_count = len(self.schema._dimensions) def handle(self, raw_file): print self.label, "reading", raw_file try: record_count = 0 bytes_read = 0 start = datetime.now() for len_path, len_data, timestamp, path, data, err in fileutil.unpack( raw_file): record_count += 1 self.records_read += 1 if err: print self.label, "ERROR: Found corrupted data for record", record_count, "in", raw_file, "path:", path, "Error:", err self.bad_records += 1 continue if len(data) == 0: print self.label, "ERROR: Found empty data for record", record_count, "in", raw_file, "path:", path self.bad_records += 1 continue # Incoming timestamps are in milliseconds, so convert to POSIX first # (ie. seconds) submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d") path = fileutil.to_unicode(path) #print "Path for record", record_count, path, "length of data:", len_data if data[0] != "{": # Data looks weird, should be JSON. print self.label, "Warning: Found unexpected data for record", record_count, "in", raw_file, "path:", path, "data:" print data else: # Raw JSON, make sure we treat it as unicode. data = fileutil.to_unicode(data) current_bytes = len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH[ "v1"] bytes_read += current_bytes self.bytes_read += current_bytes path_components = path.split("/") if len(path_components) != self.expected_dim_count: # We're going to pop the ID off, but we'll also add the # submission date, so it evens out. print self.label, "Found an invalid path in record", record_count, path continue key = path_components.pop(0) info = {} info["reason"] = path_components.pop(0) info["appName"] = path_components.pop(0) info["appVersion"] = path_components.pop(0) info["appUpdateChannel"] = path_components.pop(0) info["appBuildID"] = path_components.pop(0) dims = self.schema.dimensions_from(info, submission_date) try: # Convert data: if self.converter is None: serialized_data = data data_version = 1 else: parsed_data, parsed_dims = self.converter.convert_json( data, dims[-1]) # TODO: take this out if it's too slow for i in range(len(dims)): if dims[i] != parsed_dims[i]: print self.label, "Record", self.records_read, "mismatched dimension", i, dims[ i], "!=", parsed_dims[i] serialized_data = self.converter.serialize(parsed_data) dims = parsed_dims data_version = 2 try: # Write to persistent storage n = self.storage.write(key, serialized_data, dims, data_version) self.bytes_written += len(key) + len( serialized_data) + 1 self.records_written += 1 # Compress rotated files as we generate them if n.endswith( StorageLayout.PENDING_COMPRESSION_SUFFIX): self.q_out.put(n) except Exception, e: self.write_bad_record(key, dims, serialized_data, str(e), "ERROR Writing to output file:") except BadPayloadError, e: self.write_bad_record(key, dims, data, e.msg, "Bad Payload:") except Exception, e: err_message = str(e) # We don't need to write these bad records out - we know # why they are being skipped. if err_message != "Missing in payload: info.revision": # TODO: recognize other common failure modes and handle them gracefully. 
self.write_bad_record(key, dims, data, err_message, "Conversion Error:") traceback.print_exc() if self.print_stats: this_update = datetime.now() sec = timer.delta_sec(self.last_update, this_update) if sec > 10.0: self.last_update = this_update self.end_time = datetime.now() self.dump_stats()
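# Hedged sketch of BadPayloadError, which ReadRawStep.handle catches above and
# reads .msg from. Presumably it is a simple exception raised when a payload
# fails validation or conversion; this stub only mirrors that usage.
class BadPayloadError(Exception):
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg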
def main(): parser = argparse.ArgumentParser( description='Process incoming Telemetry data', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("incoming_bucket", help="The S3 bucket containing incoming files") parser.add_argument("publish_bucket", help="The S3 bucket to save processed files") parser.add_argument("-n", "--num-helpers", metavar="N", help="Start N helper processes", type=int, default=1) parser.add_argument("-k", "--aws-key", help="AWS Key", required=True) parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", required=True) parser.add_argument("-w", "--work-dir", help="Location to cache downloaded files", required=True) parser.add_argument("-o", "--output-dir", help="Base dir to store processed data", required=True) parser.add_argument("-i", "--input-files", help="File containing a list of keys to process", type=file) parser.add_argument("-t", "--telemetry-schema", help="Location of the desired telemetry schema", required=True) args = parser.parse_args() # TODO: keep track of partial success so that subsequent runs are idempotent. start = datetime.now() conn = S3Connection(args.aws_key, args.aws_secret_key) incoming_bucket = conn.get_bucket(args.incoming_bucket) incoming_filenames = [] if args.input_files: print "Fetching file list from file", args.input_files incoming_filenames = [l.strip() for l in args.input_files.readlines()] else: print "Fetching file list from S3..." for f in incoming_bucket.list(): incoming_filenames.append(f.name) print "Done" for f in incoming_filenames: print " ", f result = 0 print "Downloading", len(incoming_filenames), "files..." result = fetch_s3_files(incoming_filenames, args.work_dir, args.incoming_bucket, args.aws_key, args.aws_secret_key) if result != 0: print "Error downloading files. Return code of s3funnel was", result return result print "Done" print "Splitting raw logs..." local_filenames = [ os.path.join(args.work_dir, f) for f in incoming_filenames ] result = split_raw_logs(local_filenames, args.output_dir, args.telemetry_schema) if result != 0: print "Error splitting logs. Return code was", result return result print "Done" print "Converting split logs..." result = convert_split_logs(args.output_dir) if result != 0: print "Error converting logs. Return code was", result return result print "Done" print "Exporting converted logs back to S3..." result = export_converted_logs(args.output_dir, args.publish_bucket, args.aws_key, args.aws_secret_key) if result != 0: print "Error exporting logs. Return code was", result return result print "Done" print "Removing processed logs from S3..." for f in incoming_filenames: print " Deleting", f incoming_bucket.delete_key(f) print "Done" duration = timer.delta_sec(start) print "All done in %.2fs" % (duration) return 0
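# Hedged sketch of fetch_s3_files as implied by its call sites: it appears to
# shell out to s3funnel to GET the listed keys into work_dir and return the
# subprocess exit code. The exact s3funnel flags below are assumptions; only
# S3FUNNEL_PATH and the "return code of s3funnel" behaviour are visible in the
# surrounding code.
import subprocess

def fetch_s3_files(filenames, work_dir, bucket, aws_key, aws_secret_key):
    if len(filenames) == 0:
        return 0
    cmd = [S3FUNNEL_PATH, bucket, "get",
           "--aws_key", aws_key,
           "--aws_secret_key", aws_secret_key] + list(filenames)
    return subprocess.call(cmd, cwd=work_dir)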
def main(): args = get_args() logging.basicConfig() logger = logging.getLogger(__name__) if args.verbose: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) logger.info("Expiring `flash_video` data older than {}.".format(args.expiry_date)) logger.debug("Connecting to S3...") conn = S3Connection(args.aws_key, args.aws_secret_key) bucket = conn.get_bucket(args.bucket) connection_string = "" if hasattr(args, "db_name"): connection_string += "dbname={0} ".format(args.db_name) if hasattr(args, "db_host"): connection_string += "host={0} ".format(args.db_host) if hasattr(args, "db_port"): connection_string += "port={0} ".format(args.db_port) if hasattr(args, "db_user"): connection_string += "user={0} ".format(args.db_user) if hasattr(args, "db_pass"): connection_string += "password={0} ".format(args.db_pass) db_conn = None db_cursor = None if should_run(args.dry_run, logger, "Connecting to database"): db_conn = psycopg2.connect(connection_string) db_cursor = db_conn.cursor() prefix = args.prefix last_key = '' done = False total_count = 0 exp_count = 0 total_bytes = 0 start_time = datetime.now() while not done: try: for k in bucket.list(prefix=prefix, marker=last_key): if k.name.endswith('/'): logger.debug("Skipping directory '{}'".format(k.name)) continue total_count += 1 if not should_expire(k.name, args.expiry_date, logger): continue exp_count += 1 total_bytes += k.size last_key = k.name if total_count % 100 == 0: logger.debug("Expired {} of {} total files in {}s. Last key was {}".format( exp_count, total_count, timer.delta_sec(start_time), last_key)) logger.info("Deleting {} from S3 bucket".format(k.name)) sql_update = "DELETE FROM published_files WHERE file_name = '{0}';".format(k.name) if should_run(args.dry_run, logger, "Deleting from S3 bucket"): k.delete() if should_run(args.dry_run, logger, "Notifying coordinator"): db_cursor.execute(sql_update) db_conn.commit() logger.debug("Coordinator notified") done = True except socket.error, e: logger.error("Error listing keys: {}".format(e)) logger.error(traceback.format_exc()) logger.debug("Continuing from last seen key: {}".format(last_key))
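# Hedged sketch of should_expire(key_name, expiry_date, logger): the loop above
# suggests it extracts a date from the key name and compares it against the
# expiry cutoff. The key layout (a YYYYMMDD component embedded in the name) is
# purely illustrative.
import re

def should_expire(key_name, expiry_date, logger):
    m = re.search(r"(\d{8})", key_name)
    if m is None:
        logger.debug("No date found in {0}; not expiring".format(key_name))
        return False
    # Assumes expiry_date is also a YYYYMMDD string.
    return m.group(1) < expiry_date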
def main(): parser = argparse.ArgumentParser(description='Process incoming Telemetry data', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("incoming_bucket", help="The S3 bucket containing incoming files") parser.add_argument("publish_bucket", help="The S3 bucket to save processed files") parser.add_argument("-k", "--aws-key", help="AWS Key", required=True) parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", required=True) parser.add_argument("-r", "--aws-region", help="AWS Region", default="us-west-2") parser.add_argument("-w", "--work-dir", help="Location to cache downloaded files", required=True) parser.add_argument("-o", "--output-dir", help="Base dir to store processed data", required=True) parser.add_argument("-i", "--input-files", help="File containing a list of keys to process", type=file) parser.add_argument("-b", "--bad-data-log", help="Save bad records to this file") parser.add_argument("-q", "--queue", help="SQS Queue name to poll for incoming data") parser.add_argument("-c", "--histogram-cache-path", help="Path to store a local cache of histograms", default="./histogram_cache") parser.add_argument("-t", "--telemetry-schema", help="Location of the desired telemetry schema", required=True) parser.add_argument("-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000) parser.add_argument("-D", "--dry-run", help="Don't modify remote files", action="store_true") parser.add_argument("-C", "--skip-conversion", help="Skip validation/conversion of payloads", action="store_true") args = parser.parse_args() if not os.path.isfile(S3FUNNEL_PATH): print "ERROR: s3funnel not found at", S3FUNNEL_PATH print "You can get it from github: https://github.com/sstoiana/s3funnel" return -1 if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir) schema_data = open(args.telemetry_schema) schema = TelemetrySchema(json.load(schema_data)) schema_data.close() cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org") if args.skip_conversion: converter = None else: converter = Converter(cache, schema) storage = StorageLayout(schema, args.output_dir, args.max_output_size) num_cpus = multiprocessing.cpu_count() start = datetime.now() conn = None incoming_bucket = None incoming_queue = None incoming_queue_messages = [] if not args.dry_run: conn = S3Connection(args.aws_key, args.aws_secret_key) incoming_bucket = conn.get_bucket(args.incoming_bucket) incoming_filenames = [] if args.queue is not None: print "Fetching file list from queue", args.queue if args.dry_run: print "Dry run mode... can't read from the queue without messing things up..." else: q_conn = boto.sqs.connect_to_region(args.aws_region, aws_access_key_id=args.aws_key, aws_secret_access_key=args.aws_secret_key) incoming_queue = q_conn.get_queue(args.queue) if incoming_queue is None: print "Error: could not get queue", args.queue return -2 # Sometimes we don't get all the messages, even if more are # available, so keep trying until we have enough (or there aren't # any left) for i in range(num_cpus): messages = incoming_queue.get_messages(num_cpus - len(incoming_filenames)) for m in messages: # TODO: Make sure this file exists in S3 first? 
possible_filename = m.get_body() key = incoming_bucket.get_key(possible_filename) if key is None: print "Could not find queued filename in bucket", args.incoming_bucket, ":", possible_filename # try to delete it: incoming_queue.delete_message(m) else: incoming_filenames.append(possible_filename) incoming_queue_messages.append(m) if len(messages) == 0 or len(incoming_filenames) >= num_cpus: break elif args.input_files: print "Fetching file list from file", args.input_files incoming_filenames = [ l.strip() for l in args.input_files.readlines() ] else: print "Fetching file list from S3..." for f in incoming_bucket.list(): incoming_filenames.append(f.name) print "Done" if len(incoming_filenames) == 0: print "Nothing to do!" return 0 for f in incoming_filenames: print " ", f print "Verifying that we can write to", args.publish_bucket if args.dry_run: print "Dry run mode: don't care!" else: try: publish_bucket = conn.get_bucket(args.publish_bucket) print "Looks good!" except S3ResponseError: print "Bucket", args.publish_bucket, "not found. Attempting to create it." publish_bucket = conn.create_bucket(args.publish_bucket) result = 0 print "Downloading", len(incoming_filenames), "files..." if args.dry_run: print "Dry run mode: skipping download from S3" else: result = fetch_s3_files(incoming_filenames, args.work_dir, incoming_bucket, args.aws_key, args.aws_secret_key) if result != 0: print "Error downloading files. Return code of s3funnel was", result return result print "Done" after_download = datetime.now() local_filenames = [os.path.join(args.work_dir, f) for f in incoming_filenames] # TODO: try a SimpleQueue raw_files = Queue() for l in local_filenames: raw_files.put(l) completed_files = Queue() compressed_files = Queue() # Begin reading raw input raw_readers = start_workers(num_cpus, "Reader", ReadRawStep, raw_files, (completed_files, schema, converter, storage, args.bad_data_log)) # Tell readers when to stop: for i in range(num_cpus): raw_files.put(PipeStep.SENTINEL) # Compress completed files. compressors = start_workers(num_cpus, "Compressor", CompressCompletedStep, completed_files, (compressed_files,)) # Export compressed files to S3. exporters = start_workers(num_cpus, "Exporter", ExportCompressedStep, compressed_files, (args.output_dir, args.aws_key, args.aws_secret_key, args.publish_bucket, args.dry_run)) wait_for(raw_readers, "Raw Readers") # `find <out_dir> -type f -not -name ".compressme"` # Add them to completed_files for root, dirs, files in os.walk(args.output_dir): for f in files: if f.endswith(".log"): completed_files.put(os.path.join(root, f)) for i in range(num_cpus): completed_files.put(PipeStep.SENTINEL) wait_for(compressors, "Compressors") for i in range(num_cpus): compressed_files.put(PipeStep.SENTINEL) wait_for(exporters, "Exporters") print "Removing processed logs from S3..." for f in incoming_filenames: if args.dry_run: print " Dry run, so not really deleting", f else: print " Deleting", f incoming_bucket.delete_key(f) # Delete file locally too. os.remove(os.path.join(args.work_dir, f)) print "Done" if len(incoming_queue_messages) > 0: print "Removing processed messages from SQS..." 
for m in incoming_queue_messages: if args.dry_run: print " Dry run, so not really deleting", m.get_body() else: print " Deleting", m.get_body() if incoming_queue.delete_message(m): print " Message deleted successfully" else: print " Failed to delete message :(" print "Done" duration = timer.delta_sec(start) print "All done in %.2fs (%.2fs excluding download time)" % (duration, timer.delta_sec(after_download)) return 0
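# Hedged sketch of the PipeStep worker protocol implied by ReadRawStep above and
# by the SENTINEL values pushed onto the queues: each worker pulls items from
# q_in, calls handle() on each, and exits when it sees the sentinel. How the
# real class is driven (via start_workers / multiprocessing) and how it tracks
# stats (records_read, bad_records, etc.) is not shown here; this is only the
# queue control flow.
class PipeStep(object):
    SENTINEL = "STOP"

    def __init__(self, num, name, q_in, q_out=None):
        self.label = "{0}-{1}".format(name, num)
        self.q_in = q_in
        self.q_out = q_out

    def setup(self):
        pass

    def handle(self, item):
        raise NotImplementedError

    def work(self):
        self.setup()
        while True:
            item = self.q_in.get()
            if item == PipeStep.SENTINEL:
                break
            self.handle(item)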
def main(): args = get_args() logging.basicConfig() logger = logging.getLogger(__name__) if args.verbose: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.WARNING) if not os.path.exists(args.work_dir): os.makedirs(args.work_dir) logger.info("Sanitizing FirefoxOS data from {} and moving it to {}".format( args.source_bucket, args.dest_bucket)) logger.debug("Connecting to S3...") conn = S3Connection(args.aws_key, args.aws_secret_key) source_bucket = conn.get_bucket(args.source_bucket) dest_bucket = conn.get_bucket(args.dest_bucket) compress_cmd = [StorageLayout.COMPRESS_PATH ] + StorageLayout.COMPRESSION_ARGS prefix = args.prefix last_key = '' done = False total_count = 0 total_bytes = 0 start_time = datetime.now() dupe_map = {} while not done: try: for k in source_bucket.list(prefix=prefix, marker=last_key): if k.name.endswith('/'): logger.debug("Skipping directory '{}'".format(k.name)) continue if skip_by_date(k.name, args.min_date, logger): logger.debug("Skipping file older than {}: {}".format( args.min_date, k.name)) continue total_count += 1 total_bytes += k.size last_key = k.name if total_count % 100 == 0: logger.info( "Looked at {} total records in {} seconds. Last key was {}" .format(total_count, timer.delta_sec(start_time), last_key)) logger.debug("Fetching {} from source bucket".format(k.name)) full_source_filename = os.path.join(args.work_dir, "__source", k.name) full_dest_filename = os.path.join(args.work_dir, "__dest", k.name) # Ensure that the necessary local dirs exist: for f in [full_source_filename, full_dest_filename]: dirname = os.path.dirname(f) if dirname != '' and not os.path.exists(dirname): os.makedirs(dirname) logger.debug("Getting '{}' to '{}'".format( k.name, full_source_filename)) k.get_contents_to_filename(full_source_filename) logger.info("Removing pingIDs...") tmp_out_file = full_dest_filename + ".tmp" out_handle = open(tmp_out_file, "w") logger.debug("Uncompressing...") if full_source_filename.endswith( StorageLayout.COMPRESSED_SUFFIX): decompress_cmd = [StorageLayout.COMPRESS_PATH ] + StorageLayout.DECOMPRESSION_ARGS raw_handle = open(full_source_filename, "rb") # Popen the decompressing version of StorageLayout.COMPRESS_PATH p_decompress = Popen(decompress_cmd, bufsize=65536, stdin=raw_handle, stdout=PIPE, stderr=sys.stderr) handle = p_decompress.stdout else: handle = open(full_source_filename, "r") raw_handle = None logger.debug("Generating new pingIDs...") for line in handle: # Lines are of the form <key><tab><json payload><newline>. # Split on tab character to get the pieces. key, payload = line.split(u"\t", 1) # Replace key with a fresh UUID: if key in dupe_map: logger.info( "Already saw key {}, skipping any more occurrences" .format(key)) else: new_key = str(uuid4()) dupe_map[key] = new_key out_handle.write(u"%s\t%s" % (new_key, payload)) handle.close() out_handle.close() if raw_handle: raw_handle.close() sql_update = None empty_result = False if os.stat(tmp_out_file).st_size > 0: logger.debug("Compressing new file...") f_comp = open(full_dest_filename, "wb") f_raw = open(tmp_out_file, "r", 1) p_compress = Popen(compress_cmd, bufsize=65536, stdin=f_raw, stdout=f_comp, stderr=sys.stderr) p_compress.communicate() f_raw.close() f_comp.close() local_md5, size = fu.md5file(full_dest_filename) sql_update = "UPDATE published_files SET " \ "file_md5 = '{0}', " \ "file_size = {1}, " \ "bucket_name = '{2}' " \ "WHERE file_name = '{3}';".format(local_md5, size, dest_bucket.name, k.name) else: # Don't upload empty files. 
empty_result = True sql_update = "DELETE FROM published_files WHERE file_name = '{0}';".format( k.name) logger.debug( "File was empty, skipping: {}".format(tmp_out_file)) logger.info( "Removing temp output file: {}".format(tmp_out_file)) os.remove(tmp_out_file) if not empty_result and should_run(args.dry_run, logger, "Uploading to dest bucket"): dest_key = dest_bucket.new_key(k.name) dest_key.set_contents_from_filename(full_dest_filename) # Compare the md5 to be sure it succeeded. dest_md5 = dest_key.etag[1:-1] local_md5, size = fu.md5file(full_dest_filename) if dest_md5 != local_md5: raise Exception( "Failed to upload {}".format(full_dest_filename)) if should_run( args.dry_run, logger, "Removing input file: {}".format( full_source_filename)): os.remove(full_source_filename) if not empty_result and should_run( args.dry_run, logger, "Removing output file: {}".format(full_dest_filename)): os.remove(full_dest_filename) if empty_result or args.source_bucket != args.dest_bucket: if should_run(args.dry_run, logger, "Deleting from source bucket"): k.delete() else: logger.info( "Not deleting source: either non-empty or same bucket: {}" .format(k.name)) if sql_update is None: logger.error("Missing sql_update :(") else: logger.info(sql_update) if should_run(args.dry_run, logger, "Notifying coordinator"): #TODO logger.debug("Should be actually notifying coordinator") done = True except socket.error, e: logger.error("Error listing keys: {}".format(e)) logger.error(traceback.format_exc()) logger.info("Continuing from last seen key: {}".format(last_key))
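# Hedged sketch of fu.md5file (fileutil.md5file), which the upload check above
# uses to get both a hex digest and the file size for comparison against the S3
# etag. The chunk size is arbitrary; only the (md5_hex, size) return shape is
# implied by the call sites.
import hashlib
import os

def md5file(filename):
    md5 = hashlib.md5()
    size = os.path.getsize(filename)
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            md5.update(chunk)
    return md5.hexdigest(), size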