def export_batch(self, data_dir, conn, bucket, files):
        print self.label, "Uploading", ",".join(files)
        if self.dry_run:
            return 0

        # Time the s3funnel call:
        start = datetime.now()
        result = subprocess.call(self.s3f_cmd + files, cwd=data_dir)
        sec = timer.delta_sec(start)

        total_size = 0
        if result == 0:
            # Success! Verify each file's checksum, then truncate it.
            for f in files:
                # Verify checksum and track cumulative size so we can figure out MB/s
                full_filename = os.path.join(data_dir, f)
                md5, size = fileutil.md5file(full_filename)
                total_size += size
                # f is the key name - it does not include the full path to the
                # data dir.
                key = bucket.get_key(f)
                # Strip quotes from md5
                remote_md5 = key.etag[1:-1]
                if md5 != remote_md5:
                    # TODO: add it to a "failed" queue.
                    print "ERROR: %s failed checksum verification: Local=%s, Remote=%s" % (f, md5, remote_md5)
                    self.bad_records += 1
                    result = -1
                # TODO: else add it to a "succeeded" queue and remove it locally.
        else:
            print "Failed to upload one or more files in the current batch. Error code was", result

        total_mb = float(total_size) / 1024.0 / 1024.0
        print "Transferred %.2fMB in %.2fs (%.2fMB/s)" % (total_mb, sec, total_mb / sec)
        return result
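
Every example in this listing times its work with timer.delta_sec(start[, end]). That helper is not part of the excerpt; a minimal sketch of the assumed behavior (elapsed seconds between two datetimes, defaulting the end point to now) is:

from datetime import datetime

def delta_sec(start, end=None):
    # Assumed behavior: return elapsed wall-clock time in seconds as a float.
    if end is None:
        end = datetime.now()
    return (end - start).total_seconds()
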
def list_files(bucket_name,
               output_file,
               output_func=s3obj_to_string,
               prefix=''):
    s3 = S3Connection()
    bucket = s3.get_bucket(bucket_name)
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    while not done:
        try:
            for k in bucket.list(prefix=prefix, marker=last_key):
                last_key = k.name
                total_count += 1
                if total_count % 5000 == 0:
                    print "Looked at", total_count, "total records in", timer.delta_sec(
                        start_time), "seconds. Last key was", last_key
                try:
                    output_file.write(str(output_func(k)) + "\n")
                except Exception, e:
                    print "Error writing key", k.name, ":", e
                    traceback.print_exc()
            done = True
        except socket.error, e:
            print "Error listing keys:", e
            traceback.print_exc()
            print "Continuing from last seen key:", last_key
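
list_files defaults output_func to s3obj_to_string, which is not defined in this excerpt. A plausible stand-in (the exact fields are an assumption) that renders a boto key as a single tab-separated line:

def s3obj_to_string(key):
    # Hypothetical formatter: key name, size, and ETag (quotes stripped).
    return "\t".join((key.name, str(key.size), key.etag.strip('"')))
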
Example #3
    def fetch_remotes(self, remotes):
        # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
        remote_names = [ r["name"] for r in remotes if r["type"] == "remote" ]

        # TODO: check cache first.
        result = 0
        if len(remote_names) == 0:
            return result

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        loader = s3util.Loader(fetch_cwd, self._bucket_name, aws_key=self._aws_key, aws_secret_key=self._aws_secret_key)
        start = datetime.now()
        downloaded_bytes = 0
        for local, remote, err in loader.get_list(remote_names):
            if err is None:
                print "Downloaded", remote
                downloaded_bytes += os.path.getsize(local)
            else:
                print "Failed to download", remote
                result += 1
        duration_sec = timer.delta_sec(start)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result
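
The remotes argument is a list of dicts; only entries whose "type" is "remote" are fetched. A minimal call might look like this (the job instance and key names are hypothetical):

remotes = [
    {"type": "remote", "name": "example/path/file_1.lzma"},   # hypothetical S3 key
    {"type": "local", "name": "already-on-disk.log"},         # ignored by fetch_remotes
]
failed = job.fetch_remotes(remotes)
print "Failed downloads:", failed
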
def run_mr(filter, output_file, local_only, streaming):

    args = {
        "job_script": "../bucketless_uitour.py",
        "input_filter": filter,
        "num_mappers": 16,
        "num_reducers": 4,
        "data_dir": "../work/cache",
        "work_dir": "../work",
        "output": output_file,
        "bucket": "telemetry-published-v2",
        "local_only": local_only,
        "delete_data": streaming
    }

    if not args["local_only"]:
        if not BOTO_AVAILABLE:
            print "ERROR: The 'boto' library is required except in 'local-only' mode."
            print "       You can install it using `sudo pip install boto`"
            parser.print_help()
            return -2

    job = Job(args)
    start = datetime.now()
    exit_code = 0
    try:
        job.mapreduce()
    except:
        traceback.print_exc(file=sys.stderr)
        exit_code = 2
    duration = timer.delta_sec(start)
    print "All done in %.2fs" % (duration)
    return (exit_code, output_file)
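
A hedged usage sketch for run_mr; the filter path and output name below are placeholders, and the surrounding script is assumed to import sys:

import sys

if __name__ == "__main__":
    exit_code, out_path = run_mr("../uitour_filter.json", "../output/uitour.out",
                                 local_only=False, streaming=True)
    print "Wrote", out_path
    sys.exit(exit_code)
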
Example #5
    def export(self, uploadables):
        if len(uploadables) == 0:
            print "Nothing to do!"
            return 0
        print "Found", len(uploadables), "files"

        fail_count = 0
        start = datetime.now()
        total_size = 0
        for local, remote, err in self.s3loader.put_list(uploadables):
            if err is None:
                # Great Success! Delete it locally.
                total_size += os.path.getsize(local)
                if self.keep_backups:
                    # Keep a copy of the original, just in case.
                    os.rename(local, local + ".uploaded")
                else:
                    os.remove(local)
                # Send a message to SQS
                # TODO: verify that it succeeded.
                self.enqueue_incoming(remote)

            else:
                fail_count += 1
                print "Failed to upload '{0}' to bucket {1} as '{2}':".format(
                    local, self.bucket, remote), err
        sec = timer.delta_sec(start)
        total_mb = float(total_size) / 1024.0 / 1024.0
        print "Transferred %.2fMB in %.2fs (%.2fMB/s)" % (total_mb, sec,
                                                          total_mb / sec)
        # TODO: log the transfer stats properly.

        # Return zero for overall success or the number of failures.
        return fail_count
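
export() notifies SQS via self.enqueue_incoming(remote), which is not shown here. A sketch of what it might do with boto's SQS bindings (the queue name and message shape are assumptions):

import json
from boto.sqs.message import Message

def enqueue_incoming(self, remote_key):
    # Assumption: self.queue is a boto SQS Queue obtained elsewhere, e.g.
    #   SQSConnection(aws_key, aws_secret_key).get_queue("telemetry-incoming")
    m = Message()
    m.set_body(json.dumps({"key": remote_key}))
    # TODO (as noted in export() above): verify that the write succeeded.
    return self.queue.write(m)
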
Example #6
def main():
    parser = argparse.ArgumentParser(description='Run a MapReduce Job.', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("job_script", help="The MapReduce script to run")
    parser.add_argument("-l", "--local-only", help="Only process local files (exclude S3 data)", action="store_true")
    parser.add_argument("-m", "--num-mappers", metavar="N", help="Start N mapper processes", type=int, default=4)
    parser.add_argument("-r", "--num-reducers", metavar="N", help="Start N reducer processes", type=int, default=1)
    parser.add_argument("-d", "--data-dir", help="Base data directory", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key", default=None)
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", default=None)
    parser.add_argument("-w", "--work-dir", help="Location to put temporary work files", default="/tmp/telemetry_mr")
    parser.add_argument("-o", "--output", help="Filename to use for final job output", required=True)
    #TODO: make the input filter optional, default to "everything valid" and generate dims intelligently.
    parser.add_argument("-f", "--input-filter", help="File containing filter spec", required=True)
    parser.add_argument("-v", "--verbose", help="Print verbose output", action="store_true")
    args = parser.parse_args()

    if not args.local_only:
        if not BOTO_AVAILABLE:
            print "ERROR: The 'boto' library is required except in 'local-only' mode."
            print "       You can install it using `sudo pip install boto`"
            parser.print_help()
            sys.exit(-2)
        # If we want to process remote data, some more arguments are required.
        for remote_req in ["bucket"]:
            if not hasattr(args, remote_req) or getattr(args, remote_req) is None:
                print "ERROR:", remote_req, "is a required option"
                parser.print_help()
                sys.exit(-1)

    job = Job(args)
    start = datetime.now()
    job.mapreduce()
    duration = timer.delta_sec(start)
    print "All done in %.2fs" % (duration)
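
The -f/--input-filter option points at a JSON filter spec. Its schema is not shown in these examples; a hypothetical spec restricting a job to one channel and submission date (all field names are assumptions) could be written like this:

import json

example_filter = {
    "version": 1,
    "dimensions": [
        {"field_name": "reason", "allowed_values": ["saved-session"]},
        {"field_name": "appName", "allowed_values": ["Firefox"]},
        {"field_name": "appUpdateChannel", "allowed_values": ["nightly"]},
        {"field_name": "appVersion", "allowed_values": "*"},
        {"field_name": "appBuildID", "allowed_values": "*"},
        {"field_name": "submission_date", "allowed_values": ["20140101"]},
    ],
}

with open("filter.json", "w") as f:
    json.dump(example_filter, f)
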
Example #7
 def get_filtered_files_s3(self):
     out_files = []
     if not self._local_only:
         print "Fetching file list from S3..."
         # Plain boto should be fast enough to list bucket contents.
         if self._aws_key is not None:
             conn = S3Connection(self._aws_key, self._aws_secret_key)
         else:
             conn = S3Connection()
         bucket = conn.get_bucket(self._bucket_name)
         start = datetime.now()
         count = 0
         # Filter input files by partition. If the filter is reasonably
         # selective, this can be much faster than listing all files in the
         # bucket.
         for f in s3util.list_partitions(bucket,
                                         schema=self._input_filter,
                                         include_keys=True):
             count += 1
             out_files.append(f)
             if count == 1 or count % 1000 == 0:
                 print "Listed", count, "so far"
         conn.close()
         duration = timer.delta_sec(start)
         print "Listed", len(out_files), "files in", duration, "seconds"
     return out_files
Example #8
class ExportCompressedStep(PipeStep):
    def __init__(self, num, name, q_in, log_file, stats_file,
            base_dir, config, dry_run):
        self.dry_run = dry_run
        self.base_dir = base_dir
        self.aws_key = config.get("aws_key", None)
        self.aws_secret_key = config.get("aws_secret_key", None)
        self.aws_bucket_name = config["publish_bucket"]
        PipeStep.__init__(self, num, name, q_in, log_file=log_file,
                stats_file=stats_file)

    def setup(self):
        if self.dry_run:
            self.conn = None
            self.bucket = None
            return
        self.conn = S3Connection(self.aws_key, self.aws_secret_key)
        self.bucket = self.conn.get_bucket(self.aws_bucket_name)

    def strip_data_dir(self, data_dir, full_file):
        if full_file.startswith(data_dir):
            chopped = full_file[len(data_dir):]
            if chopped[0] == "/":
                chopped = chopped[1:]
            return chopped
        else:
            raise ValueError("Invalid full filename: " + str(full_file))

    def handle(self, record):
        try:
            # Remove the output dir prefix from filenames
            stripped_name = self.strip_data_dir(self.base_dir, record)
        except ValueError, e:
            self.log("Warning: couldn't strip base dir from '{0}' " \
                     "{1}".format(record, e))
            stripped_name = record

        self.log("Uploading {0}".format(stripped_name))
        start = now()
        if self.dry_run:
            local_filename = record
            remote_filename = stripped_name
            err = None
        else:
            local_filename, remote_filename, err = s3util.upload_one([
                    self.base_dir, self.bucket, stripped_name])
        sec = timer.delta_sec(start, now())
        current_size = os.path.getsize(record)
        self.stats.increment(records_read=1, bytes_read=current_size)
        if err is None:
            # Everything went well.
            self.stats.increment(records_written=1, bytes_written=current_size)
            # Delete local files once they've been uploaded successfully.
            if not self.dry_run:
                try:
                    os.remove(record)
                    self.log("Removed uploaded file {0}".format(record))
                except Exception, e:
                    self.log("Failed to remove uploaded file {0}: " \
                             "{1}".format(record, e))
Example #9
    def fetch_remotes(self, remotes):
        # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
        remote_names = [r["name"] for r in remotes if r["type"] == "remote"]

        # TODO: check cache first.
        result = 0
        if len(remote_names) == 0:
            return result

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        loader = s3util.Loader(fetch_cwd,
                               self._bucket_name,
                               aws_key=self._aws_key,
                               aws_secret_key=self._aws_secret_key)
        start = datetime.now()
        downloaded_bytes = 0
        for local, remote, err in loader.get_list(remote_names):
            if err is None:
                print "Downloaded", remote
                downloaded_bytes += os.path.getsize(local)
            else:
                print "Failed to download", remote
                result += 1
        duration_sec = timer.delta_sec(start)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
            downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result
Example #10
    def export(self, uploadables):
        if len(uploadables) == 0:
            print "Nothing to do!"
            return 0
        print "Found", len(uploadables), "files"

        fail_count = 0
        start = datetime.now()
        total_size = 0
        for local, remote, err in self.s3loader.put_list(uploadables):
            if err is None:
                # Great Success! Delete it locally.
                total_size += os.path.getsize(local)
                if self.keep_backups:
                    # Keep a copy of the original, just in case.
                    os.rename(local, local + ".uploaded")
                else:
                    os.remove(local)
                # Send a message to SQS
                # TODO: verify that it succeeded.
                self.enqueue_incoming(remote)

            else:
                fail_count += 1
                print "Failed to upload '{0}' to bucket {1} as '{2}':".format(local, self.bucket, remote), err
        sec = timer.delta_sec(start)
        total_mb = float(total_size) / 1024.0 / 1024.0
        print "Transferred %.2fMB in %.2fs (%.2fMB/s)" % (total_mb, sec, total_mb / sec)
        # TODO: log the transfer stats properly.

        # Return zero for overall success or the number of failures.
        return fail_count
def run_mr(filter, output_file, local_only):

  args = {
    "job_script" : "../uitour.py",
    "input_filter": filter,
    "num_mappers" : 16,
    "num_reducers" : 4,
    "data_dir" : "../work/cache",
    "work_dir" : "../work",
    "output" : output_file,
    "bucket" : "telemetry-published-v2",
    "local_only" : local_only
  }

  if not args["local_only"]:
      if not BOTO_AVAILABLE:
          print "ERROR: The 'boto' library is required except in 'local-only' mode."
          print "       You can install it using `sudo pip install boto`"
          parser.print_help()
          return -2

  job = Job(args)
  start = datetime.now()
  exit_code = 0
  try:
      job.mapreduce()
  except:
      traceback.print_exc(file=sys.stderr)
      exit_code = 2
  duration = timer.delta_sec(start)
  print "All done in %.2fs" % (duration)
  return (exit_code, output_file)
    def export_batch(self, data_dir, conn, bucket, files):
        print self.label, "Uploading", ",".join(files)
        if self.dry_run:
            return 0

        # Time the s3funnel call:
        start = datetime.now()
        result = subprocess.call(self.s3f_cmd + files, cwd=data_dir)
        sec = timer.delta_sec(start)

        total_size = 0
        if result == 0:
            # Success! Verify each file's checksum, then truncate it.
            for f in files:
                # Verify checksum and track cumulative size so we can figure out MB/s
                full_filename = os.path.join(data_dir, f)
                md5, size = fileutil.md5file(full_filename)
                total_size += size
                # f is the key name - it does not include the full path to the
                # data dir.
                key = bucket.get_key(f)
                # Strip quotes from md5
                remote_md5 = key.etag[1:-1]
                if md5 != remote_md5:
                    # TODO: add it to a "failed" queue.
                    print "ERROR: %s failed checksum verification: Local=%s, Remote=%s" % (f, md5, remote_md5)
                    self.bad_records += 1
                    result = -1
                # TODO: else add it to a "succeeded" queue and remove it locally.
        else:
            print "Failed to upload one or more files in the current batch. Error code was", result

        total_mb = float(total_size) / 1024.0 / 1024.0
        print "Transferred %.2fMB in %.2fs (%.2fMB/s)" % (total_mb, sec, total_mb / sec)
        return result
Example #13
    def run_mapper(self, mapper_id, inputs, work_dir, module, partition_count, delete_files, aws_key, aws_secret_key, s3_bucket):
        self.work_dir = work_dir

        print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete."

        bytes_total = sum([f.size for f in inputs])
        bytes_completed = 0
        next_notice_pct = 10

        start = datetime.now()

        loader = None
        output_file = os.path.join(work_dir, "mapper_" + str(mapper_id))
        mapfunc = getattr(module, 'map', None)
        context = Context(output_file, partition_count)
        if not callable(mapfunc):
            print "No map function!!!"
            sys.exit(1)

        # TODO: Stream/decompress the files directly.
        for input_file in inputs:
            if input_file.remote:
                # TODO: check if the file already exists locally.
                # Lazy load the loader (so we don't do it on "local only" jobs).
                if loader is None:
                    loader = s3util.Loader(os.path.join(self.work_dir, "cache"), s3_bucket, aws_key=aws_key, aws_secret_key=aws_secret_key, poolsize=1)
                for local, remote, err in loader.get_list([input_file.name]):
                    if err is not None:
                        print "Failed to download", remote, ":", err

            try:
                handle = self.open_input_file(input_file)
            except:
                print "Error opening", input_file.name, "(skipping)"
                traceback.print_exc(file=sys.stderr)
                continue
            line_num = 0
            for line in handle:
                line_num += 1
                try:
                    # Remove the trailing EOL character(s) before passing to
                    # the map function.
                    key, value = line.rstrip('\r\n').split("\t", 1)
                    mapfunc(key, input_file.dimensions, value, context)
                except ValueError, e:
                    # TODO: increment "bad line" metrics.
                    print "Bad line:", input_file.name, ":", line_num, e
            handle.close()
            if delete_files:
                print "Removing", input_file.name
                os.remove(handle.filename)
            bytes_completed += input_file.size
            completed_pct = (float(bytes_completed) / bytes_total) * 100
            if completed_pct >= next_notice_pct:
                next_notice_pct += 10
                duration_sec = timer.delta_sec(start)
                completed_mb = float(bytes_completed) / 1024.0 / 1024.0
                print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % (mapper_id, completed_pct, completed_mb, duration_sec, completed_mb / duration_sec)
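
run_mapper looks up a module-level "map" function in the job script and calls it as mapfunc(key, dimensions, value, context). A minimal job script satisfying that contract (context.write and the dimension layout are assumptions, since the Context class is not shown):

def map(key, dims, value, context):
    # Count records per update channel; assumes the channel is the fourth
    # dimension, which depends on the schema used to partition the data.
    channel = dims[3] if len(dims) > 3 else "UNKNOWN"
    context.write(channel, 1)
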
 def dump_stats(self):
     duration = timer.delta_sec(self.start_time, self.end_time)
     read_rate = self.records_read / duration
     mb_read = self.bytes_read / 1024.0 / 1024.0
     mb_read_rate = mb_read / duration
     write_rate = self.records_written / duration
     mb_written = self.bytes_written / 1024.0 / 1024.0
     mb_write_rate = mb_written / duration
     print "%s: Read %d records or %.2fMB (%.2fr/s, %.2fMB/s), wrote %d or %.2f MB (%.2fr/s, %.2fMB/s). Found %d bad records" % (self.label, self.records_read, mb_read, read_rate, mb_read_rate, self.records_written, mb_written, write_rate, mb_write_rate, self.bad_records)
Example #16
    def handle(self, record):
        filename = record
        base_ends = filename.find(".log") + 4
        if base_ends < 4:
            self.log("Bad filename encountered, skipping: " + filename)
            self.stats.increment(records_read=1,
                                 bad_records=1,
                                 bad_record_type="bad_filename")
            return
        basename = filename[0:base_ends]
        # Get a unique name for the compressed file:
        comp_name = basename + "." + uuid.uuid4().hex + StorageLayout.COMPRESSED_SUFFIX

        # reserve it!
        f_comp = open(comp_name, "wb")
        # TODO: open f_comp with same buffer size as below?

        # Rename uncompressed file to a temp name
        tmp_name = comp_name + ".compressing"
        os.rename(filename, tmp_name)

        # Read input file as text (line-buffered)
        f_raw = open(tmp_name, "r", 1)
        start = now()

        # Now set up our processing pipe:
        # - read from f_raw, compress, write to comp_name
        p_compress = Popen(self.compress_cmd,
                           bufsize=65536,
                           stdin=f_raw,
                           stdout=f_comp,
                           stderr=sys.stderr)

        # Note: it looks like p_compress.wait() is what we want, but the docs
        #       warn of a deadlock, so we use communicate() instead.
        p_compress.communicate()

        raw_bytes = f_raw.tell()
        comp_bytes = f_comp.tell()
        raw_mb = float(raw_bytes) / 1024.0 / 1024.0
        comp_mb = float(comp_bytes) / 1024.0 / 1024.0
        f_raw.close()
        f_comp.close()

        self.stats.increment(records_read=1,
                             records_written=1,
                             bytes_read=raw_bytes,
                             bytes_written=comp_bytes)

        # Remove raw file
        os.remove(tmp_name)
        sec = timer.delta_sec(start, now())
        self.log("Compressed %s as %s in %.2fs. Size before: %.2fMB, after:" \
                 " %.2fMB (r: %.2fMB/s, w: %.2fMB/s)" % (filename, comp_name,
                    sec, raw_mb, comp_mb, (raw_mb/sec), (comp_mb/sec)))
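
handle() streams f_raw through self.compress_cmd into f_comp, so any compressor that reads stdin and writes stdout fits. A hypothetical configuration (the binary path and flags are assumptions, not taken from this excerpt):

COMPRESSION_PATH = "/usr/bin/xz"            # hypothetical location
compress_cmd = [COMPRESSION_PATH, "-z", "-c", "-1"]
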
Example #17
    def import_files(self, input_directory):
        begin = datetime.now()
        processes = []

        self._enqueue_process(partial(self._master, input_directory), processes)
        for worker in range(0, self._n_workers):
            self._enqueue_process(partial(self._worker), processes)

        for p in processes:
            p.join()

        print("Files imported in", timer.delta_sec(begin), "seconds.")
Example #18
    def import_files(self, input_directory):
        begin = datetime.now()
        processes = []

        self._enqueue_process(partial(self._master, input_directory),
                              processes)
        for worker in range(0, self._n_workers):
            self._enqueue_process(partial(self._worker), processes)

        for p in processes:
            p.join()

        print("Files imported in", timer.delta_sec(begin), "seconds.")
def fetch_s3_files(incoming_files, fetch_cwd, bucket, aws_key, aws_secret_key):
    result = 0
    if len(incoming_files) > 0:
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)

        files = []
        for f in incoming_files:
            full_filename = os.path.join(fetch_cwd, f)
            if os.path.isfile(full_filename):
                md5, size = fileutil.md5file(full_filename)
                # f is the key name - it does not include the full path to the
                # data dir.
                key = bucket.get_key(f)
                # Strip quotes from md5
                remote_md5 = key.etag[1:-1]
                if md5 != remote_md5:
                    files.append(f)
                else:
                    print "Already downloaded", f
            else:
                files.append(f)
        fetch_cmd = [S3FUNNEL_PATH]
        fetch_cmd.append(bucket.name)
        fetch_cmd.append("get")
        fetch_cmd.append("-a")
        fetch_cmd.append(aws_key)
        fetch_cmd.append("-s")
        fetch_cmd.append(aws_secret_key)
        fetch_cmd.append("-t")
        fetch_cmd.append("8")
        # Fetch in batches of 8 files at a time
        while len(files) > 0:
            current_files = files[0:8]
            files = files[8:]
            start = datetime.now()
            result = subprocess.call(fetch_cmd + current_files, cwd=fetch_cwd)
            duration_sec = timer.delta_sec(start)
            # TODO: verify MD5s
            downloaded_bytes = sum([
                os.path.getsize(os.path.join(fetch_cwd, f))
                for f in current_files
            ])
            downloaded_mb = downloaded_bytes / 1024.0 / 1024.0
            print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
                downloaded_mb, duration_sec, downloaded_mb / duration_sec)
            if result != 0:
                break
    return result
    def handle(self, record):
        filename = record
        base_ends = filename.find(".log") + 4
        if base_ends < 4:
            self.log("Bad filename encountered, skipping: " + filename)
            self.stats.increment(records_read=1, bad_records=1,
                    bad_record_type="bad_filename")
            return
        basename = filename[0:base_ends]
        # Get a unique name for the compressed file:
        comp_name = basename + "." + uuid.uuid4().hex + StorageLayout.COMPRESSED_SUFFIX

        # reserve it!
        f_comp = open(comp_name, "wb")
        # TODO: open f_comp with same buffer size as below?

        # Rename uncompressed file to a temp name
        tmp_name = comp_name + ".compressing"
        os.rename(filename, tmp_name)

        # Read input file as text (line-buffered)
        f_raw = open(tmp_name, "r", 1)
        start = now()

        # Now set up our processing pipe:
        # - read from f_raw, compress, write to comp_name
        p_compress = Popen(self.compress_cmd, bufsize=65536, stdin=f_raw,
                stdout=f_comp, stderr=sys.stderr)

        # Note: it looks like p_compress.wait() is what we want, but the docs
        #       warn of a deadlock, so we use communicate() instead.
        p_compress.communicate()

        raw_bytes = f_raw.tell()
        comp_bytes = f_comp.tell()
        raw_mb = float(raw_bytes) / 1024.0 / 1024.0
        comp_mb = float(comp_bytes) / 1024.0 / 1024.0
        f_raw.close()
        f_comp.close()

        self.stats.increment(records_read=1, records_written=1,
                bytes_read=raw_bytes, bytes_written=comp_bytes)

        # Remove raw file
        os.remove(tmp_name)
        sec = timer.delta_sec(start, now())
        self.log("Compressed %s as %s in %.2fs. Size before: %.2fMB, after:" \
                 " %.2fMB (r: %.2fMB/s, w: %.2fMB/s)" % (filename, comp_name,
                    sec, raw_mb, comp_mb, (raw_mb/sec), (comp_mb/sec)))
Example #21
 def get_summary(self):
     duration = timer.delta_sec(self.start_time, self.end_time)
     read_rate = self.overall["records_read"] / duration
     mb_read = self.overall["bytes_read"] / 1024.0 / 1024.0
     mb_read_rate = mb_read / duration
     write_rate = self.overall["records_written"] / duration
     mb_written = self.overall["bytes_written"] / 1024.0 / 1024.0
     mb_write_rate = mb_written / duration
     summary = "Read %d records or %.2fMB (%.2fr/s, %.2fMB/s), " \
               "wrote %d or %.2f MB (%.2fr/s, %.2fMB/s). " \
               "Found %d bad records" % (self.overall["records_read"],
                 mb_read, read_rate, mb_read_rate,
                 self.overall["records_written"], mb_written, write_rate,
                 mb_write_rate, self.overall["bad_records"])
     return summary
    def save_map(self, channel_name, chan_stats):
        if self.stats_file is None:
            return

        chan_stats["task"] = self.task
        chan_stats["channel"] = channel_name
        chan_stats["start_time"] = datetime_to_json(self.start_time)
        chan_stats["end_time"] = datetime_to_json(self.end_time)
        chan_stats["duration"] = timer.delta_sec(self.start_time, self.end_time)
        try:
            with io.open(self.stats_file, "a") as fout:
                fout.write(unicode(json.dumps(chan_stats) + u"\n"))
        except:
            self.logger.log("Error writing '{}' stats".format(channel_name))
            self.logger.log(traceback.format_exc())
 def get_summary(self):
     duration = timer.delta_sec(self.start_time, self.end_time)
     read_rate = self.overall["records_read"] / duration
     mb_read = self.overall["bytes_read"] / 1024.0 / 1024.0
     mb_read_rate = mb_read / duration
     write_rate = self.overall["records_written"] / duration
     mb_written = self.overall["bytes_written"] / 1024.0 / 1024.0
     mb_write_rate = mb_written / duration
     summary = "Read %d records or %.2fMB (%.2fr/s, %.2fMB/s), " \
               "wrote %d or %.2f MB (%.2fr/s, %.2fMB/s). " \
               "Found %d bad records" % (self.overall["records_read"],
                 mb_read, read_rate, mb_read_rate,
                 self.overall["records_written"], mb_written, write_rate,
                 mb_write_rate, self.overall["bad_records"])
     return summary
Example #24
    def save_map(self, channel_name, chan_stats):
        if self.stats_file is None:
            return

        chan_stats["task"] = self.task
        chan_stats["channel"] = channel_name
        chan_stats["start_time"] = datetime_to_json(self.start_time)
        chan_stats["end_time"] = datetime_to_json(self.end_time)
        chan_stats["duration"] = timer.delta_sec(self.start_time, self.end_time)
        try:
            with io.open(self.stats_file, "a") as fout:
                fout.write(unicode(json.dumps(chan_stats) + u"\n"))
        except:
            self.logger.log("Error writing '{}' stats".format(channel_name))
            self.logger.log(traceback.format_exc())
Example #25
    def run_mapper(self, mapper_id, inputs, work_dir, module, partition_count, delete_files, aws_key, aws_secret_key, s3_bucket):
        self.work_dir = work_dir

        print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete."

        bytes_total = sum([f.size for f in inputs])
        bytes_completed = 0
        next_notice_pct = 5
        start = datetime.now()

        loader = None
        output_file = os.path.join(work_dir, "mapper_" + str(mapper_id))
        mapfunc = getattr(module, 'map', None)
        context = Context(output_file, partition_count)
        if not callable(mapfunc):
            print "No map function!!!"
            sys.exit(1)

        for input_file in inputs:
            if input_file.remote:
                # Lazy load the loader (so we don't do it on "local only" jobs).
                if loader is None:
                    loader = s3util.Loader(os.path.join(self.work_dir, "cache"), s3_bucket, aws_key=aws_key, aws_secret_key=aws_secret_key, poolsize=1)

                for local, remote, err in loader.get_list([input_file.name]):
                    if err is not None:
                        print "Failed to download", remote, ":", err
            line_num = 0
            full_filename = os.path.join(self.work_dir, "cache", input_file.name)

            for r, _ in heka_message.unpack_file(full_filename):
                msg = heka_message_parser.parse_heka_record(r)
                line_num += 1
                try:
                    mapfunc(msg["meta"]["documentId"], msg, context)
                except ValueError, e:
                    # TODO: increment "bad line" metrics.
                    print "Bad record:", input_file.name, ":", line_num, e
            if delete_files:
                os.remove(full_filename)

            bytes_completed += input_file.size
            completed_pct = (float(bytes_completed) / bytes_total) * 100
            if completed_pct >= next_notice_pct:
                next_notice_pct += 5
                duration_sec = timer.delta_sec(start)
                completed_mb = float(bytes_completed) / 1024.0 / 1024.0
                print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % (mapper_id, completed_pct, completed_mb, duration_sec, completed_mb / duration_sec)
Example #26
    def handle(self, record):
        filename = record
        base_ends = filename.find(".log") + 4
        if base_ends < 4:
            self.log("Bad filename encountered, skipping: " + filename)
            self.stats.increment(records_read=1,
                                 bad_records=1,
                                 bad_record_type="bad_filename")
            return
        basename = filename[0:base_ends]
        # Get a unique name for the compressed file:
        comp_name = basename + "." + uuid.uuid4().hex + StorageLayout.COMPRESSED_SUFFIX
        comp_file = CompressedFile(comp_name,
                                   mode="w",
                                   open_now=True,
                                   compression_level=1)

        # Rename uncompressed file to a temp name
        tmp_name = comp_name + ".compressing"
        os.rename(filename, tmp_name)

        start = now()
        try:
            comp_file.compress_from(tmp_name, remove_original=False)
            comp_file.close()
        except Exception as e:
            self.stats.increment(records_read=1,
                                 bad_records=1,
                                 bad_record_type="compression_error")
            self.log("Error compressing file {0}: {1}".format(filename, e))
            return
        raw_bytes = os.stat(tmp_name).st_size
        comp_bytes = os.stat(comp_name).st_size
        raw_mb = float(raw_bytes) / 1024.0 / 1024.0
        comp_mb = float(comp_bytes) / 1024.0 / 1024.0

        self.stats.increment(records_read=1,
                             records_written=1,
                             bytes_read=raw_bytes,
                             bytes_written=comp_bytes)

        # Remove raw file
        os.remove(tmp_name)
        sec = timer.delta_sec(start, now())
        self.log("Compressed %s as %s in %.2fs. Size before: %.2fMB, after:" \
                 " %.2fMB (r: %.2fMB/s, w: %.2fMB/s)" % (filename, comp_name,
                    sec, raw_mb, comp_mb, (raw_mb/sec), (comp_mb/sec)))
def fetch_s3_files(incoming_files, fetch_cwd, bucket, aws_key, aws_secret_key):
    result = 0
    if len(incoming_files) > 0:
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)

        files = []
        for f in incoming_files:
            full_filename = os.path.join(fetch_cwd, f)
            if os.path.isfile(full_filename):
                md5, size = fileutil.md5file(full_filename)
                # f is the key name - it does not include the full path to the
                # data dir.
                key = bucket.get_key(f)
                # Strip quotes from md5
                remote_md5 = key.etag[1:-1]
                if md5 != remote_md5:
                    files.append(f)
                else:
                    print "Already downloaded", f
            else:
                files.append(f)
        fetch_cmd = [S3FUNNEL_PATH]
        fetch_cmd.append(bucket.name)
        fetch_cmd.append("get")
        fetch_cmd.append("-a")
        fetch_cmd.append(aws_key)
        fetch_cmd.append("-s")
        fetch_cmd.append(aws_secret_key)
        fetch_cmd.append("-t")
        fetch_cmd.append("8")
        # Fetch in batches of 8 files at a time
        while len(files) > 0:
            current_files = files[0:8]
            files = files[8:]
            start = datetime.now()
            result = subprocess.call(fetch_cmd + current_files, cwd=fetch_cwd)
            duration_sec = timer.delta_sec(start)
            # TODO: verify MD5s
            downloaded_bytes = sum([ os.path.getsize(os.path.join(fetch_cwd, f)) for f in current_files ])
            downloaded_mb = downloaded_bytes / 1024.0 / 1024.0
            print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
            if result != 0:
                break
    return result
 def work(self):
     print self.label, "Starting up"
     while True:
         try:
             raw = self.q_in.get()
             if raw == PipeStep.SENTINEL:
                 break
             self.handle(raw)
             self.records_read += 1
             if self.print_stats:
                 this_update = datetime.now()
                 if timer.delta_sec(self.last_update, this_update) > 10.0:
                     self.last_update = this_update
                     self.dump_stats()
             self.end_time = datetime.now()
         except Q.Empty:
             break
     print self.label, "Received stop message... all done"
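
The loop exits when it reads PipeStep.SENTINEL from the queue. The producer side is not shown; the usual pattern (a sketch, with num_workers standing in for however many worker processes were started) is to enqueue one sentinel per worker:

# After all real work items are queued, push one sentinel per worker so that
# every work() loop above terminates.
for _ in range(num_workers):
    q_in.put(PipeStep.SENTINEL)
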
Example #30
def main():
    parser = argparse.ArgumentParser(description='Run a MapReduce Job.', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("job_script", help="The MapReduce script to run")
    parser.add_argument("-l", "--local-only", help="Only process local files (exclude S3 data)", action="store_true")
    parser.add_argument("-m", "--num-mappers", metavar="N", help="Start N mapper processes", type=int, default=4)
    parser.add_argument("-r", "--num-reducers", metavar="N", help="Start N reducer processes", type=int, default=1)
    parser.add_argument("-d", "--data-dir", help="Base data directory", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key", default=None)
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", default=None)
    parser.add_argument("-w", "--work-dir", help="Location to put temporary work files", default="/tmp/telemetry_mr")
    parser.add_argument("-o", "--output", help="Filename to use for final job output", required=True)
    #TODO: make the input filter optional, default to "everything valid" and generate dims intelligently.
    parser.add_argument("-f", "--input-filter", help="File containing filter spec", required=True)
    parser.add_argument("-v", "--verbose", help="Print verbose output", action="store_true")
    parser.add_argument("-X", "--delete-data", help="Delete raw data files after mapping", action="store_true")
    parser.add_argument("-p", "--profile", help="Profile mappers and reducers using cProfile", action="store_true")
    args = parser.parse_args()

    if not args.local_only:
        if not BOTO_AVAILABLE:
            print "ERROR: The 'boto' library is required except in 'local-only' mode."
            print "       You can install it using `sudo pip install boto`"
            parser.print_help()
            return -2
        # If we want to process remote data, some more arguments are required.
        for remote_req in ["bucket"]:
            if not hasattr(args, remote_req) or getattr(args, remote_req) is None:
                print "ERROR:", remote_req, "is a required option"
                parser.print_help()
                return -1

    args = args.__dict__
    job = Job(args)
    start = datetime.now()
    exit_code = 0
    try:
        job.mapreduce()
    except:
        traceback.print_exc(file=sys.stderr)
        exit_code = 2
    duration = timer.delta_sec(start)
    print "All done in %.2fs" % (duration)
    return exit_code
Example #31
 def work(self):
     self.log("Starting up")
     while True:
         try:
             raw = self.q_in.get()
             if raw == PipeStep.SENTINEL:
                 break
             self.stats.reset()
             self.handle(raw)
             self.stats.update_end_time()
             self.stats.save()
             if self.print_stats:
                 this_update = now()
                 if timer.delta_sec(self.last_update, this_update) > 10.0:
                     self.last_update = this_update
                     self.log(self.stats.get_summary())
         except Q.Empty:
             break
     self.log("Received stop message... work done")
    def handle(self, record):
        filename = record
        base_ends = filename.find(".log") + 4
        if base_ends < 4:
            self.log("Bad filename encountered, skipping: " + filename)
            self.stats.increment(records_read=1, bad_records=1,
                    bad_record_type="bad_filename")
            return
        basename = filename[0:base_ends]
        # Get a unique name for the compressed file:
        comp_name = basename + "." + uuid.uuid4().hex + StorageLayout.COMPRESSED_SUFFIX
        comp_file = CompressedFile(comp_name, mode="w", open_now=True, compression_level=1)

        # Rename uncompressed file to a temp name
        tmp_name = comp_name + ".compressing"
        os.rename(filename, tmp_name)

        start = now()
        try:
            comp_file.compress_from(tmp_name, remove_original=False)
            comp_file.close()
        except Exception as e:
            self.stats.increment(records_read=1, bad_records=1,
                    bad_record_type="compression_error")
            self.log("Error compressing file {0}: {1}".format(filename, e))
            return
        raw_bytes = os.stat(tmp_name).st_size
        comp_bytes = os.stat(comp_name).st_size
        raw_mb = float(raw_bytes) / 1024.0 / 1024.0
        comp_mb = float(comp_bytes) / 1024.0 / 1024.0

        self.stats.increment(records_read=1, records_written=1,
                bytes_read=raw_bytes, bytes_written=comp_bytes)

        # Remove raw file
        os.remove(tmp_name)
        sec = timer.delta_sec(start, now())
        self.log("Compressed %s as %s in %.2fs. Size before: %.2fMB, after:" \
                 " %.2fMB (r: %.2fMB/s, w: %.2fMB/s)" % (filename, comp_name,
                    sec, raw_mb, comp_mb, (raw_mb/sec), (comp_mb/sec)))
Example #34
 def get_filtered_files_s3(self):
     if not self._local_only:
         print "Fetching file list from S3..."
         # Plain boto should be fast enough to list bucket contents.
         if self._aws_key is not None:
             conn = S3Connection(self._aws_key, self._aws_secret_key)
         else:
             conn = S3Connection()
         bucket = conn.get_bucket(self._bucket_name)
         start = datetime.now()
         count = 0
         # Filter input files by partition. If the filter is reasonably
         # selective, this can be much faster than listing all files in the
         # bucket.
         for f in s3util.list_partitions(bucket, schema=self._input_filter, include_keys=True):
             count += 1
             if count == 1 or count % 1000 == 0:
                 print "Listed", count, "so far"
             yield f
         conn.close()
         duration = timer.delta_sec(start)
         print "Listed", count, "files in", duration, "seconds"
def fetch_s3_files(files, fetch_cwd, bucket_name, aws_key, aws_secret_key):
    result = 0
    if len(files) > 0:
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        fetch_cmd = ["/usr/local/bin/s3funnel"]
        fetch_cmd.append(bucket_name)
        fetch_cmd.append("get")
        fetch_cmd.append("-a")
        fetch_cmd.append(aws_key)
        fetch_cmd.append("-s")
        fetch_cmd.append(aws_secret_key)
        fetch_cmd.append("-t")
        fetch_cmd.append("8")
        start = datetime.now()
        result = subprocess.call(fetch_cmd + files, cwd=fetch_cwd)
        duration_sec = timer.delta_sec(start)
        # TODO: verify MD5s
        downloaded_bytes = sum([ os.path.getsize(os.path.join(fetch_cwd, f)) for f in files ])
        downloaded_mb = downloaded_bytes / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
    return result
def fetch_s3_files(files, fetch_cwd, bucket_name, aws_key, aws_secret_key):
    result = 0
    if len(files) > 0:
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        fetch_cmd = ["/usr/local/bin/s3funnel"]
        fetch_cmd.append(bucket_name)
        fetch_cmd.append("get")
        fetch_cmd.append("-a")
        fetch_cmd.append(aws_key)
        fetch_cmd.append("-s")
        fetch_cmd.append(aws_secret_key)
        fetch_cmd.append("-t")
        fetch_cmd.append("8")
        start = datetime.now()
        result = subprocess.call(fetch_cmd + files, cwd=fetch_cwd)
        duration_sec = timer.delta_sec(start)
        # TODO: verify MD5s
        downloaded_bytes = sum(
            [os.path.getsize(os.path.join(fetch_cwd, f)) for f in files])
        downloaded_mb = downloaded_bytes / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
            downloaded_mb, duration_sec, downloaded_mb / duration_sec)
    return result
Example #37
def list_files(bucket_name, output_file, output_func=s3obj_to_string, prefix=''):
    s3 = S3Connection()
    bucket = s3.get_bucket(bucket_name)
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    while not done:
        try:
            for k in bucket.list(prefix=prefix, marker=last_key):
                last_key = k.name
                total_count += 1
                if total_count % 5000 == 0:
                    print "Looked at", total_count, "total records in", timer.delta_sec(start_time), "seconds. Last key was", last_key
                try:
                    output_file.write(str(output_func(k)) + "\n")
                except Exception, e:
                    print "Error writing key", k.name, ":", e
                    traceback.print_exc()
            done = True
        except socket.error, e:
            print "Error listing keys:", e
            traceback.print_exc()
            print "Continuing from last seen key:", last_key
Example #38
class ReadRawStep(PipeStep):
    UUID_ONLY_PATH = re.compile(
        '^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$')

    def __init__(self, num, name, raw_files, completed_files, log_file,
                 stats_file, schema, converter, storage, bad_filename):
        self.schema = schema
        self.converter = converter
        self.storage = storage
        self.bad_filename = bad_filename
        PipeStep.__init__(self, num, name, raw_files, completed_files,
                          log_file, stats_file)

    def setup(self):
        self.expected_dim_count = len(self.schema._dimensions)

    def handle(self, raw_file):
        self.log("Reading " + raw_file)
        try:
            record_count = 0
            bytes_read = 0
            start = now()
            file_version = fileutil.detect_file_version(raw_file,
                                                        simple_detection=True)
            self.log("Detected version {0} for file {1}".format(
                file_version, raw_file))
            for unpacked in fileutil.unpack(raw_file,
                                            file_version=file_version):
                record_count += 1
                common_bytes = unpacked.len_path + fileutil.RECORD_PREAMBLE_LENGTH[
                    file_version]
                current_bytes = common_bytes + unpacked.len_data
                current_bytes_uncompressed = common_bytes + len(unpacked.data)
                bytes_read += current_bytes
                if unpacked.error:
                    self.log("ERROR: Found corrupted data for record {0} in " \
                             "{1} path: {2} Error: {3}".format(record_count,
                                 raw_file, unpacked.path, unpacked.error))
                    self.stats.increment(
                        records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed,
                        bad_records=1,
                        bad_record_type="corrupted_data")
                    continue
                if len(unpacked.data) == 0:
                    self.log("WARN: Found empty data for record {0} in " \
                             "{1} path: {2}".format(record_count, raw_file,
                                 unpacked.path))
                    self.stats.increment(
                        records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed,
                        bad_records=1,
                        bad_record_type="empty_data")
                    continue

                submission_date = ts_to_yyyymmdd(unpacked.timestamp)
                path = unicode(unpacked.path, errors="replace")

                if unpacked.data[0] != "{":
                    # Data looks weird, should be JSON.
                    self.log("Warning: Found unexpected data for record {0}" \
                             " in {1} path: {2} data:\n{3}".format(record_count,
                                 raw_file, path, unpacked.data))
                else:
                    # Raw JSON, make sure we treat it as unicode.
                    unpacked.data = unicode(unpacked.data, errors="replace")

                path_components = path.split("/")
                if len(path_components) != self.expected_dim_count:
                    # We're going to pop the ID off, but we'll also add the
                    # submission date, so it evens out.
                    bad_record_type = "invalid_path"
                    if ReadRawStep.UUID_ONLY_PATH.match(path):
                        bad_record_type = "uuid_only_path"
                    else:
                        self.log("Found an invalid path in record {0}: " \
                             "{1}".format(record_count, path))
                    self.stats.increment(
                        records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed,
                        bad_records=1,
                        bad_record_type=bad_record_type)
                    continue

                key = path_components.pop(0)
                info = {}
                info["reason"] = path_components.pop(0)
                info["appName"] = path_components.pop(0)
                info["appVersion"] = path_components.pop(0)
                info["appUpdateChannel"] = path_components.pop(0)
                info["appBuildID"] = path_components.pop(0)
                dims = self.schema.dimensions_from(info, submission_date)
                channel = self.schema.get_field(dims, "appUpdateChannel", True,
                                                True)

                self.stats.increment(
                    channel=channel,
                    records_read=1,
                    bytes_read=current_bytes,
                    bytes_uncompressed=current_bytes_uncompressed)

                try:
                    # Convert data:
                    if self.converter is None:
                        serialized_data = unpacked.data
                        # TODO: Converter.VERSION_UNCONVERTED
                        data_version = 1
                    else:
                        parsed_data, parsed_dims = self.converter.convert_json(
                            unpacked.data, dims[-1], unpacked.ip)
                        # TODO: take this out if it's too slow
                        for i in range(len(dims)):
                            if dims[i] != parsed_dims[i]:
                                self.log("Record {0} mismatched dimension " \
                                         "{1}: '{2}' != '{3}'".format(
                                            record_count, i, dims[i],
                                            parsed_dims[i]))
                        serialized_data = self.converter.serialize(parsed_data)
                        dims = parsed_dims
                        # TODO: Converter.VERSION_CONVERTED
                        data_version = 2
                    try:
                        # Write to persistent storage
                        n = self.storage.write(key, serialized_data, dims,
                                               data_version)
                        self.stats.increment(channel=channel,
                                             records_written=1,
                                             bytes_written=len(key) +
                                             len(serialized_data) + 2)
                        # Compress rotated files as we generate them
                        if n.endswith(
                                StorageLayout.PENDING_COMPRESSION_SUFFIX):
                            self.q_out.put(n)
                    except Exception, e:
                        self.write_bad_record(key, dims, serialized_data,
                                              str(e),
                                              "ERROR Writing to output file:",
                                              "write_failed")
                except BadPayloadError, e:
                    self.write_bad_record(key, dims, unpacked.data, e.msg,
                                          "Bad Payload:", "bad_payload")
                except Exception, e:
                    err_message = str(e)
                    if err_message == "Missing in payload: info.revision":
                        # We don't need to write these bad records out - we know
                        # why they are being skipped.
                        self.stats.increment(
                            channel=channel,
                            bad_records=1,
                            bad_record_type="missing_revision")
                    elif err_message == "Invalid revision URL: /rev/":
                        # We do want to log these payloads, but we don't want
                        # the full stack trace.
                        self.write_bad_record(key, dims, unpacked.data,
                                              err_message, "Conversion Error",
                                              "missing_revision_repo")
                    # Don't split this long string - we want to be able to find it in the code
                    elif err_message.startswith(
                            "JSONDecodeError: Invalid control character"):
                        self.write_bad_record(key, dims, unpacked.data,
                                              err_message, "Conversion Error",
                                              "invalid_control_char")
                    else:
                        # TODO: Recognize other common failure modes and handle
                        #       them gracefully.
                        self.write_bad_record(key, dims, unpacked.data,
                                              err_message, "Conversion Error",
                                              "conversion_error")
                        self.log(traceback.format_exc())

                if self.print_stats:
                    this_update = now()
                    sec = timer.delta_sec(self.last_update, this_update)
                    if sec > 10.0:
                        self.last_update = this_update
                        self.log(self.stats.get_summary())
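
# Hedged sketch of the timer.delta_sec() helper used for the 10-second stats
# throttle above (and for the timings throughout these examples). It is assumed
# to return elapsed wall-clock time in seconds as a float; the real helper may
# differ in detail.
from datetime import datetime

def delta_sec(start, end=None):
    if end is None:
        end = datetime.now()
    return (end - start).total_seconds()
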
def main():
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket",
                        help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket",
                        help="The S3 bucket to save processed files")
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s",
                        "--aws-secret-key",
                        help="AWS Secret Key",
                        required=True)
    parser.add_argument("-r",
                        "--aws-region",
                        help="AWS Region",
                        default="us-west-2")
    parser.add_argument("-w",
                        "--work-dir",
                        help="Location to cache downloaded files",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base dir to store processed data",
                        required=True)
    parser.add_argument("-i",
                        "--input-files",
                        help="File containing a list of keys to process",
                        type=file)
    parser.add_argument("-b",
                        "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-q",
                        "--queue",
                        help="SQS Queue name to poll for incoming data")
    parser.add_argument("-c",
                        "--histogram-cache-path",
                        help="Path to store a local cache of histograms",
                        default="./histogram_cache")
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Location of the desired telemetry schema",
                        required=True)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-D",
                        "--dry-run",
                        help="Don't modify remote files",
                        action="store_true")
    parser.add_argument("-C",
                        "--skip-conversion",
                        help="Skip validation/conversion of payloads",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(S3FUNNEL_PATH):
        print "ERROR: s3funnel not found at", S3FUNNEL_PATH
        print "You can get it from github: https://github.com/sstoiana/s3funnel"
        return -1

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    if args.skip_conversion:
        converter = None
    else:
        converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    num_cpus = multiprocessing.cpu_count()

    start = datetime.now()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    incoming_queue_messages = []

    if not args.dry_run:
        conn = S3Connection(args.aws_key, args.aws_secret_key)
        incoming_bucket = conn.get_bucket(args.incoming_bucket)

    incoming_filenames = []
    if args.queue is not None:
        print "Fetching file list from queue", args.queue
        if args.dry_run:
            print "Dry run mode... can't read from the queue without messing things up..."
        else:
            q_conn = boto.sqs.connect_to_region(
                args.aws_region,
                aws_access_key_id=args.aws_key,
                aws_secret_access_key=args.aws_secret_key)
            incoming_queue = q_conn.get_queue(args.queue)
            if incoming_queue is None:
                print "Error: could not get queue", args.queue
                return -2
            # Sometimes we don't get all the messages, even if more are
            # available, so keep trying until we have enough (or there aren't
            # any left)
            for i in range(num_cpus):
                messages = incoming_queue.get_messages(num_cpus -
                                                       len(incoming_filenames))
                for m in messages:
                    # TODO: Make sure this file exists in S3 first?
                    possible_filename = m.get_body()
                    key = incoming_bucket.get_key(possible_filename)
                    if key is None:
                        print "Could not find queued filename in bucket", args.incoming_bucket, ":", possible_filename
                        # try to delete it:
                        incoming_queue.delete_message(m)
                    else:
                        incoming_filenames.append(possible_filename)
                        incoming_queue_messages.append(m)
                if len(messages) == 0 or len(incoming_filenames) >= num_cpus:
                    break
    elif args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [l.strip() for l in args.input_files.readlines()]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    if len(incoming_filenames) == 0:
        print "Nothing to do!"
        return 0

    for f in incoming_filenames:
        print "  ", f

    print "Verifying that we can write to", args.publish_bucket
    if args.dry_run:
        print "Dry run mode: don't care!"
    else:
        try:
            publish_bucket = conn.get_bucket(args.publish_bucket)
            print "Looks good!"
        except S3ResponseError:
            print "Bucket", args.publish_bucket, "not found.  Attempting to create it."
            publish_bucket = conn.create_bucket(args.publish_bucket)

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    if args.dry_run:
        print "Dry run mode: skipping download from S3"
    else:
        result = fetch_s3_files(incoming_filenames, args.work_dir,
                                incoming_bucket, args.aws_key,
                                args.aws_secret_key)

    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    after_download = datetime.now()

    local_filenames = [
        os.path.join(args.work_dir, f) for f in incoming_filenames
    ]

    # TODO: try a SimpleQueue
    raw_files = Queue()
    for l in local_filenames:
        raw_files.put(l)

    completed_files = Queue()
    compressed_files = Queue()

    # Begin reading raw input
    raw_readers = start_workers(
        num_cpus, "Reader", ReadRawStep, raw_files,
        (completed_files, schema, converter, storage, args.bad_data_log))

    # Tell readers when to stop:
    for i in range(num_cpus):
        raw_files.put(PipeStep.SENTINEL)

    # Compress completed files.
    compressors = start_workers(num_cpus, "Compressor", CompressCompletedStep,
                                completed_files, (compressed_files, ))

    # Export compressed files to S3.
    exporters = start_workers(
        num_cpus, "Exporter", ExportCompressedStep, compressed_files,
        (args.output_dir, args.aws_key, args.aws_secret_key,
         args.publish_bucket, args.dry_run))

    wait_for(raw_readers, "Raw Readers")

    # `find <out_dir> -type f -not -name ".compressme"`
    # Add them to completed_files
    for root, dirs, files in os.walk(args.output_dir):
        for f in files:
            if f.endswith(".log"):
                completed_files.put(os.path.join(root, f))

    for i in range(num_cpus):
        completed_files.put(PipeStep.SENTINEL)

    wait_for(compressors, "Compressors")
    for i in range(num_cpus):
        compressed_files.put(PipeStep.SENTINEL)

    wait_for(exporters, "Exporters")

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        if args.dry_run:
            print "  Dry run, so not really deleting", f
        else:
            print "  Deleting", f
            incoming_bucket.delete_key(f)
            # Delete file locally too.
            os.remove(os.path.join(args.work_dir, f))
    print "Done"

    if len(incoming_queue_messages) > 0:
        print "Removing processed messages from SQS..."
        for m in incoming_queue_messages:
            if args.dry_run:
                print "  Dry run, so not really deleting", m.get_body()
            else:
                print "  Deleting", m.get_body()
                if incoming_queue.delete_message(m):
                    print "  Message deleted successfully"
                else:
                    print "  Failed to delete message :("
        print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs (%.2fs excluding download time)" % (
        duration, timer.delta_sec(after_download))
    return 0
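
# Hedged sketches of the start_workers() and wait_for() helpers called in
# main() above. They are assumed to spawn one PipeStep subclass per process and
# join them; the real implementations may differ (e.g. in error handling).
from multiprocessing import Process

def start_workers(count, label, step_class, q_in, more_args):
    workers = []
    for i in range(count):
        w = Process(target=step_class,
                    name="%s-%d" % (label, i),
                    args=(i, label, q_in) + more_args)
        w.start()
        workers.append(w)
    return workers

def wait_for(workers, label):
    print "Waiting for", label, "to finish..."
    for w in workers:
        w.join()
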
Example #40
def main():
    parser = argparse.ArgumentParser(
        description="Split raw logs into partitioned files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000
    )
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file):
        record_count += 1
        if err:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(path, errors="replace")
        data = unicode(data, errors="replace")

        bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH
        # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        # print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read,
        duration,
        mb_read / duration,
        bad_record_count,
        record_count,
    )
    return 0
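
# A small, self-contained illustration of the path-to-dimensions split
# performed above. The document ID and field values are made-up examples.
sample_path = ("a1b2c3d4-0000-0000-0000-000000000000/saved_session/Firefox/"
               "29.0/nightly/20140305030201")
components = sample_path.split("/")
sample_key = components.pop(0)
sample_info = dict(zip(
    ["reason", "appName", "appVersion", "appUpdateChannel", "appBuildID"],
    components))
print sample_key, sample_info
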
Example #41
                total_count += 1
                if total_count % 5000 == 0:
                    print "Looked at", total_count, "total records in", timer.delta_sec(start_time), "seconds. Last key was", last_key
                try:
                    output_file.write(str(output_func(k)) + "\n")
                except Exception, e:
                    print "Error writing key", k.name, ":", e
                    traceback.print_exc()
            done = True
        except socket.error, e:
            print "Error listing keys:", e
            traceback.print_exc()
            print "Continuing from last seen key:", last_key

    output_file.close()
    print "Overall, listed", total_count, "in", timer.delta_sec(start_time), "seconds"

def main():
    parser = argparse.ArgumentParser(description="List S3 contents (with retry) to a file")
    parser.add_argument("--output-file", type=argparse.FileType('w'))
    parser.add_argument("--bucket", default="telemetry-published-v2")
    parser.add_argument("--prefix", default="")
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    
    if args.debug:
        boto.set_stream_logger('boto')

    list_files(args.bucket, args.output_file, prefix=args.prefix)
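
# Hedged usage sketch: calling list_files() directly instead of through the
# argparse entry point above. The output path is made up; the bucket name
# matches the default used above.
with open("/tmp/telemetry_keys.txt", "w") as out:
    list_files("telemetry-published-v2", out)
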
                                "Conversion Error", "invalid_control_char")
                    else:
                        # TODO: Recognize other common failure modes and handle
                        #       them gracefully.
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                "Conversion Error", "conversion_error")
                        self.log(traceback.format_exc())

                if self.print_stats:
                    this_update = now()
                    sec = timer.delta_sec(self.last_update, this_update)
                    if sec > 10.0:
                        self.last_update = this_update
                        self.log(self.stats.get_summary())

            duration = timer.delta_sec(start, now())
            mb_read = bytes_read / 1024.0 / 1024.0
            # Stats for the current file:
            self.log("Read %d records %.2fMB in %.2fs (%.2fMB/s)" % (
                    record_count, mb_read, duration, mb_read / duration))
        except Exception, e:
            # Corrupted data, let's skip this record.
            self.log("Error reading raw data from {0} {1}\n{2}".format(
                    raw_file, e, traceback.format_exc()))


    def write_bad_record(self, key, dims, data, error, message=None,
            bad_record_type=None):
        try:
            channel = self.schema.get_field(dims, "appUpdateChannel", True,
                    True)
                    # why they are being skipped.
                    if err_message != "Missing in payload: info.revision":
                        # TODO: recognize other common failure modes and handle them gracefully.
                        self.write_bad_record(key, dims, data, err_message,
                                              "Conversion Error:")
                        traceback.print_exc()

                if self.print_stats:
                    this_update = datetime.now()
                    sec = timer.delta_sec(self.last_update, this_update)
                    if sec > 10.0:
                        self.last_update = this_update
                        self.end_time = datetime.now()
                        self.dump_stats()

            duration = timer.delta_sec(start)
            mb_read = bytes_read / 1024.0 / 1024.0
            # Stats for the current file:
            print self.label, "- Read %d records %.2fMB in %.2fs (%.2fMB/s)" % (
                record_count, mb_read, duration, mb_read / duration)
        except Exception, e:
            # Corrupted data, let's skip this record.
            print self.label, "- Error reading raw data from ", raw_file, e
            traceback.print_exc()

    def write_bad_record(self, key, dims, data, error, message=None):
        self.bad_records += 1
        if message is not None:
            print self.label, message, error
        if self.bad_filename is not None:
            try:
                    continue
                exp_count += 1
                total_bytes += k.size
                last_key = k.name
                if total_count % 100 == 0:
                    logger.debug("Expired {} of {} total files in {}s. Last key was {}".format(
                        exp_count, total_count, timer.delta_sec(start_time), last_key))
                logger.info("Deleting {} from S3 bucket".format(k.name))
                sql_update = "DELETE FROM published_files WHERE file_name = '{0}';".format(k.name)
                if should_run(args.dry_run, logger, "Deleting from S3 bucket"):
                    k.delete()

                if should_run(args.dry_run, logger, "Notifying coordinator"):
                    db_cursor.execute(sql_update)
                    db_conn.commit()
                    logger.debug("Coordinator notified")
            done = True
        except socket.error, e:
            logger.error("Error listing keys: {}".format(e))
            logger.error(traceback.format_exc())
            logger.debug("Continuing from last seen key: {}".format(last_key))
    if db_conn is not None:
        db_conn.close()
    total_mb = round(total_bytes / 1024.0 / 1024.0, 2)
    logger.info("Overall, expired {} of {} files ({} MB) in {} seconds.".format(
        exp_count, total_count, total_mb, timer.delta_sec(start_time)))
    return 0

if __name__ == "__main__":
    sys.exit(main())
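
# Hedged sketch of the should_run() helper used by the expiry loop above: in
# dry-run mode it is assumed to log the intended action and return False so
# nothing is actually modified. The real helper may log differently.
def should_run(dry_run, logger, message):
    if dry_run:
        logger.info("Dry run: skipping '{}'".format(message))
        return False
    logger.debug(message)
    return True
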
Example #45
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-i",
                        "--input-file",
                        help="Filename to read from",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-f",
                        "--file-version",
                        help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(r.timestamp /
                                             1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")

        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[
            file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
                    # We don't need to write these bad records out - we know
                    # why they are being skipped.
                    if err_message != "Missing in payload: info.revision":
                        # TODO: recognize other common failure modes and handle them gracefully.
                        self.write_bad_record(key, dims, data, err_message, "Conversion Error:")
                        traceback.print_exc()

                if self.print_stats:
                    this_update = datetime.now()
                    sec = timer.delta_sec(self.last_update, this_update)
                    if sec > 10.0:
                        self.last_update = this_update
                        self.end_time = datetime.now()
                        self.dump_stats()

            duration = timer.delta_sec(start)
            mb_read = bytes_read / 1024.0 / 1024.0
            # Stats for the current file:
            print self.label, "- Read %d records %.2fMB in %.2fs (%.2fMB/s)" % (record_count, mb_read, duration, mb_read / duration)
        except Exception, e:
            # Corrupted data, let's skip this record.
            print self.label, "- Error reading raw data from ", raw_file, e
            traceback.print_exc()

    def write_bad_record(self, key, dims, data, error, message=None):
        self.bad_records += 1
        if message is not None:
            print self.label, message, error
        if self.bad_filename is not None:
            try:
                path = u"/".join([key] + dims)
                if total_count % 5000 == 0:
                    print "Looked at", total_count, "total records in", timer.delta_sec(
                        start_time), "seconds. Last key was", last_key
                try:
                    output_file.write(str(output_func(k)) + "\n")
                except Exception, e:
                    print "Error writing key", k.name, ":", e
                    traceback.print_exc()
            done = True
        except socket.error, e:
            print "Error listing keys:", e
            traceback.print_exc()
            print "Continuing from last seen key:", last_key

    output_file.close()
    print "Overall, listed", total_count, "in", timer.delta_sec(
        start_time), "seconds"


def main():
    parser = argparse.ArgumentParser(
        description="List S3 contents (with retry) to a file")
    parser.add_argument("--output-file", type=argparse.FileType('w'))
    parser.add_argument("--bucket", default="telemetry-published-v2")
    parser.add_argument("--prefix", default="")
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    if args.debug:
        boto.set_stream_logger('boto')
Example #48
                    else:
                        # TODO: Recognize other common failure modes and handle
                        #       them gracefully.
                        self.write_bad_record(key, dims, unpacked.data,
                                              err_message, "Conversion Error",
                                              "conversion_error")
                        self.log(traceback.format_exc())

                if self.print_stats:
                    this_update = now()
                    sec = timer.delta_sec(self.last_update, this_update)
                    if sec > 10.0:
                        self.last_update = this_update
                        self.log(self.stats.get_summary())

            duration = timer.delta_sec(start, now())
            mb_read = bytes_read / 1024.0 / 1024.0
            # Stats for the current file:
            self.log("Read %d records %.2fMB in %.2fs (%.2fMB/s)" %
                     (record_count, mb_read, duration, mb_read / duration))
        except Exception, e:
            # Corrupted data, let's skip this record.
            self.log("Error reading raw data from {0} {1}\n{2}".format(
                raw_file, e, traceback.format_exc()))

    def write_bad_record(self,
                         key,
                         dims,
                         data,
                         error,
                         message=None,
Example #49
    def run_mapper(
        self, mapper_id, inputs, work_dir, module, partition_count, delete_files, aws_key, aws_secret_key, s3_bucket
    ):
        self.work_dir = work_dir

        print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete."

        bytes_total = sum([f.size for f in inputs])
        bytes_completed = 0
        next_notice_pct = 5
        start = datetime.now()

        loader = None
        output_file = os.path.join(work_dir, "mapper_" + str(mapper_id))
        mapfunc = getattr(module, "map", None)
        context = Context(output_file, partition_count)
        if not callable(mapfunc):
            print "No map function!!!"
            sys.exit(1)

        for input_file in inputs:
            if input_file.remote:
                # Lazy load the loader (so we don't do it on "local only" jobs).
                if loader is None:
                    loader = s3util.Loader(
                        os.path.join(self.work_dir, "cache"),
                        s3_bucket,
                        aws_key=aws_key,
                        aws_secret_key=aws_secret_key,
                        poolsize=1,
                    )

                for local, remote, err in loader.get_list([input_file.name]):
                    if err is not None:
                        print "Failed to download", remote, ":", err
            line_num = 0
            full_filename = os.path.join(self.work_dir, "cache", input_file.name)

            for r, _ in heka_message.unpack_file(full_filename):
                msg = heka_message_parser.parse_heka_record(r)
                line_num += 1
                try:
                    mapfunc(msg["meta"]["documentId"], msg, context)
                except ValueError, e:
                    # TODO: increment "bad line" metrics.
                    print "Bad record:", input_file.name, ":", line_num, e
            if delete_files:
                os.remove(full_filename)

            bytes_completed += input_file.size
            completed_pct = (float(bytes_completed) / bytes_total) * 100
            if completed_pct >= next_notice_pct:
                next_notice_pct += 5
                duration_sec = timer.delta_sec(start)
                completed_mb = float(bytes_completed) / 1024.0 / 1024.0
                print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % (
                    mapper_id,
                    completed_pct,
                    completed_mb,
                    duration_sec,
                    completed_mb / duration_sec,
                )
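
# Hedged sketch of the analysis module expected by run_mapper() above: it must
# export a callable named "map" taking (documentId, message, context), matching
# the mapfunc call in the loop. The message fields and the context.write()
# method used here are assumptions for illustration only.
def map(document_id, message, context):
    channel = message.get("meta", {}).get("appUpdateChannel", "UNKNOWN")
    context.write(channel, 1)
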
                    if should_run(args.dry_run, logger,
                                  "Deleting from source bucket"):
                        k.delete()
                else:
                    logger.info(
                        "Not deleting source: either non-empty or same bucket: {}"
                        .format(k.name))

                if sql_update is None:
                    logger.error("Missing sql_update :(")
                else:
                    logger.info(sql_update)
                if should_run(args.dry_run, logger, "Notifying coordinator"):
                    #TODO
                    logger.debug("Should be actually notifying coordinator")

            done = True
        except socket.error, e:
            logger.error("Error listing keys: {}".format(e))
            logger.error(traceback.format_exc())
            logger.info("Continuing from last seen key: {}".format(last_key))
    total_mb = round(total_bytes / 1024.0 / 1024.0, 2)
    logger.info("Total bytes: {}".format(total_bytes))
    logger.info("Overall, listed {} files ({} MB) in {} seconds.".format(
        total_count, total_mb, timer.delta_sec(start_time)))
    return 0


if __name__ == "__main__":
    sys.exit(main())
def main():
    signal.signal(signal.SIGINT, handle_sigint)
    parser = argparse.ArgumentParser(
            description='Process incoming Telemetry data',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-c", "--config", required=True, type=file,
            help="AWS Configuration file (json)")
    parser.add_argument("-w", "--work-dir", required=True,
            help="Location to cache downloaded files")
    parser.add_argument("-o", "--output-dir", required=True,
            help="Base dir to store processed data")
    parser.add_argument("-i", "--input-files", type=file,
            help="File containing a list of keys to process")
    parser.add_argument("-b", "--bad-data-log",
            help="Save bad records to this file")
    parser.add_argument("-l", "--log-file",
            help="Log output to this file")
    parser.add_argument("-s", "--stats-file",
            help="Log statistics to this file")
    parser.add_argument("--histogram-cache-path", default="./histogram_cache",
            help="Path to store a local cache of histograms")
    parser.add_argument("-t", "--telemetry-schema", required=True,
            help="Location of the desired telemetry schema")
    parser.add_argument("-m", "--max-output-size", metavar="N", type=int,
            default=500000000, help="Rotate output files after N bytes")
    parser.add_argument("-D", "--dry-run", action="store_true",
            help="Don't modify remote files")
    parser.add_argument("-v", "--verbose", action="store_true",
            help="Print more detailed output")
    args = parser.parse_args()

    if args.verbose:
        # Turn on mp logging
        multiprocessing.log_to_stderr(logging.DEBUG)

    config = json.load(args.config)
    # TODO: allow commandline args to override config values.

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    logger = Log(args.log_file, "Master")
    num_cpus = multiprocessing.cpu_count()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    s3downloader = None
    raw_readers = None
    compressors = None
    exporters = None
    done = False

    if not args.dry_run:
        # Set up AWS connections
        conn = S3Connection(config.get("aws_key", None), config.get(
                "aws_secret_key", None))
        incoming_bucket = conn.get_bucket(config["incoming_bucket"])
        q_conn = boto.sqs.connect_to_region(config.get("aws_region", None),
                aws_access_key_id=config.get("aws_key", None),
                aws_secret_access_key=config.get("aws_secret_key", None))
        incoming_queue = q_conn.get_queue(config["incoming_queue"])
        if incoming_queue is None:
            logger.log("Error: could not get queue " + config["incoming_queue"])
            return -2

        logger.log("Verifying that we can write to " + config["publish_bucket"])
        try:
            publish_bucket = conn.get_bucket(config["publish_bucket"])
            logger.log("Looks good!")
        except S3ResponseError:
            logger.log("Bucket {0} not found. Attempting to create it.".format(
                    config["publish_bucket"]))
            publish_bucket = conn.create_bucket(config["publish_bucket"])
        s3downloader = s3util.Loader(args.work_dir, config["incoming_bucket"],
                poolsize=num_cpus, aws_key=config.get("aws_key", None),
                aws_secret_key=config.get("aws_secret_key", None))

    while not done:
        if args.dry_run:
            done = True
        try:
            start = now()
            incoming_filenames = []
            incoming_queue_messages = []
            logger.log("Fetching file list from queue " + config["incoming_queue"])
            if args.dry_run:
                logger.log("Dry run mode... can't read from the queue " \
                           "without messing things up...")
                if args.input_files:
                    logger.log("Fetching file list from file {}".format(
                            args.input_files))
                    incoming_filenames = [ l.strip() for l in args.input_files.readlines() ]
            else:
                # Sometimes we don't get all the messages, even if more are
                # available, so keep trying until we have enough (or there
                # aren't any left)
                for i in range(num_cpus):
                    messages = incoming_queue.get_messages(num_cpus - len(incoming_filenames))
                    for m in messages:
                        # Make sure this file exists in S3 first
                        possible_filename = m.get_body()
                        key = incoming_bucket.get_key(possible_filename)
                        if key is None:
                            logger.log("Could not find queued filename in" \
                                       " bucket {0}: {1}".format(
                                            config["incoming_bucket"],
                                            possible_filename))
                            # try to delete it:
                            incoming_queue.delete_message(m)
                        else:
                            incoming_filenames.append(possible_filename)
                            incoming_queue_messages.append(m)
                    if len(messages) == 0 or len(incoming_filenames) >= num_cpus:
                        break
            logger.log("Done")

            if len(incoming_filenames) == 0:
                logger.log("Nothing to do! Sleeping...")
                time.sleep(5)
                continue

            for f in incoming_filenames:
                logger.log("  " + f)

            before_download = now()
            logger.log("Downloading {0} files...".format(len(incoming_filenames)))
            local_filenames = []
            download_stats = Stats("Downloader", args.stats_file, logger)
            if args.dry_run:
                logger.log("Dry run mode: skipping download from S3")
                local_filenames = [ os.path.join(args.work_dir, f) for f in incoming_filenames ]
            else:
                for local_filename, remote_filename, err in s3downloader.get_list(incoming_filenames):
                    if err is None:
                        local_filenames.append(local_filename)
                    else:
                        # s3downloader already retries 3 times.
                        logger.log("Error downloading {0} Error: {1}".format(
                                local_filename, err))
                        download_stats.increment(
                                records_read=len(incoming_filenames),
                                records_written=len(local_filenames),
                                bad_records=1)
                        download_stats.save()
                        return 2
            downloaded_bytes = sum([os.path.getsize(f) for f in local_filenames])
            download_stats.increment(records_read=len(incoming_filenames),
                    records_written=len(local_filenames),
                    bytes_read=downloaded_bytes,
                    bytes_written=downloaded_bytes)
            logger.log(download_stats.get_summary())
            download_stats.save()
            after_download = now()

            raw_files = Queue()
            for l in local_filenames:
                raw_files.put(l)

            completed_files = Queue()

            # Begin reading raw input
            raw_readers = start_workers(logger, num_cpus, "Reader", ReadRawStep,
                    raw_files, (completed_files, args.log_file, args.stats_file,
                    schema, converter, storage, args.bad_data_log))

            # Tell readers to stop when they get to the end:
            finish_queue(raw_files, num_cpus)

            # Compress completed files.
            compressors = start_workers(logger, num_cpus, "Compressor",
                    CompressCompletedStep, completed_files, (None,
                    args.log_file, args.stats_file))
            wait_for(logger, raw_readers, "Raw Readers")

            # `find <out_dir> -type f -not -name ".compressme"`
            # Add them to completed_files
            for root, dirs, files in os.walk(args.output_dir):
                for f in files:
                    if f.endswith(".log"):
                        completed_files.put(os.path.join(root, f))

            # Tell compressors to stop:
            finish_queue(completed_files, num_cpus)
            wait_for(logger, compressors, "Compressors")

            shutdown_requested = False
            try:
                # Export compressed files to S3.
                compressed_files = Queue()
                exporters = start_workers(logger, num_cpus, "Exporter",
                        ExportCompressedStep, compressed_files, (args.log_file,
                        args.stats_file, args.output_dir, config, args.dry_run))
                for root, dirs, files in os.walk(args.output_dir):
                    for f in files:
                        if f.endswith(StorageLayout.COMPRESSED_SUFFIX):
                            compressed_files.put(os.path.join(root, f))
                finish_queue(compressed_files, num_cpus)
                wait_for(logger, exporters, "Exporters")
            except InterruptProcessingError, e:
                logger.log("Received shutdown request... waiting for " \
                           "exporters to finish")
                shutdown_requested = True
                shutdown_stats = Stats("ShutdownDuringExport", args.stats_file,
                    logger)
                shutdown_stats.increment(records_read=1)
                shutdown_stats.save()
                done = True
                wait_for(logger, exporters, "Exporters")
                logger.log("OK, cleaning up")

            logger.log("Removing processed logs from S3...")
            for f in incoming_filenames:
                if args.dry_run:
                    logger.log("  Dry run, so not really deleting " + f)
                else:
                    logger.log("  Deleting " + f)
                    incoming_bucket.delete_key(f)
                    # Delete file locally too.
                    os.remove(os.path.join(args.work_dir, f))
            logger.log("Done")

            if len(incoming_queue_messages) > 0:
                logger.log("Removing processed messages from SQS...")
                for m in incoming_queue_messages:
                    if args.dry_run:
                        logger.log("  Dry run, so not really deleting " \
                                   "{0}".format(m.get_body()))
                    else:
                        logger.log("  Deleting {0}".format(m.get_body()))
                        if incoming_queue.delete_message(m):
                            logger.log("  Message deleted successfully")
                        else:
                            logger.log("  Failed to delete message :(")
                logger.log("Done")

            if shutdown_requested:
                shutdown_stats.increment(records_written=1)
                shutdown_stats.save()
            all_done = now()
            duration = timer.delta_sec(start, all_done)
            logger.log("All done in %.2fs (%.2fs excluding download time)" % (
                duration, timer.delta_sec(after_download, all_done)))
        except InterruptProcessingError, e:
            logger.log("Received normal shutdown request... quittin' time!")
            if raw_readers is not None:
                terminate(logger, raw_readers, "Readers")
            if compressors is not None:
                terminate(logger, compressors, "Compressors")
            if exporters is not None:
                terminate(logger, exporters, "Exporters")

            done = True
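
# Hedged sketch of the finish_queue() helper used in the processing loop above:
# it is assumed to push one sentinel per worker so each PipeStep knows when to
# stop, matching the explicit sentinel loops in the other examples on this page.
def finish_queue(q, num_workers):
    for i in range(num_workers):
        q.put(PipeStep.SENTINEL)
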
class ReadRawStep(PipeStep):
    def __init__(self, num, name, raw_files, completed_files, schema,
                 converter, storage, bad_filename):
        self.schema = schema
        self.converter = converter
        self.storage = storage
        self.bad_filename = bad_filename
        PipeStep.__init__(self, num, name, raw_files, completed_files)

    def setup(self):
        self.expected_dim_count = len(self.schema._dimensions)

    def handle(self, raw_file):
        print self.label, "reading", raw_file
        try:
            record_count = 0
            bytes_read = 0
            start = datetime.now()
            for len_path, len_data, timestamp, path, data, err in fileutil.unpack(
                    raw_file):
                record_count += 1
                self.records_read += 1
                if err:
                    print self.label, "ERROR: Found corrupted data for record", record_count, "in", raw_file, "path:", path, "Error:", err
                    self.bad_records += 1
                    continue
                if len(data) == 0:
                    print self.label, "ERROR: Found empty data for record", record_count, "in", raw_file, "path:", path
                    self.bad_records += 1
                    continue

                # Incoming timestamps are in milliseconds, so convert to POSIX first
                # (ie. seconds)
                submission_date = date.fromtimestamp(timestamp /
                                                     1000).strftime("%Y%m%d")
                path = fileutil.to_unicode(path)
                #print "Path for record", record_count, path, "length of data:", len_data

                if data[0] != "{":
                    # Data looks weird, should be JSON.
                    print self.label, "Warning: Found unexpected data for record", record_count, "in", raw_file, "path:", path, "data:"
                    print data
                else:
                    # Raw JSON, make sure we treat it as unicode.
                    data = fileutil.to_unicode(data)

                current_bytes = len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH[
                    "v1"]
                bytes_read += current_bytes
                self.bytes_read += current_bytes
                path_components = path.split("/")
                if len(path_components) != self.expected_dim_count:
                    # We're going to pop the ID off, but we'll also add the
                    # submission date, so it evens out.
                    print self.label, "Found an invalid path in record", record_count, path
                    continue

                key = path_components.pop(0)
                info = {}
                info["reason"] = path_components.pop(0)
                info["appName"] = path_components.pop(0)
                info["appVersion"] = path_components.pop(0)
                info["appUpdateChannel"] = path_components.pop(0)
                info["appBuildID"] = path_components.pop(0)
                dims = self.schema.dimensions_from(info, submission_date)

                try:
                    # Convert data:
                    if self.converter is None:
                        serialized_data = data
                        data_version = 1
                    else:
                        parsed_data, parsed_dims = self.converter.convert_json(
                            data, dims[-1])
                        # TODO: take this out if it's too slow
                        for i in range(len(dims)):
                            if dims[i] != parsed_dims[i]:
                                print self.label, "Record", self.records_read, "mismatched dimension", i, dims[
                                    i], "!=", parsed_dims[i]
                        serialized_data = self.converter.serialize(parsed_data)
                        dims = parsed_dims
                        data_version = 2
                    try:
                        # Write to persistent storage
                        n = self.storage.write(key, serialized_data, dims,
                                               data_version)
                        self.bytes_written += len(key) + len(
                            serialized_data) + 1
                        self.records_written += 1
                        # Compress rotated files as we generate them
                        if n.endswith(
                                StorageLayout.PENDING_COMPRESSION_SUFFIX):
                            self.q_out.put(n)
                    except Exception, e:
                        self.write_bad_record(key, dims, serialized_data,
                                              str(e),
                                              "ERROR Writing to output file:")
                except BadPayloadError, e:
                    self.write_bad_record(key, dims, data, e.msg,
                                          "Bad Payload:")
                except Exception, e:
                    err_message = str(e)

                    # We don't need to write these bad records out - we know
                    # why they are being skipped.
                    if err_message != "Missing in payload: info.revision":
                        # TODO: recognize other common failure modes and handle them gracefully.
                        self.write_bad_record(key, dims, data, err_message,
                                              "Conversion Error:")
                        traceback.print_exc()

                if self.print_stats:
                    this_update = datetime.now()
                    sec = timer.delta_sec(self.last_update, this_update)
                    if sec > 10.0:
                        self.last_update = this_update
                        self.end_time = datetime.now()
                        self.dump_stats()
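
# Hedged sketch of the fileutil.to_unicode() helper used in handle() above,
# mirroring the unicode(..., errors="replace") calls seen in the other examples
# on this page; the real helper may pick a different codec.
def to_unicode(value):
    if isinstance(value, unicode):
        return value
    return unicode(value, errors="replace")
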
def main():
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket",
                        help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket",
                        help="The S3 bucket to save processed files")
    parser.add_argument("-n",
                        "--num-helpers",
                        metavar="N",
                        help="Start N helper processes",
                        type=int,
                        default=1)
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s",
                        "--aws-secret-key",
                        help="AWS Secret Key",
                        required=True)
    parser.add_argument("-w",
                        "--work-dir",
                        help="Location to cache downloaded files",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base dir to store processed data",
                        required=True)
    parser.add_argument("-i",
                        "--input-files",
                        help="File containing a list of keys to process",
                        type=file)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Location of the desired telemetry schema",
                        required=True)
    args = parser.parse_args()

    # TODO: keep track of partial success so that subsequent runs are idempotent.

    start = datetime.now()
    conn = S3Connection(args.aws_key, args.aws_secret_key)
    incoming_bucket = conn.get_bucket(args.incoming_bucket)
    incoming_filenames = []
    if args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [l.strip() for l in args.input_files.readlines()]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    for f in incoming_filenames:
        print "  ", f

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    result = fetch_s3_files(incoming_filenames, args.work_dir,
                            args.incoming_bucket, args.aws_key,
                            args.aws_secret_key)
    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    print "Splitting raw logs..."
    local_filenames = [
        os.path.join(args.work_dir, f) for f in incoming_filenames
    ]
    result = split_raw_logs(local_filenames, args.output_dir,
                            args.telemetry_schema)
    if result != 0:
        print "Error splitting logs. Return code was", result
        return result
    print "Done"

    print "Converting split logs..."
    result = convert_split_logs(args.output_dir)
    if result != 0:
        print "Error converting logs. Return code was", result
        return result
    print "Done"

    print "Exporting converted logs back to S3..."
    result = export_converted_logs(args.output_dir, args.publish_bucket,
                                   args.aws_key, args.aws_secret_key)
    if result != 0:
        print "Error exporting logs. Return code was", result
        return result
    print "Done"

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        print "  Deleting", f
        incoming_bucket.delete_key(f)
    print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs" % (duration)
    return 0
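
# The fetch_s3_files() helper called above isn't shown here (main() reports the
# s3funnel return code). As a hedged stand-in, this sketch downloads each key
# with boto instead; the signature mirrors the call above, but the behaviour is
# only an approximation of the real helper.
import os
from boto.s3.connection import S3Connection

def fetch_s3_files_boto(filenames, work_dir, bucket_name, aws_key,
                        aws_secret_key):
    if not os.path.isdir(work_dir):
        os.makedirs(work_dir)
    conn = S3Connection(aws_key, aws_secret_key)
    bucket = conn.get_bucket(bucket_name)
    for f in filenames:
        key = bucket.get_key(f)
        if key is None:
            print "Missing key in", bucket_name, ":", f
            return -1
        key.get_contents_to_filename(os.path.join(work_dir, f))
    return 0
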
def main():
    args = get_args()
    logging.basicConfig()
    logger = logging.getLogger(__name__)
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    logger.info("Expiring `flash_video` data older than {}.".format(args.expiry_date))
    logger.debug("Connecting to S3...")
    conn = S3Connection(args.aws_key, args.aws_secret_key)
    bucket = conn.get_bucket(args.bucket)

    connection_string = ""
    if hasattr(args, "db_name"):
        connection_string += "dbname={0} ".format(args.db_name)
    if hasattr(args, "db_host"):
        connection_string += "host={0} ".format(args.db_host)
    if hasattr(args, "db_port"):
        connection_string += "port={0} ".format(args.db_port)
    if hasattr(args, "db_user"):
        connection_string += "user={0} ".format(args.db_user)
    if hasattr(args, "db_pass"):
        connection_string += "password={0} ".format(args.db_pass)

    db_conn = None
    db_cursor = None
    if should_run(args.dry_run, logger, "Connecting to database"):
        db_conn = psycopg2.connect(connection_string)
        db_cursor = db_conn.cursor()

    prefix = args.prefix
    last_key = ''
    done = False
    total_count = 0
    exp_count = 0
    total_bytes = 0
    start_time = datetime.now()
    while not done:
        try:
            for k in bucket.list(prefix=prefix, marker=last_key):
                if k.name.endswith('/'):
                    logger.debug("Skipping directory '{}'".format(k.name))
                    continue
                total_count += 1
                if not should_expire(k.name, args.expiry_date, logger):
                    continue
                exp_count += 1
                total_bytes += k.size
                last_key = k.name
                if total_count % 100 == 0:
                    logger.debug("Expired {} of {} total files in {}s. Last key was {}".format(
                        exp_count, total_count, timer.delta_sec(start_time), last_key))
                logger.info("Deleting {} from S3 bucket".format(k.name))
                sql_update = "DELETE FROM published_files WHERE file_name = '{0}';".format(k.name)
                if should_run(args.dry_run, logger, "Deleting from S3 bucket"):
                    k.delete()

                if should_run(args.dry_run, logger, "Notifying coordinator"):
                    db_cursor.execute(sql_update)
                    db_conn.commit()
                    logger.debug("Coordinator notified")
            done = True
        except socket.error, e:
            logger.error("Error listing keys: {}".format(e))
            logger.error(traceback.format_exc())
            logger.debug("Continuing from last seen key: {}".format(last_key))
def main():
    parser = argparse.ArgumentParser(description='Process incoming Telemetry data', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket", help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket", help="The S3 bucket to save processed files")
    parser.add_argument("-n", "--num-helpers", metavar="N", help="Start N helper processes", type=int, default=1)
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", required=True)
    parser.add_argument("-w", "--work-dir", help="Location to cache downloaded files", required=True)
    parser.add_argument("-o", "--output-dir", help="Base dir to store processed data", required=True)
    parser.add_argument("-i", "--input-files", help="File containing a list of keys to process", type=file)
    parser.add_argument("-t", "--telemetry-schema", help="Location of the desired telemetry schema", required=True)
    args = parser.parse_args()

    # TODO: keep track of partial success so that subsequent runs are idempotent.

    start = datetime.now()
    conn = S3Connection(args.aws_key, args.aws_secret_key)
    incoming_bucket = conn.get_bucket(args.incoming_bucket)
    incoming_filenames = []
    if args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [ l.strip() for l in args.input_files.readlines() ]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    for f in incoming_filenames:
        print "  ", f

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    result = fetch_s3_files(incoming_filenames, args.work_dir, args.incoming_bucket, args.aws_key, args.aws_secret_key)
    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    print "Splitting raw logs..."
    local_filenames = [os.path.join(args.work_dir, f) for f in incoming_filenames]
    result = split_raw_logs(local_filenames, args.output_dir, args.telemetry_schema)
    if result != 0:
        print "Error splitting logs. Return code was", result
        return result
    print "Done"

    print "Converting split logs..."
    result = convert_split_logs(args.output_dir)
    if result != 0:
        print "Error converting logs. Return code was", result
        return result
    print "Done"

    print "Exporting converted logs back to S3..."
    result = export_converted_logs(args.output_dir, args.publish_bucket, args.aws_key, args.aws_secret_key)
    if result != 0:
        print "Error exporting logs. Return code was", result
        return result
    print "Done"

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        print "  Deleting", f
        incoming_bucket.delete_key(f)
    print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs" % (duration)
    return 0
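fetch_s3_files() is referenced above but not included in this excerpt; the error message suggests the real implementation shells out to s3funnel. As a rough boto-only stand-in (the signature and behaviour are assumptions based on the call above, where a bucket name is passed in), it could look something like this:

import os
from boto.s3.connection import S3Connection

def fetch_s3_files(filenames, work_dir, bucket_name, aws_key, aws_secret_key):
    # Hypothetical replacement for the s3funnel call: download each key into
    # work_dir and return 0 on success, non-zero on the first failure.
    conn = S3Connection(aws_key, aws_secret_key)
    bucket = conn.get_bucket(bucket_name)
    for f in filenames:
        key = bucket.get_key(f)
        if key is None:
            print "Missing key in bucket:", f
            return 1
        local_path = os.path.join(work_dir, f)
        local_dir = os.path.dirname(local_path)
        if local_dir and not os.path.isdir(local_dir):
            os.makedirs(local_dir)
        key.get_contents_to_filename(local_path)
    return 0

Note that the longer main() below passes a Bucket object rather than a bucket name to the same helper, so the real function presumably accepts either form; the sketch above only handles the name.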
def main():
    parser = argparse.ArgumentParser(description='Process incoming Telemetry data', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket", help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket", help="The S3 bucket to save processed files")
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key", required=True)
    parser.add_argument("-r", "--aws-region", help="AWS Region", default="us-west-2")
    parser.add_argument("-w", "--work-dir", help="Location to cache downloaded files", required=True)
    parser.add_argument("-o", "--output-dir", help="Base dir to store processed data", required=True)
    parser.add_argument("-i", "--input-files", help="File containing a list of keys to process", type=file)
    parser.add_argument("-b", "--bad-data-log", help="Save bad records to this file")
    parser.add_argument("-q", "--queue", help="SQS Queue name to poll for incoming data")
    parser.add_argument("-c", "--histogram-cache-path", help="Path to store a local cache of histograms", default="./histogram_cache")
    parser.add_argument("-t", "--telemetry-schema", help="Location of the desired telemetry schema", required=True)
    parser.add_argument("-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000)
    parser.add_argument("-D", "--dry-run", help="Don't modify remote files", action="store_true")
    parser.add_argument("-C", "--skip-conversion", help="Skip validation/conversion of payloads", action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(S3FUNNEL_PATH):
        print "ERROR: s3funnel not found at", S3FUNNEL_PATH
        print "You can get it from github: https://github.com/sstoiana/s3funnel"
        return -1

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    if args.skip_conversion:
        converter = None
    else:
        converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    num_cpus = multiprocessing.cpu_count()

    start = datetime.now()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    incoming_queue_messages = []

    if not args.dry_run:
        conn = S3Connection(args.aws_key, args.aws_secret_key)
        incoming_bucket = conn.get_bucket(args.incoming_bucket)

    incoming_filenames = []
    if args.queue is not None:
        print "Fetching file list from queue", args.queue
        if args.dry_run:
            print "Dry run mode... can't read from the queue without messing things up..."
        else:
            q_conn = boto.sqs.connect_to_region(args.aws_region,
                    aws_access_key_id=args.aws_key,
                    aws_secret_access_key=args.aws_secret_key)
            incoming_queue = q_conn.get_queue(args.queue)
            if incoming_queue is None:
                print "Error: could not get queue", args.queue
                return -2
            # Sometimes we don't get all the messages, even if more are
            # available, so keep trying until we have enough (or there aren't
            # any left)
            for i in range(num_cpus):
                messages = incoming_queue.get_messages(num_cpus - len(incoming_filenames))
                for m in messages:
                    # Check that the queued filename actually exists in S3 before accepting it.
                    possible_filename = m.get_body()
                    key = incoming_bucket.get_key(possible_filename)
                    if key is None:
                        print "Could not find queued filename in bucket", args.incoming_bucket, ":", possible_filename
                        # try to delete it:
                        incoming_queue.delete_message(m)
                    else:
                        incoming_filenames.append(possible_filename)
                        incoming_queue_messages.append(m)
                if len(messages) == 0 or len(incoming_filenames) >= num_cpus:
                    break
    elif args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [ l.strip() for l in args.input_files.readlines() ]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    if len(incoming_filenames) == 0:
        print "Nothing to do!"
        return 0

    for f in incoming_filenames:
        print "  ", f

    print "Verifying that we can write to", args.publish_bucket
    if args.dry_run:
        print "Dry run mode: don't care!"
    else:
        try:
            publish_bucket = conn.get_bucket(args.publish_bucket)
            print "Looks good!"
        except S3ResponseError:
            print "Bucket", args.publish_bucket, "not found.  Attempting to create it."
            publish_bucket = conn.create_bucket(args.publish_bucket)

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    if args.dry_run:
        print "Dry run mode: skipping download from S3"
    else:
        result = fetch_s3_files(incoming_filenames, args.work_dir,
                incoming_bucket, args.aws_key, args.aws_secret_key)

    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    after_download = datetime.now()

    local_filenames = [os.path.join(args.work_dir, f) for f in incoming_filenames]

    # TODO: try a SimpleQueue
    raw_files = Queue()
    for l in local_filenames:
        raw_files.put(l)

    completed_files = Queue()
    compressed_files = Queue()

    # Begin reading raw input
    raw_readers = start_workers(num_cpus, "Reader", ReadRawStep, raw_files,
            (completed_files, schema, converter, storage, args.bad_data_log))

    # Tell readers when to stop:
    for i in range(num_cpus):
        raw_files.put(PipeStep.SENTINEL)

    # Compress completed files.
    compressors = start_workers(num_cpus, "Compressor", CompressCompletedStep,
            completed_files, (compressed_files,))

    # Export compressed files to S3.
    exporters = start_workers(num_cpus, "Exporter", ExportCompressedStep,
            compressed_files, (args.output_dir, args.aws_key,
                args.aws_secret_key, args.publish_bucket, args.dry_run))

    wait_for(raw_readers, "Raw Readers")

    # Sweep the output dir for any leftover .log files that still need
    # compressing and add them to completed_files.
    for root, dirs, files in os.walk(args.output_dir):
        for f in files:
            if f.endswith(".log"):
                completed_files.put(os.path.join(root, f))

    for i in range(num_cpus):
        completed_files.put(PipeStep.SENTINEL)

    wait_for(compressors, "Compressors")
    for i in range(num_cpus):
        compressed_files.put(PipeStep.SENTINEL)

    wait_for(exporters, "Exporters")

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        if args.dry_run:
            print "  Dry run, so not really deleting", f
        else:
            print "  Deleting", f
            incoming_bucket.delete_key(f)
            # Delete file locally too.
            os.remove(os.path.join(args.work_dir, f))
    print "Done"

    if len(incoming_queue_messages) > 0:
        print "Removing processed messages from SQS..."
        for m in incoming_queue_messages:
            if args.dry_run:
                print "  Dry run, so not really deleting", m.get_body()
            else:
                print "  Deleting", m.get_body()
                if incoming_queue.delete_message(m):
                    print "  Message deleted successfully"
                else:
                    print "  Failed to delete message :("
        print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs (%.2fs excluding download time)" % (duration, timer.delta_sec(after_download))
    return 0
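The pipeline above relies on start_workers() and wait_for(), which are defined elsewhere in the project and not shown here. A minimal sketch of the pattern, assuming each PipeStep subclass does its work when constructed with a name, its input queue, and the extra argument tuple, and stops when it reads PipeStep.SENTINEL (all of this is inferred from the call sites, not taken from the real code):

from multiprocessing import Process

def start_workers(count, label, step_class, input_queue, extra_args):
    # Spawn `count` processes, each running one stage of the pipeline.
    workers = []
    for i in range(count):
        name = "%s-%d" % (label, i)
        p = Process(target=step_class, name=name,
                    args=(name, input_queue) + tuple(extra_args))
        p.start()
        workers.append(p)
    return workers

def wait_for(workers, label):
    # Block until every worker in this stage has exited.
    print "Waiting for", label, "..."
    for w in workers:
        w.join()
    print label, "finished."

The sentinel-per-worker pattern in main() -- putting one PipeStep.SENTINEL on each queue per CPU -- is what lets every process see exactly one "stop" marker and exit cleanly.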
def main():
    args = get_args()
    logging.basicConfig()
    logger = logging.getLogger(__name__)
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.WARNING)

    if not os.path.exists(args.work_dir):
        os.makedirs(args.work_dir)

    logger.info("Sanitizing FirefoxOS data from {} and moving it to {}".format(
        args.source_bucket, args.dest_bucket))
    logger.debug("Connecting to S3...")
    conn = S3Connection(args.aws_key, args.aws_secret_key)
    source_bucket = conn.get_bucket(args.source_bucket)
    dest_bucket = conn.get_bucket(args.dest_bucket)

    compress_cmd = [StorageLayout.COMPRESS_PATH
                    ] + StorageLayout.COMPRESSION_ARGS
    prefix = args.prefix
    last_key = ''
    done = False
    total_count = 0
    total_bytes = 0
    start_time = datetime.now()
    dupe_map = {}
    while not done:
        try:
            for k in source_bucket.list(prefix=prefix, marker=last_key):
                if k.name.endswith('/'):
                    logger.debug("Skipping directory '{}'".format(k.name))
                    continue
                if skip_by_date(k.name, args.min_date, logger):
                    logger.debug("Skipping file older than {}: {}".format(
                        args.min_date, k.name))
                    continue
                total_count += 1
                total_bytes += k.size
                last_key = k.name
                if total_count % 100 == 0:
                    logger.info(
                        "Looked at {} total records in {} seconds. Last key was {}"
                        .format(total_count, timer.delta_sec(start_time),
                                last_key))
                logger.debug("Fetching {} from source bucket".format(k.name))
                full_source_filename = os.path.join(args.work_dir, "__source",
                                                    k.name)
                full_dest_filename = os.path.join(args.work_dir, "__dest",
                                                  k.name)

                # Ensure that the necessary local dirs exist:
                for f in [full_source_filename, full_dest_filename]:
                    dirname = os.path.dirname(f)
                    if dirname != '' and not os.path.exists(dirname):
                        os.makedirs(dirname)
                logger.debug("Getting '{}' to '{}'".format(
                    k.name, full_source_filename))
                k.get_contents_to_filename(full_source_filename)

                logger.info("Removing pingIDs...")
                tmp_out_file = full_dest_filename + ".tmp"
                out_handle = open(tmp_out_file, "w")
                logger.debug("Uncompressing...")
                if full_source_filename.endswith(
                        StorageLayout.COMPRESSED_SUFFIX):
                    decompress_cmd = [StorageLayout.COMPRESS_PATH
                                      ] + StorageLayout.DECOMPRESSION_ARGS
                    raw_handle = open(full_source_filename, "rb")
                    # Popen the decompressing version of StorageLayout.COMPRESS_PATH
                    p_decompress = Popen(decompress_cmd,
                                         bufsize=65536,
                                         stdin=raw_handle,
                                         stdout=PIPE,
                                         stderr=sys.stderr)
                    handle = p_decompress.stdout
                else:
                    handle = open(full_source_filename, "r")
                    raw_handle = None

                logger.debug("Generating new pingIDs...")
                for line in handle:
                    # Lines are of the form <key><tab><json payload><newline>.
                    # Split on tab character to get the pieces.
                    key, payload = line.split(u"\t", 1)
                    # Replace key with a fresh UUID:
                    if key in dupe_map:
                        logger.info(
                            "Already saw key {}, skipping any more occurrences"
                            .format(key))
                    else:
                        new_key = str(uuid4())
                        dupe_map[key] = new_key
                        out_handle.write(u"%s\t%s" % (new_key, payload))

                handle.close()
                out_handle.close()
                if raw_handle:
                    raw_handle.close()

                sql_update = None
                empty_result = False
                if os.stat(tmp_out_file).st_size > 0:
                    logger.debug("Compressing new file...")
                    f_comp = open(full_dest_filename, "wb")
                    f_raw = open(tmp_out_file, "r", 1)
                    p_compress = Popen(compress_cmd,
                                       bufsize=65536,
                                       stdin=f_raw,
                                       stdout=f_comp,
                                       stderr=sys.stderr)
                    p_compress.communicate()
                    f_raw.close()
                    f_comp.close()
                    local_md5, size = fu.md5file(full_dest_filename)
                    sql_update = "UPDATE published_files SET " \
                          "file_md5 = '{0}', " \
                          "file_size = {1}, " \
                          "bucket_name = '{2}' " \
                          "WHERE file_name = '{3}';".format(local_md5, size,
                            dest_bucket.name, k.name)
                else:
                    # Don't upload empty files.
                    empty_result = True
                    sql_update = "DELETE FROM published_files WHERE file_name = '{0}';".format(
                        k.name)
                    logger.debug(
                        "File was empty, skipping: {}".format(tmp_out_file))

                logger.info(
                    "Removing temp output file: {}".format(tmp_out_file))
                os.remove(tmp_out_file)

                if not empty_result and should_run(args.dry_run, logger,
                                                   "Uploading to dest bucket"):
                    dest_key = dest_bucket.new_key(k.name)
                    dest_key.set_contents_from_filename(full_dest_filename)
                    # Compare the md5 to be sure it succeeded. Note the ETag
                    # only equals the MD5 for non-multipart uploads.
                    dest_md5 = dest_key.etag[1:-1]
                    local_md5, size = fu.md5file(full_dest_filename)
                    if dest_md5 != local_md5:
                        raise Exception(
                            "Failed to upload {}".format(full_dest_filename))

                if should_run(
                        args.dry_run, logger, "Removing input file: {}".format(
                            full_source_filename)):
                    os.remove(full_source_filename)

                if not empty_result and should_run(
                        args.dry_run, logger,
                        "Removing output file: {}".format(full_dest_filename)):
                    os.remove(full_dest_filename)

                if empty_result or args.source_bucket != args.dest_bucket:
                    if should_run(args.dry_run, logger,
                                  "Deleting from source bucket"):
                        k.delete()
                else:
                    logger.info(
                        "Not deleting source: either non-empty or same bucket: {}"
                        .format(k.name))

                if sql_update is None:
                    logger.error("Missing sql_update :(")
                else:
                    logger.info(sql_update)
                if should_run(args.dry_run, logger, "Notifying coordinator"):
                    # TODO: actually notify the coordinator.
                    logger.debug("This is where the coordinator would be notified")

            done = True
        except socket.error, e:
            logger.error("Error listing keys: {}".format(e))
            logger.error(traceback.format_exc())
            logger.info("Continuing from last seen key: {}".format(last_key))
    total_mb = round(total_bytes / 1024.0 / 1024.0, 2)
    logger.info("Total bytes: {}".format(total_bytes))
    logger.info(
        "Overall, listed {} files ({} MB) in {} seconds.".format(total_count, total_mb, timer.delta_sec(start_time))
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())