Example #1
def reload_read_request_queues(job_description_file, job_ids, redis_host,
                               redis_port, redis_db, skip_phase_zero,
                               skip_phase_one, phase_zero_sample_size):
    """Regenerate read requests for the given jobs from the job description's
    input directory and reload them into the coordinator database's read
    request queues."""

    with open(job_description_file, 'r') as fp:
        job_description = json.load(fp)

    coordinator_db = redis_utils.CoordinatorDB(redis_host, redis_port,
                                               redis_db)

    input_files = input_file_utils.gather_input_file_paths(
        coordinator_db, job_description["input_directory"])

    phases = []

    if not skip_phase_zero:
        phases.append(0)

    if not skip_phase_one:
        phases.append(1)

    read_requests = input_file_utils.generate_read_requests(
        input_files, phase_zero_sample_size, job_ids, phases)

    input_file_utils.load_read_requests(coordinator_db, read_requests)
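
For context, here is a minimal sketch of how this helper might be driven from the command line. The function above assumes its module already imports json plus the project-local redis_utils and input_file_utils helpers; the argparse flag names, defaults, and the main() wrapper below are illustrative assumptions, not part of the original code.

# Hypothetical CLI wrapper (flag names and defaults are assumptions); it only
# forwards the parsed options to reload_read_request_queues() above.
import argparse

def main():
    parser = argparse.ArgumentParser(
        description="Regenerate read request queues for existing jobs")
    parser.add_argument("job_description_file", help="JSON job description")
    parser.add_argument("job_ids", nargs="+", type=int)
    parser.add_argument("--redis_host", default="localhost")
    parser.add_argument("--redis_port", type=int, default=6379)
    parser.add_argument("--redis_db", type=int, default=0)
    parser.add_argument("--skip_phase_zero", action="store_true")
    parser.add_argument("--skip_phase_one", action="store_true")
    parser.add_argument("--phase_zero_sample_size", type=int, default=100000)
    args = parser.parse_args()

    reload_read_request_queues(
        args.job_description_file, args.job_ids, args.redis_host,
        args.redis_port, args.redis_db, args.skip_phase_zero,
        args.skip_phase_one, args.phase_zero_sample_size)

if __name__ == "__main__":
    main()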
    def run_batch(self, batch_jobs, batch_inputs):
        """Run a batch of jobs: create the batch log directory, resolve
        sampling and phase-skip parameters, set up barriers, generate and
        load read requests, and start phase zero on all node coordinators."""
        batch_id = self.coordinator_db.next_batch_id

        log.info("Running batch %d with the following job(s): %s" %
                 (batch_id, ', '.join(map(str, batch_jobs))))

        # Create log directory for the current batch
        batch_logs = create_batch_directory(self.log_directory, batch_id)

        # Copy description files to the log directory
        description_dir = os.path.join(
            os.path.dirname(__file__), os.pardir, os.pardir, os.pardir,
            "tritonsort", "mapreduce", "description")
        shutil.copy(os.path.join(description_dir, "stages.json"), batch_logs)
        shutil.copy(os.path.join(description_dir, "structure.json"), batch_logs)

        # Copy config file to log directory
        shutil.copy(self.config_file, batch_logs)

        self.ready_for_next_batch = False

        # Pull out relevant phase zero parameters
        phase_zero_sample_rate = 1  # Sample 100% by default
        if "SAMPLE_RATE" in self.config:
            phase_zero_sample_rate = float(self.config["SAMPLE_RATE"])
        phase_zero_sample_points_per_file = 1  # Sample prefixes by default
        if "SAMPLES_PER_FILE" in self.config:
            phase_zero_sample_points_per_file = \
                int(self.config["SAMPLES_PER_FILE"])
        fixed_key_length = None
        if "MAP_INPUT_FIXED_KEY_LENGTH" in self.config:
            fixed_key_length = int(self.config["MAP_INPUT_FIXED_KEY_LENGTH"])
        fixed_value_length = None
        if "MAP_INPUT_FIXED_VALUE_LENGTH" in self.config:
            fixed_value_length = \
                int(self.config["MAP_INPUT_FIXED_VALUE_LENGTH"])

        # If the application config file (yaml) or the job spec file (json)
        # skips a phase, we should not load read requests for that phase. The
        # job spec file should override the application config file.
        skip_phase_zero = 0
        skip_phase_one = 0
        skip_phase_two = 0
        skip_phase_three = 0
        if "SKIP_PHASE_ZERO" in self.config and self.config["SKIP_PHASE_ZERO"]:
            skip_phase_zero = 1
        if "SKIP_PHASE_ONE" in self.config and self.config["SKIP_PHASE_ONE"]:
            skip_phase_one = 1
        if "SKIP_PHASE_TWO" in self.config and self.config["SKIP_PHASE_TWO"]:
            skip_phase_two = 1
        if "SKIP_PHASE_THREE" in self.config and \
                self.config["SKIP_PHASE_THREE"]:
            skip_phase_three = 1

        # The run_job.py script verifies that all jobs in the batch have the
        # same value of these skip parameters in the job specs, so we can just
        # check the first one.
        for key, value in (
            self.coordinator_db.job_params(batch_jobs[0]).items()):
            if key == "SKIP_PHASE_ZERO":
                skip_phase_zero = value
            if key == "SKIP_PHASE_ONE":
                skip_phase_one = value
            if key == "SKIP_PHASE_TWO":
                skip_phase_two = value
            if key == "SKIP_PHASE_THREE":
                skip_phase_three = value
            if key == "MAP_INPUT_FIXED_KEY_LENGTH":
                fixed_key_length = int(value)
            if key == "MAP_INPUT_FIXED_VALUE_LENGTH":
                fixed_value_length = int(value)

        fixed_tuple_length = None
        if fixed_key_length is not None and fixed_value_length is not None:
            fixed_tuple_length = fixed_key_length + fixed_value_length

        use_replication = False
        if "OUTPUT_REPLICATION_LEVEL" in self.config and \
                int(self.config["OUTPUT_REPLICATION_LEVEL"]) > 1:
            use_replication = True

        phases = []
        if not skip_phase_zero:
            phases.append(0)
        if not skip_phase_one:
            phases.append(1)
        if not skip_phase_two and use_replication:
            # If we're using replication, phase two will have network transfer,
            # use barriers to guarantee sockets are connected.
            phases.append(2)
        if not skip_phase_three and use_replication:
            # If we're using replication, phase three will have network
            # transfer, use barriers to guarantee sockets are connected.
            phases.append(3)

        # Setup barriers
        self.coordinator_db.create_barriers(phases, batch_id, batch_jobs)

        # Generate read requests for the jobs in the batch
        read_requests = generate_read_requests(
            job_inputs=batch_inputs,
            phase_zero_sample_rate=phase_zero_sample_rate,
            phase_zero_sample_points_per_file=(
                phase_zero_sample_points_per_file),
            tuple_start_offset=fixed_tuple_length,
            job_ids=batch_jobs, phases=phases)

        # Load read requests into read request queue for each worker
        load_read_requests(self.coordinator_db, read_requests)

        start_time = time.time()
        # Mark phase zero as starting now.
        self.coordinator_db.begin_phase(batch_id, "phase_zero")
        self.batch_phase_info[batch_id] = ("phase_zero", 0, start_time)
        log.info("Running phase_zero...")
        print_keyboard_commands()

        for job_id in batch_jobs:
            self.coordinator_db.update_job_status(
                job_id, {
                    "start_time": str(start_time),
                    "batch_id": batch_id,
                    "date": time.asctime()
                })

        self.coordinator_db.add_jobs_to_batch(batch_id, batch_jobs)

        self.coordinator_db.mark_batch_incomplete(batch_id)

        # Setting current_batch will cause all node coordinators to start work
        # on that batch
        self.coordinator_db.add_batch_to_node_coordinator_batch_queues(batch_id)
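
The phase-skip handling above layers two sources: the application config (yaml) supplies the base values and the job spec (json) overrides them whenever it defines a key. Below is a standalone sketch of that precedence rule, assuming config and job_params are plain dicts; the helper name is hypothetical and not part of the original class.

# Illustration only: resolves the SKIP_PHASE_* flags with the application
# config as the base and the job spec overriding any flag it defines,
# mirroring the rule used in run_batch. Both arguments are plain dicts here.
def resolve_skip_flags(config, job_params):
    flags = {}
    for key in ("SKIP_PHASE_ZERO", "SKIP_PHASE_ONE",
                "SKIP_PHASE_TWO", "SKIP_PHASE_THREE"):
        # Base value from the application config (truthy means skip)...
        value = 1 if config.get(key) else 0
        # ...overridden by the job spec whenever it sets the key at all.
        if key in job_params:
            value = job_params[key]
        flags[key] = value
    return flags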