Example #1
def chunk_input_data(data_path, lines_per_partition=500):
    """
    Create n partitions of 500 lines each
    :return:
    """
    partition = -1
    partition_paths = []
    partition_handlers = []

    fs = SimpleFileSystem()
    with fs.open(data_path, 'r') as f:
        for index, line in enumerate(f):
            if index % lines_per_partition == 0:
                # Clean up open descriptor
                if partition_handlers:
                    fs.close(partition_handlers[partition])

                # Create new partition
                partition += 1
                partition_paths.append(fs.get_writeable_file_path())
                partition_handlers.append(
                    fs.open(partition_paths[partition], 'w'))

            partition_handlers[partition].write(line)

    # Close the last partition (guard against an empty input file)
    if partition_handlers:
        fs.close(partition_handlers[partition])

    return partition_paths
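
For comparison, a minimal standalone sketch of the same line-count chunking using only the standard library; tempfile stands in for the project's SimpleFileSystem, and the function name is illustrative rather than PyMapReduce API.

import tempfile

def chunk_by_lines(data_path, lines_per_partition=500):
    """Split data_path into temporary files of at most lines_per_partition lines."""
    partition_paths = []
    out = None
    with open(data_path, 'r') as f:
        for index, line in enumerate(f):
            if index % lines_per_partition == 0:
                if out:
                    out.close()
                # delete=False keeps the partition on disk after the handle is closed
                out = tempfile.NamedTemporaryFile('w', suffix='.part', delete=False)
                partition_paths.append(out.name)
            out.write(line)
    if out:
        out.close()
    return partition_paths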
Example #2
    def is_valid_file_path(self, path):
        """
        Check if the provided file path is valid
        :param path: the file path
        :return: boolean
        """
        sf = SimpleFileSystem()
        try:
            sf.close(sf.open(path, 'r'))
            return True
        except FileNotFoundError:
            return False
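
A rough standalone equivalent, with a hypothetical name: opening and immediately closing the file also confirms read permission, which a plain os.path.isfile() check would not.

def is_readable_file(path):
    """Return True if path exists and can be opened for reading."""
    try:
        with open(path, 'r'):
            return True
    except (FileNotFoundError, PermissionError, IsADirectoryError):
        return False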
Example #3
    def reduce(self):
        key_vals_map = {}
        fs = SimpleFileSystem()
        files = fs.get_partition_files(self.partition_num)

        for file_path in files:
            file = fs.open(file_path, 'r')
            for line in file:
                line = line.strip()
                key, value = line.split('\t')

                if self.slow_mode:
                    time.sleep(0.001)

                if key in key_vals_map:
                    key_vals_map[key].append(value)
                else:
                    key_vals_map[key] = [value]

                self.progress += len(line)

            # Close each partition file once it has been fully read
            fs.close(file)

        output = []
        for key, value in sorted(key_vals_map.items(), key=lambda pair: (hashcode(pair[0]) % self.num_workers, pair[0])):
            if self.slow_mode:
                time.sleep(0.001)
            reducer = self.reducer_cls()
            reducer.reduce(key, value, output)
            self.progress += len(key)

        output_file = fs.open(fs.get_output_file(self.partition_num), 'w')
        for key, value in output:
            output_file.write('%s\t%s\n' % (key, value))
        fs.close(output_file)
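
The same shuffle-and-reduce idea can be shown without the project's SimpleFileSystem or reducer classes; the sketch below assumes word-count semantics (integer values summed per key) and uses illustrative names.

from collections import defaultdict

def reduce_partition(partition_files, output_path):
    """Group tab-separated (key, value) lines by key and reduce each group."""
    key_vals = defaultdict(list)
    for path in partition_files:
        with open(path, 'r') as f:
            for line in f:
                key, value = line.rstrip('\n').split('\t')
                key_vals[key].append(value)

    with open(output_path, 'w') as out:
        for key in sorted(key_vals):
            # word-count style reducer: sum the integer values for each key
            out.write('%s\t%d\n' % (key, sum(int(v) for v in key_vals[key])))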
Example #4
    def map(self):
        """
        Simple count map
        """
        output = []
        for line in self.in_stream:
            # the sleep duration can be tuned for slower or faster runs;
            # this value makes the map roughly ten times slower
            if self.slow_mode:
                time.sleep(0.001)
            self.mapper.map(self.key, line, output)
            self.progress += len(line)

        # Before spilling output to disk, the mapper sorts the (key, value)
        # pairs by (partition index, key) so each partition file is written
        # in key order
        output.sort(
            key=lambda pair: (hashcode(pair[0]) % self.num_workers, pair[0]))

        # Spill to disk
        partition_files = []
        sf = SimpleFileSystem()
        for i in range(self.num_workers):
            partition_files.append(sf.open(sf.get_mapper_output_file(i), 'w'))

        for key, value in output:
            partition_num = hashcode(key) % self.num_workers
            partition_files[partition_num].write('%s\t%s\n' % (key, value))

        for partition_file in partition_files:
            sf.close(partition_file)
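
The core idea here is hash partitioning: each pair is routed to partition hashcode(key) % num_workers, so every occurrence of a key ends up at the same reducer. A self-contained sketch of that routing, using zlib.crc32 in place of the project's hashcode() (Python's built-in hash() is randomized per process, so it is not a good substitute); path_for_partition is a hypothetical callable.

import zlib

def partition_for(key, num_workers):
    """Deterministic partition index for a string key."""
    return zlib.crc32(key.encode('utf-8')) % num_workers

def spill_sorted(pairs, num_workers, path_for_partition):
    """Sort (key, value) pairs by (partition, key) and write one file per partition."""
    pairs.sort(key=lambda kv: (partition_for(kv[0], num_workers), kv[0]))
    handles = [open(path_for_partition(i), 'w') for i in range(num_workers)]
    try:
        for key, value in pairs:
            handles[partition_for(key, num_workers)].write('%s\t%s\n' % (key, value))
    finally:
        for handle in handles:
            handle.close()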
Example #5
File: job.py Project: rhavens/PyMapReduce
def get_job_result_file_path(num_partitions):
    # For demo and testing purposes, we combine and sort the final partitions here.
    # Note: this negates the distributed advantage, so remove it for real workloads
    sf = SimpleFileSystem()
    path = sf.get_writeable_file_path()
    path_sorted = sf.get_writeable_file_path()
    f = sf.open(path, 'w')

    for i in range(num_partitions):
        pf = sf.open(sf.get_file_with_name('partition_{}'.format(i)), 'r')
        f.write(pf.read())
        pf.close()

    sf.close(f)

    os.system('cat {} | sort -k1,1 > {}'.format(path, path_sorted))

    return path_sorted
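
The `cat | sort` pipeline can be replaced with subprocess so no shell is involved and unusual file names cannot break the command; this sketch still assumes a POSIX sort on the PATH, just as the original does.

import subprocess

def sort_concatenated(input_paths, output_path):
    """Concatenate the partition files and sort by the first field (like `sort -k1,1`)."""
    with open(output_path, 'w') as out:
        # sort accepts multiple input files directly, so `cat` is unnecessary
        subprocess.run(['sort', '-k1,1'] + list(input_paths), stdout=out, check=True)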
Example #6
    def assign_job(self, conn, job):
        # Hacky workaround: the mapper reads a single file from data_path,
        # while the reducer reads multiple files from a partition directory
        if job.data_path is None:
            conn.chunk_size = sum(
                os.path.getsize(file) for file in
                SimpleFileSystem().get_partition_files(job.partition_num)
            )
        else:
            conn.chunk_size = os.path.getsize(job.data_path)

        job.pre_execute()
        job.client = conn
        job.pending_assignment = True
        conn.current_job = job
        conn.send_message(JobReadyMessage(str(job.id)))
Example #7
def chunk_input_data_by_size_and_workers(data_path, n_workers):
    """
    Create n_workers partitions of roughly equal number of bytes
    :return:
    """
    partition = -1
    partition_paths = []
    partition_handlers = []

    # TODO main server should be aware of these values as they impact the
    # progress measure
    file_size = os.path.getsize(data_path)
    # round up since rounding down can cause more chunks than workers (very bad)
    chunk_size = ceil(file_size / (1.0 * n_workers))

    fs = SimpleFileSystem()
    bytes_chunked = inf  # immediately create partition
    with fs.open(data_path, 'r') as f:
        for line in f:
            if bytes_chunked >= chunk_size:
                # Clean up open descriptor
                if partition_handlers:
                    fs.close(partition_handlers[partition])

                # Create new partition
                partition += 1
                partition_paths.append(fs.get_writeable_file_path())
                partition_handlers.append(
                    fs.open(partition_paths[partition], 'w'))

                bytes_chunked = 0

            partition_handlers[partition].write(line)
            # assuming each character is one byte (hopefully a safe assumption)
            bytes_chunked += len(line)

    # Close the last partition (guard against an empty input file)
    if partition_handlers:
        fs.close(partition_handlers[partition])

    return partition_paths
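
Why the ceil matters: with 10,000,000 bytes and 3 workers, rounding up gives chunk_size = 3,333,334 and exactly 3 chunks, while flooring to 3,333,333 leaves a single byte over for a fourth chunk, producing more chunks than workers. A quick check:

from math import ceil

file_size, n_workers = 10_000_000, 3
chunk_size = ceil(file_size / n_workers)  # 3_333_334 -> exactly 3 chunks
floored = file_size // n_workers          # 3_333_333 -> a 4th chunk of 1 byte
print(ceil(file_size / chunk_size), ceil(file_size / floored))  # 3 4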
Example #8
    def initialize_job(self, submitter, mapper_name, reducer_name,
                       data_file_path):
        """

        :return:
        """
        self.job_submitter_connection = submitter
        self.job_started = True
        self.mapping = True
        self.reducing = False
        self.mapper_name = mapper_name
        self.reducer_name = reducer_name
        # equal partitions distributed based off number of workers
        self.num_workers = self.get_num_subscribed_workers()

        # monitor utilization of worker resources during job
        self.begin_monitor_job_efficiency()

        SimpleFileSystem().clean_directories()
        setup_mapping_tasks(data_file_path, mapper_name, self.num_workers,
                            self.sub_jobs, self.get_next_job_id)
Example #9
    def do_processing(self):
        if self.message_read_queue:
            message = self.message_read_queue.pop(0)

            if message.m_type is MessageTypes.JOB_READY:
                if not self.has_job:
                    self.has_job = True
                    self.message_write_queue.append(JobReadyToReceiveMessage())

            elif message.m_type is MessageTypes.JOB_INSTRUCTIONS_FILE:
                self.instructions_file = JobInstructionsFileMessage.get_path_from_message(
                    message)
                self.instructions_type = JobInstructionsFileMessage.get_type_from_message(
                    message)
                self.num_workers = JobInstructionsFileMessage.get_num_workers_from_message(
                    message)
                self.partition_num = JobInstructionsFileMessage.get_partition_num_from_message(
                    message)
            elif message.m_type is MessageTypes.DATAFILE:
                self.data_path = message.get_body()
            elif message.m_type is MessageTypes.JOB_START:
                if not self.ready_to_start():
                    return

                # Start job
                fs = SimpleFileSystem()

                pkg = importlib.import_module(self.instructions_file)
                instructions_class = getattr(pkg, self.instructions_type)

                in_file = None
                if self.data_path:
                    in_file = fs.open(self.data_path, 'r')

                if self.instructions_type == 'Mapper':
                    # pass instruction class to mapper
                    task = Mapper(self.data_path,
                                  instructions_class,
                                  self.num_workers,
                                  in_stream=in_file,
                                  slow_mode=self.slow_mode)
                elif self.instructions_type == 'Reducer':
                    # pass instruction class to reducer
                    task = Reducer(instructions_class,
                                   self.num_workers,
                                   self.partition_num,
                                   slow_mode=self.slow_mode)
                else:
                    # Unknown instruction type; close any input and bail out
                    if in_file:
                        fs.close(in_file)
                    return

                # beat method will send status reports to the server
                # on a separate thread to avoid blocking during the
                # actual map/reduce
                task.SetBeatMethod(lambda: [
                    self.connection.send_message(
                        JobHeartbeatMessage(
                            str(task.progress),
                            str(task.progress /
                                (time.time() - task.start_time)))),
                    self.connection.write(),
                ])
                # completion actions upon finishing map/reduce steps
                # this doesn't actually have to be on a different thread
                task.SetDieMethod(lambda: [
                    self.message_write_queue.append(JobDoneMessage()),
                    self.prep_for_new_job()
                ])

                task.run()

                if in_file:
                    fs.close(in_file)
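
The SetBeatMethod/SetDieMethod callbacks above report progress off the main map/reduce loop. A minimal standalone sketch of the same heartbeat pattern with a daemon thread; the function name, interval, and callback signatures are assumptions, not PyMapReduce API.

import threading
import time

def start_heartbeat(get_progress, send, interval=1.0):
    """Call send(progress, rate) every `interval` seconds until the returned event is set."""
    stop = threading.Event()
    start_time = time.time()

    def beat():
        # Event.wait returns False on timeout, True once stop.set() is called
        while not stop.wait(interval):
            progress = get_progress()
            send(progress, progress / (time.time() - start_time))

    threading.Thread(target=beat, daemon=True).start()
    return stop

The worker would call set() on the returned event from its completion callback, mirroring the role of SetDieMethod above.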