def chunk_input_data(data_path, lines_per_partition=500):
    """
    Create partitions of lines_per_partition lines each
    :return: list of partition file paths
    """
    partition = -1
    partition_paths = []
    partition_handlers = []
    fs = SimpleFileSystem()
    with fs.open(data_path, 'r') as f:
        for index, line in enumerate(f):
            if index % lines_per_partition == 0:
                # Clean up the previous partition's open descriptor
                if partition_handlers:
                    fs.close(partition_handlers[partition])
                # Create a new partition
                partition += 1
                partition_paths.append(fs.get_writeable_file_path())
                partition_handlers.append(
                    fs.open(partition_paths[partition], 'w'))
            partition_handlers[partition].write(line)
    # Guard against an empty input file, which creates no partitions at all
    if partition_handlers:
        fs.close(partition_handlers[partition])
    return partition_paths
def is_valid_file_path(self, path):
    """
    Check if the provided file path is valid
    :param path: the file path
    :return: boolean
    """
    sf = SimpleFileSystem()
    try:
        sf.close(sf.open(path, 'r'))
        return True
    except FileNotFoundError:
        return False
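# The functions in this section assume a SimpleFileSystem helper exposing the
# methods they call: open, close, get_writeable_file_path, get_partition_files,
# get_mapper_output_file, get_output_file, get_file_with_name and
# clean_directories. Its real implementation lives elsewhere in the project;
# the stub below sketches only the simplest part of that assumed surface (a
# thin wrapper over local files plus a scratch-file factory), and the scratch
# directory name is purely illustrative.
import os
import tempfile


class SimpleFileSystemStub:
    SCRATCH_DIR = os.path.join(tempfile.gettempdir(), 'simple_mapreduce')  # assumed layout

    def open(self, path, mode):
        # Plain local-file access; the real class may add its own bookkeeping
        return open(path, mode)

    def close(self, handle):
        handle.close()

    def get_writeable_file_path(self):
        # Hand back a unique scratch path the caller can open for writing
        os.makedirs(self.SCRATCH_DIR, exist_ok=True)
        fd, path = tempfile.mkstemp(dir=self.SCRATCH_DIR)
        os.close(fd)
        return path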
def reduce(self):
    key_vals_map = {}
    fs = SimpleFileSystem()
    files = fs.get_partition_files(self.partition_num)
    # Group values by key across every mapper spill file for this partition
    for file_path in files:
        part_file = fs.open(file_path, 'r')
        for line in part_file:
            line = line.strip()
            key, value = line.split('\t')
            if self.slow_mode:
                time.sleep(0.001)
            if key in key_vals_map:
                key_vals_map[key].append(value)
            else:
                key_vals_map[key] = [value]
            self.progress += len(line)
        fs.close(part_file)

    # Run the user-supplied reducer over each key group, in the same
    # (partition index, key) order the mappers used
    output = []
    for key, values in sorted(
            key_vals_map.items(),
            key=lambda pair: (hashcode(pair[0]) % self.num_workers, pair[0])):
        if self.slow_mode:
            time.sleep(0.001)
        reducer = self.reducer_cls()
        reducer.reduce(key, values, output)
        self.progress += len(key)

    output_file = fs.open(fs.get_output_file(self.partition_num), 'w')
    for key, value in output:
        output_file.write('%s\t%s\n' % (key, value))
    fs.close(output_file)
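# reduce() above and map() below both route keys to partitions with
# hashcode(key) % num_workers, which only works if hashcode() is deterministic
# across worker processes (Python's built-in hash() is salted per interpreter,
# so it cannot be used directly). The project's real hashcode() is not shown
# here; the sketch below is one possible deterministic choice, a Java-style
# 31x string hash, given only as an illustrative assumption.
def hashcode(s):
    """Deterministic, non-negative 32-bit string hash (Java String.hashCode style)."""
    h = 0
    for ch in s:
        h = (31 * h + ord(ch)) & 0xFFFFFFFF
    return h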
def map(self):
    """ Simple count map """
    output = []
    for line in self.in_stream:
        # slow_mode throttles each record; this delay makes the task roughly
        # ten times slower and can be tuned up or down
        if self.slow_mode:
            time.sleep(0.001)
        self.mapper.map(self.key, line, output)
        self.progress += len(line)

    # Before spilling output to disk, sort the (key, value) pairs by
    # (partition index, key) so each partition file is written in key order
    output.sort(
        key=lambda pair: (hashcode(pair[0]) % self.num_workers, pair[0]))

    # Spill to disk, one file per reduce partition
    partition_files = []
    sf = SimpleFileSystem()
    for i in range(self.num_workers):
        partition_files.append(sf.open(sf.get_mapper_output_file(i), 'w'))
    for key, value in output:
        partition_num = hashcode(key) % self.num_workers
        partition_files[partition_num].write('%s\t%s\n' % (key, value))
    for partition_file in partition_files:
        sf.close(partition_file)
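# The tasks above drive user-supplied instruction classes through
# mapper.map(key, line, output) and reducer.reduce(key, values, output), where
# each call appends (key, value) tuples to the shared output list, and the
# worker's JOB_START handling later in this section loads those classes by the
# names 'Mapper' and 'Reducer' from the instructions module. The word-count
# pair below is a minimal sketch matching those call signatures; it is an
# illustrative example, not necessarily the job class shipped with the project.
class Mapper:
    def map(self, key, line, output):
        # Emit (word, 1) for every whitespace-separated token in the line
        for word in line.strip().split():
            output.append((word, 1))


class Reducer:
    def reduce(self, key, values, output):
        # Values arrive as strings read back from the mapper spill files,
        # so cast before summing
        output.append((key, sum(int(v) for v in values)))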
def get_job_result_file_path(num_partitions):
    # For demo and testing purposes, we combine and sort the final partitions
    # here. Note: this negates the distributed advantages, so remove it for
    # real workloads.
    sf = SimpleFileSystem()
    path = sf.get_writeable_file_path()
    path_sorted = sf.get_writeable_file_path()
    f = sf.open(path, 'w')
    for i in range(num_partitions):
        pf = sf.open(sf.get_file_with_name('partition_{}'.format(i)), 'r')
        f.write(pf.read())
        sf.close(pf)
    sf.close(f)
    os.system('cat {} | sort -k1,1 > {}'.format(path, path_sorted))
    return path_sorted
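# get_job_result_file_path() above shells out to `cat | sort -k1,1`, which
# assumes a Unix environment. If portability matters, the same sort step could
# be done in Python; the helper below is a sketch of that alternative under the
# assumption that result lines are key\tvalue pairs, not what the project
# actually does.
def sort_result_file_in_python(path, path_sorted):
    with open(path, 'r') as src:
        lines = src.readlines()
    # Order by the first tab-separated field, mirroring sort -k1,1 on the key
    lines.sort(key=lambda line: line.split('\t', 1)[0])
    with open(path_sorted, 'w') as dst:
        dst.writelines(lines)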
def assign_job(self, conn, job):
    # Hacky workaround: the mapper reads a single file from data_path, while
    # the reducer reads several files from a partition directory, so the
    # chunk size is computed differently for each case
    if job.data_path is None:
        conn.chunk_size = sum([
            os.path.getsize(file)
            for file in SimpleFileSystem().get_partition_files(job.partition_num)
        ])
    else:
        conn.chunk_size = os.path.getsize(job.data_path)
    job.pre_execute()
    job.client = conn
    job.pending_assignment = True
    conn.current_job = job
    conn.send_message(JobReadyMessage(str(job.id)))
def chunk_input_data_by_size_and_workers(data_path, n_workers):
    """
    Create n_workers partitions of a roughly equal number of bytes
    :return: list of partition file paths
    """
    partition = -1
    partition_paths = []
    partition_handlers = []
    # TODO main server should be aware of these values as they impact the
    # progress measure
    file_size = os.path.getsize(data_path)
    # Round up, since rounding down can produce more chunks than workers
    # (very bad)
    chunk_size = ceil(file_size / (1.0 * n_workers))
    fs = SimpleFileSystem()
    bytes_chunked = inf  # force creation of the first partition immediately
    with fs.open(data_path, 'r') as f:
        for line in f:
            if bytes_chunked >= chunk_size:
                # Clean up the previous partition's open descriptor
                if partition_handlers:
                    fs.close(partition_handlers[partition])
                # Create a new partition
                partition += 1
                partition_paths.append(fs.get_writeable_file_path())
                partition_handlers.append(
                    fs.open(partition_paths[partition], 'w'))
                bytes_chunked = 0
            partition_handlers[partition].write(line)
            # Assumes one character is one byte (safe for ASCII input)
            bytes_chunked += len(line)
    # Guard against an empty input file, which creates no partitions at all
    if partition_handlers:
        fs.close(partition_handlers[partition])
    return partition_paths
def initialize_job(self, submitter, mapper_name, reducer_name, data_file_path):
    """
    Record the new job's state and create its mapping tasks
    :return:
    """
    self.job_submitter_connection = submitter
    self.job_started = True
    self.mapping = True
    self.reducing = False
    self.mapper_name = mapper_name
    self.reducer_name = reducer_name

    # Partitions are distributed equally based on the number of subscribed workers
    self.num_workers = self.get_num_subscribed_workers()

    # Monitor utilization of worker resources during the job
    self.begin_monitor_job_efficiency()

    SimpleFileSystem().clean_directories()
    setup_mapping_tasks(data_file_path, mapper_name, self.num_workers,
                        self.sub_jobs, self.get_next_job_id)
def do_processing(self):
    if self.message_read_queue:
        message = self.message_read_queue.pop(0)
        if message.m_type is MessageTypes.JOB_READY:
            if not self.has_job:
                self.has_job = True
                self.message_write_queue.append(JobReadyToReceiveMessage())
        elif message.m_type is MessageTypes.JOB_INSTRUCTIONS_FILE:
            self.instructions_file = JobInstructionsFileMessage.get_path_from_message(
                message)
            self.instructions_type = JobInstructionsFileMessage.get_type_from_message(
                message)
            self.num_workers = JobInstructionsFileMessage.get_num_workers_from_message(
                message)
            self.partition_num = JobInstructionsFileMessage.get_partition_num_from_message(
                message)
        elif message.m_type is MessageTypes.DATAFILE:
            self.data_path = message.get_body()
        elif message.m_type is MessageTypes.JOB_START:
            if not self.ready_to_start():
                return

            # Start job
            fs = SimpleFileSystem()
            pkg = importlib.import_module(self.instructions_file)
            instructions_class = getattr(pkg, self.instructions_type)

            in_file = None
            if self.data_path:
                in_file = fs.open(self.data_path, 'r')

            if self.instructions_type == 'Mapper':
                # Pass the instruction class to the mapper task
                task = Mapper(self.data_path, instructions_class,
                              self.num_workers, in_stream=in_file,
                              slow_mode=self.slow_mode)
            elif self.instructions_type == 'Reducer':
                # Pass the instruction class to the reducer task
                task = Reducer(instructions_class, self.num_workers,
                               self.partition_num, slow_mode=self.slow_mode)

            # The beat method sends status reports to the server on a separate
            # thread to avoid blocking during the actual map/reduce
            task.SetBeatMethod(lambda: [
                self.connection.send_message(
                    JobHeartbeatMessage(
                        str(task.progress),
                        str(task.progress / (time.time() - task.start_time)))),
                self.connection.write(),
            ])

            # Completion actions run when the map/reduce step finishes;
            # this doesn't actually have to be on a different thread
            task.SetDieMethod(lambda: [
                self.message_write_queue.append(JobDoneMessage()),
                self.prep_for_new_job()
            ])

            task.run()

            if in_file:
                fs.close(in_file)