def parse_file(file_name, cores, jobs_per_core, stats):
    file_size = os.path.getsize(file_name)
    chunks = int(cores * jobs_per_core)
    stats.initial_report(file_name, file_size, chunks, cores)

    # Map phase: hand each byte range of the file to a parse_chunk worker,
    # keeping at most `cores` processes busy at a time.
    queue = pprocess.Queue(limit=cores)
    parse_chunk_async = queue.manage(pprocess.MakeParallel(parse_chunk))
    temp_dir = tempfile.mkdtemp('.wf2')
    try:
        for (start, end) in file_offsets(file_size, chunks):
            parse_chunk_async(file_name, temp_dir, start, end)

        total = dict(
            (count_name, LogCounter(count_name))
            for count_name in count_names
        )

        # Reduce phase: each worker pickles its mapper to a temp file and
        # returns the file name; merge its counters as the results arrive.
        stats.waiting()
        for (temp_file_name, job_pid, job_time) in queue:
            stats.received_job_result()
            start_reduce_time = time.time()
            mapper = pickle.load(open(temp_file_name, 'rb'))
            for (count_name, counter) in mapper.get_counters().iteritems():
                total[count_name].add_counter(counter)
            os.remove(temp_file_name)
            stats.job_report(job_pid, job_time, time.time() - start_reduce_time)
            stats.waiting()
    finally:
        shutil.rmtree(temp_dir)

    for name in count_names:
        print total[name].report()
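The worker side of this temp-file version is not shown in these excerpts. Below is a minimal sketch of what parse_chunk is assumed to do, inferred from how the reduce loop unpacks its result: parse its byte range, pickle a mapper object that exposes get_counters() into temp_dir, and return a (temp_file_name, pid, elapsed_time) tuple. The ChunkMapper class and the skipped line parsing are placeholders, not the original code; count_names and seek_open are the names used elsewhere in these listings.

import os
import time
import pickle
import tempfile

class ChunkMapper(object):
    # Placeholder for the real mapper: one counter mapping per count name.
    def __init__(self, names):
        self.counters = dict((name, {}) for name in names)

    def get_counters(self):
        return self.counters

def parse_chunk(file_name, temp_dir, start, end):
    start_time = time.time()
    mapper = ChunkMapper(count_names)
    for line in seek_open(file_name, start, end):
        pass  # update mapper.counters from the parsed line (omitted here)
    # Pickle the mapper into temp_dir; the parent reads and deletes the file.
    (fd, temp_file_name) = tempfile.mkstemp(dir=temp_dir)
    temp_file = os.fdopen(fd, 'wb')
    pickle.dump(mapper, temp_file)
    temp_file.close()
    return (temp_file_name, os.getpid(), time.time() - start_time)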
def parse_file(file_name, cores, jobs_per_core, stats):
    file_size = os.path.getsize(file_name)
    chunks = int(cores * jobs_per_core)
    queue = pprocess.Queue(limit=cores)
    parse_chunk_async = queue.manage(pprocess.MakeParallel(parse_chunk))
    temp_dir = tempfile.mkdtemp('.wf2')

    # One long-running reducer process per count, fed over a pprocess channel.
    mappers = {}
    for count_name in count_names:
        mappers[count_name] = pprocess.start(map_count, count_name, temp_dir)

    # Map phase: queue one parse_chunk job per byte range, tagged with a job id.
    c = 0
    for (start, end) in file_offsets(file_size, chunks):
        c += 1
        parse_chunk_async(file_name, temp_dir, start, end, c)
    stats._output('all map jobs queued')

    # As each map job finishes, tell every reducer which job's output is ready.
    for job_id in queue:
        start_reduce_time = time.time()
        stats._output('map job finished: pid=%d' % job_id)
        for c in range(len(count_names)):
            mappers[count_names[c]].send(job_id)
    stats._output('all map jobs finished')

    # Shut the reducers down (send None) and print each one's pickled report.
    for (mapper, count_name) in ((mappers[count_name], count_name)
                                 for count_name in count_names):
        mapper.send(None)
        print pickle.load(open(mapper.receive(), 'rb'))
        stats._output('reduce job finished: name=%s' % count_name)

    shutil.rmtree(temp_dir)
def parse_file(file_name, cores, jobs_per_core, stats):
    file_size = os.path.getsize(file_name)
    chunks = int(cores * jobs_per_core)
    stats.initial_report(file_name, file_size, chunks, cores)

    # Map phase: queue one parse_chunk job per byte range.
    queue = pprocess.Queue(limit=cores)
    parse_chunk_async = queue.manage(pprocess.MakeParallel(parse_chunk))
    for (start, end) in file_offsets(file_size, chunks):
        parse_chunk_async(file_name, start, end)

    total = dict(
        (count_name, LogCounter(count_name))
        for count_name in count_names
    )

    # Reduce phase: in this version each worker returns its mapper object
    # directly through the queue, so there is no temp file to read or remove.
    stats.waiting()
    for (mapper, job_pid, job_time) in queue:
        stats.received_job_result()
        start_reduce_time = time.time()
        for (count_name, counter) in mapper.get_counters().iteritems():
            total[count_name].add_counter(counter)
        stats.job_report(job_pid, job_time, time.time() - start_reduce_time)
        stats.waiting()

    for name in count_names:
        print total[name].report()
def read_seek():
    for (start, end) in file_offsets(os.path.getsize(file_name), n_pieces):
        # print "reading piece: %d -> %d" % (start, end)
        for line in seek_open(file_name, start, end):
            yield line
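Neither file_offsets nor seek_open appears in these excerpts. Below is a minimal sketch of plausible implementations, assuming file_offsets divides the file into contiguous (start, end) byte ranges and seek_open yields only the lines that begin inside a given range; the boundary handling is an assumption, not necessarily the author's.

def file_offsets(file_size, n_pieces):
    # Split file_size bytes into n_pieces contiguous (start, end) ranges,
    # with the last range absorbing any remainder. (Assumed helper.)
    piece_size = file_size // n_pieces
    offsets = []
    for i in range(n_pieces):
        start = i * piece_size
        end = file_size if i == n_pieces - 1 else (i + 1) * piece_size
        offsets.append((start, end))
    return offsets

def seek_open(file_name, start, end):
    # Yield the lines whose first byte falls in [start, end). (Assumed helper.)
    f = open(file_name, 'rb')
    try:
        if start > 0:
            # Skip forward to the first line boundary at or after start; the
            # line straddling the boundary belongs to the previous piece.
            f.seek(start - 1)
            f.readline()
        while f.tell() < end:
            line = f.readline()
            if not line:
                break
            yield line
    finally:
        f.close()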