class MRJob(object):
    """Base class describing a map-reduce job.

    Subclasses override :meth:`steps` and :meth:`settings`. Both are called
    once while the instance is constructed and their results are cached on
    the instance as ``_steps`` and ``_settings``.
    """

    # Protocol classes, named for the stage they apply to (input, data passed
    # between steps, output). They are only declared here; whatever consumes
    # them lives outside this class.
    INPUT_PROTOCOL = protocol.JSONValueProtocol
    INTERNAL_PROTOCOL = protocol.JSONProtocol
    OUTPUT_PROTOCOL = protocol.JSONValueProtocol

    def __init__(self, counters=None):
        """Cache the job's settings and steps and set up its counters.

        Args:
            counters: optional counter object to accumulate into. It must
                provide the ``incr(group, counter, amount)`` method used by
                :meth:`increment_counter`. When omitted (or ``None``) a
                fresh ``MRCounter`` is created.
        """
        self._settings = self.settings()
        self._steps = self.steps()
        # The argument used to be accepted but silently discarded; honour it
        # so a caller-supplied counter object is actually the one updated.
        self._counters = MRCounter() if counters is None else counters

    @classmethod
    def run(cls):
        """Run this job by handing the job class to ``mapreduce``."""
        mapreduce(cls)

    # NOTE(review): the class declares no ABCMeta metaclass, so the
    # @abstractmethod markers below are not enforced at instantiation time;
    # a subclass that forgets to override simply gets None back.
    @abstractmethod
    def steps(self):
        """define steps necessary to run the job"""

    @abstractmethod
    def settings(self):
        """define settings"""

    def increment_counter(self, group, counter, amount=1):
        """Add ``amount`` (default 1) to ``counter`` within ``group``."""
        self._counters.incr(group, counter, amount)

    def get_step(self, step_idx):
        """Return the step at index ``step_idx``.

        This calls :meth:`steps` afresh on every invocation rather than
        reading the list cached by ``__init__``.
        """
        return self.steps()[step_idx]
def combine_counters(work_dir, n_map_shards, n_reduce_shards):
    """Sum the per-shard counter files found in ``work_dir``.

    Candidate files are ``map-<i>.counters`` and ``combine-<i>.counters``
    for ``i`` in ``range(n_map_shards)``, followed by
    ``reduce-<i>.counters`` for ``i`` in ``range(n_reduce_shards)``.
    Candidates that do not exist on disk are skipped.

    Args:
        work_dir: directory containing the ``*.counters`` files.
        n_map_shards: number of map shards (also used for combine files).
        n_reduce_shards: number of reduce shards.

    Returns:
        Whatever ``MRCounter.sum`` produces for the deserialized counters.
    """
    # (file-name prefix, shard count) in the order the files are combined.
    stages = (
        ('map', n_map_shards),
        ('combine', n_map_shards),
        ('reduce', n_reduce_shards),
    )
    # A comprehension replaces the tuple-parameter lambdas, which are a
    # syntax error on Python 3, and the list += map(...) accumulation.
    filenames = [
        os.path.join(work_dir, '%s-%d.counters' % (prefix, shard))
        for prefix, n_shards in stages
        for shard in range(n_shards)
    ]
    # Materialise the existence check up front, as the original filter() did.
    existing = [name for name in filenames if os.path.exists(name)]
    # Deserialize lazily so MRCounter.sum consumes one counter at a time.
    return MRCounter.sum(
        MRCounter.deserialize(item) for item in read_files(existing))
def __init__(self, counters=None):
    """Cache the job's settings and steps and set up its counters.

    Args:
        counters: optional counter object to accumulate into. When omitted
            (or ``None``) a fresh ``MRCounter`` is created.
    """
    self._settings = self.settings()
    self._steps = self.steps()
    # The argument used to be accepted but silently discarded; honour it so
    # a caller-supplied counter object is the one stored on the instance.
    self._counters = MRCounter() if counters is None else counters