def test_map_combiner_reduce(self):
    """Run map + combiner, sort-and-shuffle, then reduce, and check the output."""
    task_factory = TFactory(combiner=TReducer)
    shuffle = SortAndShuffle()
    run_task(task_factory, istream=self.stream, ostream=shuffle)
    with self._mkf('foo_map_combiner_reduce.out') as out:
        run_task(task_factory, istream=shuffle, ostream=out,
                 private_encoding=False)
    self.check_result('foo_map_combiner_reduce.out', STREAM_1)
def test_timer(self):
    """Check the counter commands emitted by a run with SleepingMapper."""
    task_factory = TFactory(mapper=SleepingMapper)
    n_map_items = Counter([_[0] for _ in STREAM_1])[TextWriter.MAP_ITEM]
    expected = {
        'registerCounter': 1,
        'incrementCounter': n_map_items,
    }
    with self._mkf('foo_map_only.out') as out:
        run_task(task_factory, istream=self.stream1, ostream=out)
    self.check_counts(out.name, expected)
def test_map_only(self):
    """Map-only run: verify the done/progress/output command counts."""
    task_factory = TFactory()
    out_path = self._mkfn('foo_map_only.out')
    with open(out_path, 'w') as out:
        run_task(task_factory, istream=self.stream1, ostream=out)
    # One output command is expected per whitespace-separated token
    # in each MAP_ITEM record of the input stream.
    n_output = sum(
        len(rec[2].split())
        for rec in STREAM_1 if rec[0] is TextWriter.MAP_ITEM
    )
    expected = {'done': 1, 'progress': 1, 'output': n_output}
    self.check_counts(out_path, expected)
def test_timer(self):
    """Check counter commands emitted by a run with SleepingMapper.

    Tallies each downstream command found in the output file and compares
    the totals with the expected registerCounter/incrementCounter counts.
    """
    factory = TFactory(mapper=SleepingMapper)
    with self._mkf('foo_map_only.out') as o:
        run_task(factory, istream=self.stream1, ostream=o)
    count = Counter()
    with open(o.name) as f:
        for line in f:
            # The command name is the first tab-separated field.
            count[line.strip().split('\t', 1)[0]] += 1
    exp_count = {
        'registerCounter': 2,
        'incrementCounter': 2 * Counter([_[0] for _ in STREAM_1])['mapItem'],
    }
    # dict.iteritems() was removed in Python 3; items() works on both.
    for k, v in exp_count.items():
        self.assertIn(k, count)
        self.assertEqual(count[k], v)
def __run_test(self, mode, mapper_class, context_class):
    """Run a pipes task in *mode* and compare its output records with the input."""
    cmd_file = self.__write_cmd_file(mode)
    pp.run_task(
        pp.Factory(mapper_class=mapper_class),
        private_encoding=False,
        context_class=context_class,
        cmd_file=cmd_file,
    )
    collected = []
    with open(cmd_file + '.out', 'rb') as fin:
        stream = BinaryDownStreamAdapter(fin)
        for cmd, args in stream:
            if cmd == stream.OUTPUT:
                name, color = args
                collected.append({'name': name, 'favorite_color': color})
    self.assertEqual(len(collected), len(self.records))
    for got, expected in zip(collected, self.records):
        # Output values are bytes; decode before comparing with the input.
        for key, value in iteritems(got):
            self.assertEqual(value.decode('UTF-8'), expected[key])
def __run_test(self, mode, mapper_class, context_class):
    """Run a pipes task in *mode* and compare its output records with the input.

    Decodes the binary downstream output and checks that every emitted
    (name, favorite_color) record matches the corresponding input record.
    """
    cmd_file = self.__write_cmd_file(mode)
    pp.run_task(
        pp.Factory(mapper_class=mapper_class),
        private_encoding=False,
        context_class=context_class,
        cmd_file=cmd_file
    )
    out_fn = cmd_file + '.out'
    out_records = []
    with open(out_fn) as ostream:
        for cmd, args in BinaryDownStreamFilter(ostream):
            if cmd == 'output':
                name, color = args
                out_records.append({'name': name, 'favorite_color': color})
    self.assertEqual(len(out_records), len(self.records))
    for out_r, r in zip(out_records, self.records):
        # dict.iteritems() was removed in Python 3; items() works on both.
        for k, v in out_r.items():
            self.assertEqual(v, r[k])
def _test_map_reduce_with_private_encoding_helper(self, factory,
                                                  fast_combiner=False):
    """Run a map phase, replay its grouped output as reduce input, check results."""
    self.stream3.close()
    cmd_file = self.stream3.name
    map_out_fn = cmd_file + '.out'
    reduce_in_fn = cmd_file + '.reduce'
    reduce_out_fn = reduce_in_fn + '.out'
    run_task(factory, cmd_file=cmd_file, private_encoding=True,
             fast_combiner=fast_combiner)
    bw = BinaryWriter
    # Group the map output values by key.
    grouped = {}
    with open(map_out_fn, 'rb') as fin:
        for cmd, args in BinaryDownStreamAdapter(fin):
            if cmd == bw.OUTPUT:
                grouped.setdefault(args[0], []).append(args[1])
    # Build a synthetic reduce-phase input stream from the grouped data.
    commands = [
        (bw.START_MESSAGE, 0),
        (bw.SET_JOB_CONF, 'key1', 'value1', 'key2', 'value2'),
        (bw.RUN_REDUCE, 0, 0),
    ]
    for key, values in grouped.items():
        commands.append((bw.REDUCE_KEY, key))
        for value in values:
            commands.append((bw.REDUCE_VALUE, value))
    commands.append((bw.CLOSE,))
    binary_stream_writer(reduce_in_fn, commands)
    run_task(factory, cmd_file=reduce_in_fn, private_encoding=True)
    # Decode the reduce output into the text form check_result expects.
    with open(reduce_out_fn, 'rb') as fin:
        with self._mkf('foo.out', mode='w') as out:
            for cmd, args in BinaryUpStreamDecoder(fin):
                if cmd == bw.PROGRESS:
                    out.write('progress\t%s\n' % args[0])
                elif cmd == bw.OUTPUT:
                    out.write('output\t%s\n' % '\t'.join(
                        [x.decode('utf-8') for x in args]
                    ))
                elif cmd == bw.DONE:
                    out.write('done\n')
    self.check_result('foo.out', STREAM_2)
def _test_map_reduce_with_private_encoding_helper(self, factory,
                                                  fast_combiner=False):
    """Run a map phase, replay its grouped output as reduce input, check results."""
    self.stream3.close()
    cmd_file = self.stream3.name
    map_out_fn = cmd_file + '.out'
    reduce_in_fn = cmd_file + '.reduce'
    reduce_out_fn = reduce_in_fn + '.out'
    run_task(factory, cmd_file=cmd_file, private_encoding=True,
             fast_combiner=fast_combiner)
    # Group the map output values by key.
    grouped = {}
    with open(map_out_fn) as fin:
        for cmd, args in BinaryDownStreamFilter(fin):
            if cmd == 'output':
                grouped.setdefault(args[0], []).append(args[1])
    # Build a synthetic reduce-phase input stream from the grouped data.
    commands = [
        ('start', 0),
        ('setJobConf', ('key1', 'value1', 'key2', 'value2')),
        ('runReduce', 0, False),
    ]
    for key, values in grouped.items():
        commands.append(('reduceKey', key))
        for value in values:
            commands.append(('reduceValue', value))
    commands.append(('close',))
    binary_stream_writer(reduce_in_fn, commands)
    run_task(factory, cmd_file=reduce_in_fn, private_encoding=True)
    # Decode the reduce output into the text form check_result expects.
    with open(reduce_out_fn) as fin, self._mkf('foo.out', mode='w') as out:
        for cmd, args in BinaryUpStreamDecoder(fin):
            if cmd == 'progress':
                out.write('progress\t%s\n' % args[0])
            elif cmd == 'output':
                out.write('output\t%s\n' % '\t'.join(args))
            elif cmd == 'done':
                out.write('done\n')
    self.check_result('foo.out', STREAM_3)
def __main__():
    """Pydoop entry point: run the Mapper/Reducer pipes task."""
    pp.run_task(pp.Factory(Mapper, Reducer))
def __main__():
    """Pydoop entry point: run the task built by the module-level factory."""
    # NOTE(review): `factory` is expected to be defined at module level.
    pp.run_task(factory)
def main():
    """Run a map/reduce task in which the reducer doubles as the combiner."""
    task_factory = Factory(Mapper, Reducer, combiner_class=Reducer)
    return run_task(task_factory)
import struct  # required by FilterMapper.map; was missing in the original

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class FilterMapper(Mapper):
    """
    Process a wordcount output stream, emitting only records relative to
    words whose count is equal to or above the configured threshold.
    """

    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        jc = context.job_conf
        self.threshold = jc.get_int("filter.occurrence.threshold")

    def map(self, context):
        word, occurrence = context.key, context.value
        # Counts arrive serialized as big-endian 32-bit integers.
        occurrence = struct.unpack(">i", occurrence)[0]
        if occurrence >= self.threshold:
            context.emit(word, str(occurrence))


class FilterReducer(Reducer):
    """No-op reducer: filtering happens entirely on the map side."""

    def reduce(self, context):
        pass


if __name__ == "__main__":
    run_task(Factory(FilterMapper, FilterReducer))
def __main__():
    """Entry point: map/reduce task where the reducer also acts as combiner."""
    task_factory = pp.Factory(
        mapper_class=Mapper,
        reducer_class=Reducer,
        combiner_class=Reducer,
    )
    pp.run_task(task_factory)
def main():
    """Run the pipes task configured by the module-level FACTORY."""
    pp.run_task(FACTORY)
pass  # NOTE(review): tail of a definition truncated above this chunk


def _load_avro_schema(path="stats.avsc"):
    """Parse the Avro schema at *path*, closing the file handle promptly."""
    # The original used open(...).read() inline, leaking the file handle.
    with open(path) as f:
        return avro.schema.parse(f.read())


class ColorWriter(AvroWriter):
    """Record writer that appends {office, counts} records via the Avro writer."""
    schema = _load_avro_schema()

    def emit(self, key, value):
        self.writer.append({'office': key, 'counts': value})


class ColorPick(api.Mapper):
    """Emit (office, Counter({color: 1})) for users with a favorite color."""

    def map(self, ctx):
        user = ctx.value
        color = user['favorite_color']
        if color is not None:
            ctx.emit(user['office'], Counter({color: 1}))


class ColorCount(api.Reducer):
    """Sum the per-office color counters."""

    def reduce(self, ctx):
        s = sum(ctx.values, Counter())
        ctx.emit(ctx.key, s)


pp.run_task(pp.Factory(mapper_class=ColorPick, reducer_class=ColorCount,
                       record_reader_class=UserReader,
                       record_writer_class=ColorWriter),
            private_encoding=True)
def __main__():
    """Entry point: run a map-only pipes task."""
    task_factory = pipes.Factory(mapper_class=Mapper)
    pipes.run_task(task_factory)
def __main__():
    """Entry point: run map/reduce with a custom context class."""
    pp.run_task(pp.Factory(Mapper, Reducer), context_class=Context)
def __main__():
    """Entry point: run map/reduce with private encoding enabled."""
    pp.run_task(pp.Factory(Mapper, Reducer), private_encoding=True)
def main():
    """Run the pipes task configured by the module-level FACTORY."""
    pipes.run_task(FACTORY)
def __main__():
    """Entry point: run the module-level factory with auto-serialization off."""
    # NOTE(review): `factory` is expected to be defined at module level.
    pp.run_task(factory, auto_serialize=False)
import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer

"""
Count followers of each node

Input : directed graph
e.g.) "3 4" indicates that person 3 has 4 followers.

Output : (destination, follower count)
e.g.) "4 2" node 4 has 2 followers.
"""


class DstCountMapper(Mapper):
    """Emit (destination, 1) for every input edge line."""

    def map(self, context):
        fields = context.value.split()
        context.emit(fields[1], 1)


class DstCountReducer(Reducer):
    """Sum follower counts per destination node."""

    def reduce(self, context):
        total = sum(context.values)
        # Keys go out UTF-8 encoded; counts as big-endian 32-bit integers.
        context.emit(context.key.encode("utf-8"), struct.pack(">i", total))


if __name__ == "__main__":
    run_task(Factory(DstCountMapper, DstCountReducer), auto_serialize=False)
def __main__():
    """Entry point: run a map-only task using AvroContext."""
    task_factory = pp.Factory(mapper_class=Mapper)
    pp.run_task(task_factory, context_class=AvroContext)
def __main__():
    """Main function to be executed by pydoop framework."""
    task_factory = pp.Factory(
        mapper_class=Mapper,
        reducer_class=Reducer,
        record_reader_class=Reader,
    )
    pp.run_task(task_factory, private_encoding=True)
def __main__():
    """Entry point: run the Mapper/Reducer pipes task."""
    task_factory = pp.Factory(Mapper, Reducer)
    pp.run_task(task_factory)
import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class FilterMapper(Mapper):
    """Emit only words whose occurrence count meets the configured threshold.

    Processes a wordcount output stream; the threshold is read from the
    "filter.occurrence.threshold" job configuration key.
    """

    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        conf = context.job_conf
        self.threshold = conf.get_int("filter.occurrence.threshold")

    def map(self, context):
        word, occurrence = context.key, context.value
        # Counts arrive serialized as big-endian 32-bit integers.
        occurrence = struct.unpack(">i", occurrence)[0]
        if occurrence >= self.threshold:
            context.emit(word, str(occurrence))


class FilterReducer(Reducer):
    """No-op reducer: filtering happens entirely on the map side."""

    def reduce(self, context):
        pass


if __name__ == "__main__":
    run_task(Factory(FilterMapper, FilterReducer))
def test_map_only(self):
    """Smoke test: run a map-only task, writing its output to a temp file."""
    task_factory = TFactory()
    with self._mkf('foo_map_only.out') as out:
        run_task(task_factory, istream=self.stream1, ostream=out)
def main():
    """Run the task configured by the module-level FACTORY."""
    run_task(FACTORY)
def _read_schema(path="stats.avsc"):
    """Parse the Avro schema at *path*, closing the file handle promptly."""
    # The original used open(...).read() inline, leaking the file handle.
    with open(path) as f:
        return avro.schema.parse(f.read())


class ColorWriter(AvroWriter):
    """Record writer that appends {office, counts} records via the Avro writer."""
    schema = _read_schema()

    def emit(self, key, value):
        self.writer.append({'office': key, 'counts': value})


class ColorPick(api.Mapper):
    """Emit (office, Counter({color: 1})) for users with a favorite color."""

    def map(self, ctx):
        user = ctx.value
        color = user['favorite_color']
        if color is not None:
            ctx.emit(user['office'], Counter({color: 1}))


class ColorCount(api.Reducer):
    """Sum the per-office color counters."""

    def reduce(self, ctx):
        s = sum(ctx.values, Counter())
        ctx.emit(ctx.key, s)


pp.run_task(pp.Factory(
    mapper_class=ColorPick,
    reducer_class=ColorCount,
    record_reader_class=UserReader,
    record_writer_class=ColorWriter
), private_encoding=True)
def __main__():
    """Entry point: Avro map/reduce task with private encoding."""
    pp.run_task(
        pp.Factory(mapper_class=Mapper, reducer_class=Reducer),
        private_encoding=True,
        context_class=AvroContext,
    )
def __main__():
    """Entry point: map-only task with a custom record writer."""
    task_factory = pipes.Factory(
        mapper_class=Mapper,
        record_writer_class=Writer,
    )
    pipes.run_task(task_factory)
def __main__():
    """Entry point: map-only task using AvroContext."""
    pp.run_task(pp.Factory(mapper_class=Mapper), context_class=AvroContext)
#!/usr/bin/env python

"""
Filter top-50 follower countings
"""

import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class FilterMapper(Mapper):
    """Identity mapper (placeholder implementation)."""

    def map(self, context):
        context.emit(context.key, context.value)


class FilterReducer(Reducer):
    """Placeholder reducer: emits a single empty record per key."""

    def reduce(self, context):
        context.emit("", "")


if __name__ == "__main__":
    run_task(Factory(FilterMapper, FilterReducer))
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import re
import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class WordCountMapper(Mapper):
    """Tokenize each input line on non-alphanumerics and emit (word, 1)."""

    def map(self, context):
        for word in re.sub('[^0-9a-zA-Z]+', ' ', context.value).split():
            context.emit(word, 1)


class WordCountReducer(Reducer):
    """Sum the counts for each word, packing the total as a big-endian int32."""

    def reduce(self, context):
        total = sum(context.values)
        context.emit(context.key, struct.pack(">i", total))


if __name__ == "__main__":
    run_task(Factory(WordCountMapper, WordCountReducer))
def __main__():
    """Entry point: module-level factory, no private encoding, no auto-serialize."""
    # NOTE(review): `factory` is expected to be defined at module level.
    pp.run_task(factory, private_encoding=False, auto_serialize=False)
super(AvroContext, self).set_job_conf(vals) schema = avro.schema.parse(self._job_conf[AVRO_SCHEMA_KEY]) self.datum_reader = DatumReader(schema) def get_input_value(self): # FIXME reuse, reuse, reuse sys.stderr.write('value: %r\n' % self._value) f = StringIO(self._value) dec = BinaryDecoder(f) return self.datum_reader.read(dec) class ColorPick(api.Mapper): def map(self, ctx): user = ctx.value color = user['favorite_color'] sys.stderr.write('user: %r' % user) if color is not None: ctx.emit(user['office'], Counter({color: 1})) class ColorCount(api.Reducer): def reduce(self, ctx): s = sum(ctx.values, Counter()) ctx.emit(ctx.key, "%r" % s) pp.run_task(pp.Factory(mapper_class=ColorPick, reducer_class=ColorCount), private_encoding=True, context_class=AvroContext)  # NOTE(review): this chunk begins mid-method (the enclosing set_job_conf def is not visible), so its code is left byte-identical rather than restructured.
def __main__():
    """Entry point: map-only task (no reducer supplied)."""
    task_factory = pp.Factory(Mapper, None)
    pp.run_task(task_factory)
def run_task(mapper_class, reducer_class=NoAvroColorCount):
    """Run an Avro pipes task with private encoding for the given classes."""
    task_factory = pp.Factory(
        mapper_class=mapper_class, reducer_class=reducer_class
    )
    pp.run_task(task_factory, private_encoding=True,
                context_class=AvroContext)
def __main__():
    """Entry point: run FACTORY with private encoding and the configured CONTEXT."""
    pp.run_task(FACTORY, private_encoding=True, context_class=CONTEXT)