Example #1
 def test_map_combiner_reduce(self):
     factory = TFactory(combiner=TReducer)
     sas = SortAndShuffle()
     run_task(factory, istream=self.stream, ostream=sas)
     with self._mkf('foo_map_combiner_reduce.out') as o:
         run_task(factory, istream=sas, ostream=o,
                  private_encoding=False)
     self.check_result('foo_map_combiner_reduce.out', STREAM_1)
Example #2
 def test_timer(self):
     factory = TFactory(mapper=SleepingMapper)
     exp_count = {
         'registerCounter': 1,
         'incrementCounter': Counter(
             [_[0] for _ in STREAM_1]
         )[TextWriter.MAP_ITEM]
     }
     with self._mkf('foo_map_only.out') as o:
         run_task(factory, istream=self.stream1, ostream=o)
         self.check_counts(o.name, exp_count)
Example #3
 def test_map_only(self):
     factory = TFactory()
     fname = self._mkfn('foo_map_only.out')
     with open(fname, 'w') as o:
         run_task(factory, istream=self.stream1, ostream=o)
     exp_count = {
         'done': 1,
         'progress': 1,
         'output': sum(len(_[2].split())
                       for _ in STREAM_1 if _[0] is TextWriter.MAP_ITEM)
     }
     self.check_counts(fname, exp_count)
Example #4
 def test_timer(self):
     factory = TFactory(mapper=SleepingMapper)
     with self._mkf('foo_map_only.out') as o:
         run_task(factory, istream=self.stream1, ostream=o)
     count = Counter()
     with open(o.name) as f:
         for line in f:
             count[line.strip().split('\t', 1)[0]] += 1
     exp_count = {
         'registerCounter': 2,
         'incrementCounter': 2 * Counter([_[0] for _ in STREAM_1])['mapItem']
     }
     for k, v in exp_count.items():
         self.assertIn(k, count)
         self.assertEqual(count[k], v)
Example #5
 def __run_test(self, mode, mapper_class, context_class):
     cmd_file = self.__write_cmd_file(mode)
     pp.run_task(
         pp.Factory(mapper_class=mapper_class), private_encoding=False,
         context_class=context_class, cmd_file=cmd_file)
     out_fn = cmd_file + '.out'
     out_records = []
     with open(out_fn, 'rb') as f:
         bf = BinaryDownStreamAdapter(f)
         for cmd, args in bf:
             if cmd == bf.OUTPUT:
                 name, color = args
                 out_records.append({'name': name, 'favorite_color': color})
     self.assertEqual(len(out_records), len(self.records))
     for out_r, r in zip(out_records, self.records):
         for k, v in iteritems(out_r):
             self.assertEqual(v.decode('UTF-8'), r[k])
Example #6
 def __run_test(self, mode, mapper_class, context_class):
     cmd_file = self.__write_cmd_file(mode)
     pp.run_task(
         pp.Factory(mapper_class=mapper_class), private_encoding=False,
         context_class=context_class, cmd_file=cmd_file
     )
     out_fn = cmd_file + '.out'
     out_records = []
     with open(out_fn) as ostream:
         for cmd, args in BinaryDownStreamFilter(ostream):
             if cmd == 'output':
                 name, color = args
                 out_records.append({'name': name, 'favorite_color': color})
     self.assertEqual(len(out_records), len(self.records))
     for out_r, r in zip(out_records, self.records):
         for k, v in out_r.items():
             self.assertEqual(v, r[k])
Example #7
 def _test_map_reduce_with_private_encoding_helper(self, factory,
                                                   fast_combiner=False):
     self.stream3.close()
     cmd_file = self.stream3.name
     out_file = cmd_file + '.out'
     reduce_infile = cmd_file + '.reduce'
     reduce_outfile = reduce_infile + '.out'
     run_task(factory, cmd_file=cmd_file, private_encoding=True,
              fast_combiner=fast_combiner)
     data = {}
     bw = BinaryWriter
     with open(out_file, 'rb') as f:
         bf = BinaryDownStreamAdapter(f)
         for cmd, args in bf:
             if cmd == bw.OUTPUT:
                 data.setdefault(args[0], []).append(args[1])
     stream = []
     stream.append((bw.START_MESSAGE, 0))
     stream.append((bw.SET_JOB_CONF, 'key1', 'value1', 'key2', 'value2'))
     stream.append((bw.RUN_REDUCE, 0, 0))
     for k in data:
         stream.append((bw.REDUCE_KEY, k))
         for v in data[k]:
             stream.append((bw.REDUCE_VALUE, v))
     stream.append((bw.CLOSE,))
     binary_stream_writer(reduce_infile, stream)
     run_task(factory, cmd_file=reduce_infile, private_encoding=True)
     with open(reduce_outfile, 'rb') as f:
         with self._mkf('foo.out', mode='w') as o:
             bf = BinaryUpStreamDecoder(f)
             for cmd, args in bf:
                 if cmd == bw.PROGRESS:
                     o.write('progress\t%s\n' % args[0])
                 elif cmd == bw.OUTPUT:
                     o.write('output\t%s\n' %
                             '\t'.join([x.decode('utf-8') for x in args]))
                 elif cmd == bw.DONE:
                     o.write('done\n')
     self.check_result('foo.out', STREAM_2)
Example #8
 def _test_map_reduce_with_private_encoding_helper(self, factory,
                                                   fast_combiner=False):
     self.stream3.close()
     cmd_file = self.stream3.name
     out_file = cmd_file + '.out'
     reduce_infile = cmd_file + '.reduce'
     reduce_outfile = reduce_infile + '.out'
     run_task(factory, cmd_file=cmd_file, private_encoding=True,
              fast_combiner=fast_combiner)
     data = {}
     with open(out_file) as f:
         bf = BinaryDownStreamFilter(f)
         for cmd, args in bf:
             if cmd == 'output':
                 data.setdefault(args[0], []).append(args[1])
     stream = []
     stream.append(('start', 0))
     stream.append(('setJobConf', ('key1', 'value1', 'key2', 'value2')))
     stream.append(('runReduce', 0, False))
     for k in data:
         stream.append(('reduceKey', k))
         for v in data[k]:
             stream.append(('reduceValue', v))
     stream.append(('close',))
     binary_stream_writer(reduce_infile, stream)
     run_task(factory, cmd_file=reduce_infile, private_encoding=True)
     with open(reduce_outfile) as f, self._mkf('foo.out', mode='w') as o:
         bf = BinaryUpStreamDecoder(f)
         for cmd, args in bf:
             if cmd == 'progress':
                 o.write('progress\t%s\n' % args[0])
             elif cmd == 'output':
                 o.write('output\t%s\n' % '\t'.join(args))
             elif cmd == 'done':
                 o.write('done\n')
     self.check_result('foo.out', STREAM_3)
Example #9
def __main__():
    factory = pp.Factory(Mapper, Reducer)
    pp.run_task(factory)
Example #10
def __main__():
    pp.run_task(factory)
Example #11
def main():
    return run_task(Factory(Mapper, Reducer, combiner_class=Reducer))
Example #12
import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class FilterMapper(Mapper):
    """
    Process a wordcount output stream, emitting only records relative to
    words whose count is equal to or above the configured threshold.
    """
    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        jc = context.job_conf
        self.threshold = jc.get_int("filter.occurrence.threshold")

    def map(self, context):
        word, occurrence = context.key, context.value
        occurrence = struct.unpack(">i", occurrence)[0]
        if occurrence >= self.threshold:
            context.emit(word, str(occurrence))


class FilterReducer(Reducer):

    def reduce(self, context):
        pass


if __name__ == "__main__":
    run_task(Factory(FilterMapper, FilterReducer))
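
A note on the threshold: FilterMapper reads it from the job configuration at construction time. The harness below is a minimal, made-up sketch (FakeJobConf and FakeContext are illustrative stand-ins, not part of the Pydoop API) showing how the mapper behaves once that property is set:

import struct

class FakeJobConf(dict):
    # illustrative stand-in for the job_conf object
    def get_int(self, key):
        return int(self[key])

class FakeContext(object):
    # illustrative stand-in for the Pydoop task context
    def __init__(self, conf):
        self.job_conf = FakeJobConf(conf)
        self.key = self.value = None
        self.emitted = []

    def emit(self, k, v):
        self.emitted.append((k, v))

ctx = FakeContext({"filter.occurrence.threshold": "3"})
mapper = FilterMapper(ctx)
for word, count in [("hello", 1), ("world", 5)]:
    ctx.key, ctx.value = word, struct.pack(">i", count)
    mapper.map(ctx)
print(ctx.emitted)  # [('world', '5')]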
Example #13
def __main__():
    pp.run_task(pp.Factory(
        mapper_class=Mapper,
        reducer_class=Reducer,
        combiner_class=Reducer
    ))
Example #14
def __main__():
    pp.run_task(factory)
Example #15
def main():
    pp.run_task(FACTORY)
Example #16
class ColorWriter(AvroWriter):

    schema = avro.schema.parse(open("stats.avsc").read())

    def emit(self, key, value):
        self.writer.append({'office': key, 'counts': value})


class ColorPick(api.Mapper):
    def map(self, ctx):
        user = ctx.value
        color = user['favorite_color']
        if color is not None:
            ctx.emit(user['office'], Counter({color: 1}))


class ColorCount(api.Reducer):
    def reduce(self, ctx):
        s = sum(ctx.values, Counter())
        ctx.emit(ctx.key, s)


pp.run_task(pp.Factory(mapper_class=ColorPick,
                       reducer_class=ColorCount,
                       record_reader_class=UserReader,
                       record_writer_class=ColorWriter),
            private_encoding=True)
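
ColorWriter loads its output schema from stats.avsc, which is not shown in the listing. A plausible sketch of that file, inferred only from the record fields emitted above (an 'office' string and a color-to-count mapping), might be:

import json
import avro.schema

# hypothetical schema, reconstructed from what ColorWriter.emit() writes;
# the real stats.avsc used by the example may differ
STATS_SCHEMA = json.dumps({
    "type": "record",
    "name": "Stats",
    "fields": [
        {"name": "office", "type": "string"},
        {"name": "counts", "type": {"type": "map", "values": "long"}},
    ],
})
schema = avro.schema.parse(STATS_SCHEMA)  # same call the snippet makes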
Example #17
def __main__():
    pipes.run_task(pipes.Factory(mapper_class=Mapper))
Example #18
def __main__():
    factory = pp.Factory(Mapper, Reducer)
    pp.run_task(factory, context_class=Context)
Example #19
def __main__():
    factory = pp.Factory(Mapper, Reducer)
    pp.run_task(factory, private_encoding=True)
Example #20
def main():
    pipes.run_task(FACTORY)
Example #21
def __main__():
    pp.run_task(factory, auto_serialize=False)
Example #22
"""
Count the followers of each node.
Input: a directed graph, one edge per line
    e.g. "3   4" means that person 3 follows person 4.
Output: (destination, follower count)
    e.g. "4 2" means that node 4 has 2 followers.
"""

import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class DstCountMapper(Mapper):
    def map(self, context):
        # emit the followed node (the edge destination) with a count of 1
        line = context.value.split()
        context.emit(line[1], 1)


class DstCountReducer(Reducer):
    def reduce(self, context):
        # sum the 1s emitted for each destination to get its follower count
        s = sum(context.values)
        context.emit(context.key.encode("utf-8"), struct.pack(">i", s))


if __name__ == "__main__":
    factory = Factory(DstCountMapper, DstCountReducer)
    run_task(factory, auto_serialize=False)
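
To see what the two phases compute together, here is a minimal stand-alone sketch of the same logic on a made-up edge list, run outside Hadoop and Pydoop:

edges = ["1\t4", "3\t4", "2\t5"]  # one edge per line: follower <tab> followed

counts = {}
for line in edges:
    src, dst = line.split()               # what DstCountMapper.map() splits
    counts[dst] = counts.get(dst, 0) + 1  # what DstCountReducer.reduce() sums

print(sorted(counts.items()))  # [('4', 2), ('5', 1)]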
Example #23
def __main__():
    pp.run_task(pp.Factory(mapper_class=Mapper), context_class=AvroContext)
Example #24
def __main__():
    """Main function to be executed by pydoop framework"""
    factory = pp.Factory(mapper_class=Mapper,
                         reducer_class=Reducer,
                         record_reader_class=Reader)
    pp.run_task(factory, private_encoding=True)
Example #25
def __main__():
    pp.run_task(pp.Factory(Mapper, Reducer))
Example #26
import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class FilterMapper(Mapper):
    """
    Process a wordcount output stream, emitting only records relative to
    words whose count is equal to or above the configured threshold.
    """
    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        jc = context.job_conf
        self.threshold = jc.get_int("filter.occurrence.threshold")

    def map(self, context):
        word, occurrence = context.key, context.value
        occurrence = struct.unpack(">i", occurrence)[0]
        if occurrence >= self.threshold:
            context.emit(word, str(occurrence))


class FilterReducer(Reducer):
    def reduce(self, context):
        pass


if __name__ == "__main__":
    run_task(Factory(FilterMapper, FilterReducer))
Example #27
 def test_map_only(self):
     factory = TFactory()
     with self._mkf('foo_map_only.out') as o:
         run_task(factory, istream=self.stream1, ostream=o)
Example #28
def main():
    run_task(FACTORY)
Example #29
def __main__():
    """Main function to be executed by pydoop framework"""
    factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer, record_reader_class=Reader)
    pp.run_task(factory, private_encoding=True)
Example #30
def main():
    pipes.run_task(FACTORY)
Example #31
class ColorWriter(AvroWriter):

    schema = avro.schema.parse(open("stats.avsc").read())

    def emit(self, key, value):
        self.writer.append({'office': key, 'counts': value})


class ColorPick(api.Mapper):

    def map(self, ctx):
        user = ctx.value
        color = user['favorite_color']
        if color is not None:
            ctx.emit(user['office'], Counter({color: 1}))


class ColorCount(api.Reducer):

    def reduce(self, ctx):
        s = sum(ctx.values, Counter())
        ctx.emit(ctx.key, s)


pp.run_task(pp.Factory(
    mapper_class=ColorPick,
    reducer_class=ColorCount,
    record_reader_class=UserReader,
    record_writer_class=ColorWriter
), private_encoding=True)
Example #32
def __main__():
    factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer)
    pp.run_task(factory, private_encoding=True, context_class=AvroContext)
Example #33
def __main__():
    pipes.run_task(pipes.Factory(
        mapper_class=Mapper,
        record_writer_class=Writer,
    ))
Example #34
def __main__():
    factory = pp.Factory(mapper_class=Mapper)
    pp.run_task(factory, context_class=AvroContext)
Example #35
#!/usr/bin/env python

"""
Filter the top 50 follower counts.
"""

import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer

class FilterMapper(Mapper):

    def map(self, context):
        # pass each (key, value) record through unchanged
        context.emit(context.key, context.value)


class FilterReducer(Reducer):

    def reduce(self, context):
        # placeholder: emit a single empty record per key
        context.emit("", "")


if __name__ == "__main__":
    factory = Factory(FilterMapper, FilterReducer)
    run_task(factory)
Example #36
def main():
    pp.run_task(FACTORY)
Example #37
import struct
import re

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class WordCountMapper(Mapper):

    def map(self, context):
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.value).split()
        for w in words:
            context.emit(w, 1)


class WordCountReducer(Reducer):

    def reduce(self, context):
        s = sum(context.values)
        context.emit(context.key, struct.pack(">i", s))


if __name__ == "__main__":
    run_task(Factory(WordCountMapper, WordCountReducer))
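
The reducer encodes each count as a big-endian 32-bit integer, which is exactly what FilterMapper in examples 12 and 26 reverses with struct.unpack. A quick round trip of that encoding:

import struct

packed = struct.pack(">i", 42)               # what WordCountReducer emits
assert struct.unpack(">i", packed)[0] == 42  # what FilterMapper decodes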
Example #38
def __main__():
    pp.run_task(factory, private_encoding=False, auto_serialize=False)
Example #39
        super(AvroContext, self).set_job_conf(vals)
        schema = avro.schema.parse(self._job_conf[AVRO_SCHEMA_KEY])
        self.datum_reader = DatumReader(schema)

    def get_input_value(self):
        # FIXME reuse, reuse, reuse
        sys.stderr.write('value: %r\n' % self._value)
        f = StringIO(self._value)
        dec = BinaryDecoder(f)
        return self.datum_reader.read(dec)


class ColorPick(api.Mapper):
    def map(self, ctx):
        user = ctx.value
        color = user['favorite_color']
        sys.stderr.write('user: %r' % user)
        if color is not None:
            ctx.emit(user['office'], Counter({color: 1}))


class ColorCount(api.Reducer):
    def reduce(self, ctx):
        s = sum(ctx.values, Counter())
        ctx.emit(ctx.key, "%r" % s)


pp.run_task(pp.Factory(mapper_class=ColorPick, reducer_class=ColorCount),
            private_encoding=True,
            context_class=AvroContext)
Example #40
def __main__():
    pp.run_task(pp.Factory(mapper_class=Mapper), context_class=AvroContext)
Example #41
def main():
    return run_task(Factory(Mapper, Reducer, combiner_class=Reducer))
Example #42
File: nosep.py Project: crs4/pydoop
def __main__():
    pp.run_task(pp.Factory(Mapper, None))
Example #43
def run_task(mapper_class, reducer_class=NoAvroColorCount):
    pp.run_task(
        pp.Factory(mapper_class=mapper_class, reducer_class=reducer_class),
        private_encoding=True, context_class=AvroContext
    )
Example #44
def __main__():
    pp.run_task(FACTORY, private_encoding=True, context_class=CONTEXT)
Example #45
def __main__():
    pp.run_task(pp.Factory(Mapper, None))