Exemplo n.º 1
0
    def run_mr(self,
               prefix,
               input_data,
               input_format=parallel.LineInput(),
               mapper=parallel.IdentityMapper(),
               reducer=parallel.IdentityReducer(),
               output_format=parallel.LevelDBOutput(),
               num_shards=5):
        """Run one map/reduce job under ``prefix`` and load back its output.

        Whatever already exists at ``prefix`` is removed first.
        ``self.make_files`` is called with ``<prefix>/input``, ``input_data``
        and ``input_format``; its return value is used as the job's source.
        The job is told to write to ``<prefix>/output``.

        NOTE: the default ``input_format``, ``mapper``, ``reducer`` and
        ``output_format`` objects are created once, when this method is
        defined, and are shared by every call that relies on them.

        Returns:
            * for a ``LevelDBOutput``: the sorted list of the entries
              obtained by iterating ``parallel.ShardedDB.open(output_prefix)``;
            * for a ``JSONOutput``: the object parsed from the output file;
            * for a ``JSONLineOutput``: a list holding one parsed JSON value
              per line of the output file;
            * ``None`` for any other output format.
        """
        # Start from a clean directory so a previous run cannot leak into
        # this one.
        os.system('rm -rf "%s"' % prefix)
        source = self.make_files(os.path.join(prefix, 'input'), input_data,
                                 input_format)
        output_prefix = os.path.join(prefix, 'output')

        parallel.mapreduce(source,
                           mapper=mapper,
                           reducer=reducer,
                           output_format=output_format,
                           output_prefix=output_prefix,
                           num_shards=num_shards)

        if isinstance(output_format, parallel.LevelDBOutput):
            return sorted(parallel.ShardedDB.open(output_prefix))

        if isinstance(output_format, parallel.JSONOutput):
            # Use a context manager so the file handle is closed
            # deterministically instead of being left to the garbage
            # collector.
            with open(output_prefix) as output_f:
                return json.load(output_f)

        if isinstance(output_format, parallel.JSONLineOutput):
            with open(output_prefix, 'r') as output_f:
                return [json.loads(line) for line in output_f]

        # Unrecognised output formats have nothing to load back.
        return None
Exemplo n.º 2
0
 def run(self):
     """Run an identity map/reduce from the input path to the output path.

     The glob at ``self.input().path`` is opened as JSON-lines input, passed
     through the identity mapper and identity reducer, and written under
     ``self.output().path`` using a single shard.
     """
     source = parallel.Collection.from_glob(self.input().path,
                                            parallel.JSONLineInput())
     parallel.mapreduce(
         source,
         mapper=parallel.IdentityMapper(),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path,
         num_shards=1)
Exemplo n.º 3
0
    def test_identity(self):
        """Identity map/reduce over ten empty files, read as FilenameInput.

        Expects the sorted output to list the ten file names in order, each
        paired with an empty-string value.
        """
        # Remove leftovers from earlier runs (inputs and the output dir).
        os.system('rm -rf /tmp/test-identity*')

        # Create the ten empty input files.
        source_files = []
        for index in range(10):
            path = '/tmp/test-identity-%d' % index
            os.system('touch "%s"' % path)
            source_files.append(path)

        source = parallel.Collection(source_files, parallel.FilenameInput)
        parallel.mapreduce(
            source,
            parallel.IdentityMapper(),
            parallel.IdentityReducer(),
            '/tmp/test-identity',
            2)

        results = list(parallel.ShardedDB.open('/tmp/test-identity/'))
        results.sort()

        # Indexed access on purpose: a short result list must raise rather
        # than silently pass.
        for index, expected_key in enumerate(source_files):
            key, value = results[index]
            assert key == expected_key, results[index]
            assert value == ''
Exemplo n.º 4
0
    def test_sum(self):
        """SumReducer over ten files that each hold the numbers 0..99.

        Every input file contains the integers 0 through 99, one per line.
        After the job, the output is expected to hold an entry keyed by each
        number's string form whose value equals ``str(i * 10.0)``.
        """
        # Remove leftovers from earlier runs (inputs and the output dir).
        os.system('rm -rf /tmp/test-sum*')
        source_files = ['/tmp/test-sum-%d' % i for i in range(10)]
        for filename in source_files:
            with open(filename, 'w') as f:
                # f.write() replaces the Python-2-only ``print >> f, ...``
                # statement, which raises TypeError under Python 3. The
                # trailing newline keeps the file contents identical to what
                # ``print`` produced.
                f.write('\n'.join(str(i) for i in range(100)) + '\n')

        source = parallel.Collection(source_files, parallel.LineInput)
        parallel.mapreduce(source, parallel.IdentityMapper(),
                           parallel.SumReducer(), '/tmp/test-sum', 5)

        results = dict(parallel.ShardedDB.open('/tmp/test-sum/'))
        for i in range(100):
            assert str(i) in results, str(i)
            value = results[str(i)]
            self.assertEqual(value, str(i * 10.0))