Example #1
    def test_seqfile(self):
        """
        Test SequenceFile I/O
        """
        tmp_file = "./.tmp/test_tmp"

        key1 = 123
        value1 = ["A", "B", "C"]

        key2 = 456
        value2 = ["D", "E", "F"]

        input_data = [(key1, value1), (key2, value2)]

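        # Serialize each (key, value-list) record into a (str, str) pair for the SequenceFile.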
        def kv_serializer(record):
            return str(record[0]), ":".join(record[1])

        pcollection_kv = self._pipeline.parallelize(input_data)
        self._pipeline.write(
            pcollection_kv,
            output.SequenceFile(tmp_file).as_type(kv_serializer))

        self._pipeline.run()

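        # Deserialize each stored (str, str) pair back into the original (int, list) record.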
        def kv_deserializer(tp):
            return int(tp[0]), tp[1].split(":")

        result = self._pipeline.read(
            input.SequenceFile(tmp_file).as_type(kv_deserializer))
        result_data = result.get()

        self.assertItemsEqual(input_data, result_data)
Example #2
    def test_partition(self):
        """
        Test partition output
        """

        import os
        try:
            p = self._pipeline.parallelize(["1", "2", "3"])
            self._pipeline.write(p, output.TextFile('./output-1').partition(5))
            self._pipeline.write(
                p,
                output.SequenceFile('./output-2').partition(
                    2, lambda x, n: int(x) % n))
            self._pipeline.run()

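            # The partitioner int(x) % 2 sends "2" to partition 0 and "1", "3" to partition 1.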
            o1 = self._pipeline.read(
                input.SequenceFile('./output-2/part-00000'))
            o1.cache()
            o2 = self._pipeline.read(
                input.SequenceFile('./output-2/part-00001'))
            o2.cache()
            self.assertEqual(["2"], o1.get())
            self.assertItemsEqual(["1", "3"], o2.get())

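            # All 5 TextFile partition files should exist; the [^_]* glob skips underscore-prefixed marker files.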
            n = os.popen('ls output-1/[^_]* | wc -l').read()
            self.assertEqual(5, int(n))
            o = self._pipeline.read(input.TextFile('output-1')).get()
            self.assertItemsEqual(["1", "2", "3"], o)
        finally:
            os.system("rm -r output-1 output-2")
Example #3
    def test_overwrite(self):
        """
        Test overwriting an existing target path
        """
        import os
        import shutil

        p = self._pipeline.parallelize([1])
        # Write to the same path twice; the second run overwrites the first output.
        self._pipeline.write(p, output.SequenceFile('test_output'))
        self._pipeline.run()
        self._pipeline.write(p, output.SequenceFile('test_output'))
        self._pipeline.run()
        p1 = self._pipeline.read(input.SequenceFile('test_output'))
        p.cache()
        p1.cache()
        self.assertEqual(p.get(), p1.get())
        # Cached results stay readable even after the underlying path is removed.
        shutil.rmtree('test_output')
        self.assertEqual(p.get(), p1.map(lambda x: x).get())
        self.assertFalse(os.path.exists('test_output'))
Example #4
    def test_sequence_file(self):
        """
        Case: test sequence file
        """
        data = self._pipeline.parallelize([1, 2, 3, 400, 5])
        local_file = self.generate_tmp_path()
        self._pipeline.write(data, output.SequenceFile(local_file))
        self._pipeline.run()
        result = self._pipeline.read(input.SequenceFile(local_file))
        self.assertItemsEqual([1, 2, 3, 400, 5], result.get())
Example #5
    def test_commit(self):
        """
        Case: test commit
        """
        self.setConfig(immediately_commit=True)

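        # With immediately_commit enabled, outputs that finish are committed even if a later node fails the run.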
        vertex0 = self._pipeline.parallelize([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])

        vertex1 = vertex0.group_by(lambda x: x) \
                .apply_values(lambda x: x.reduce(lambda x, y: x + y)) \
                .flatten()

        vertex1_output = self.generate_tmp_path()
        self._pipeline.write(vertex1, output.SequenceFile(vertex1_output))

        def _initializer(emitter):
            return []

        def _transformer(status, emitter, inp):
            import copy
            status.append(copy.deepcopy(inp))
            return status

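        # The finalizer deliberately divides by zero so the run fails while computing vertex2.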
        def _finalizer(status, emitter):
            emitter.emit(len(status) / 0)

        vertex2 = vertex1.group_by(lambda x: x[0] % 2, lambda x: x[1]) \
                .apply_values(lambda x: x.transform(_initializer, _transformer, _finalizer)) \
                .flatten()

        vertex2_output = self.generate_tmp_path()
        self._pipeline.write(vertex2, output.SequenceFile(vertex2_output))
        with self.assertRaises(ZeroDivisionError):
            self._pipeline.run()

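        # vertex1's output was committed before the failure, so a fresh local pipeline can still read it back.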
        from bigflow import base
        local_pipeline = base.Pipeline.create(
            'local', hadoop_config_path=self._hadoop_config_path)
        result = local_pipeline.read(input.SequenceFile(vertex1_output))
        self.assertItemsEqual([(1, 2), (2, 4), (3, 6), (4, 8), (5, 10)],
                              result.get())
Example #6
    def test_sequence_file_serde(self):
        """
        Case: test sequence file serde
        """
        data = self._pipeline.parallelize([1, 2, 3, 400, 5])
        local_file = self.generate_tmp_path()
        self._pipeline.write(data,
                output.SequenceFile(local_file, serde=serde.IntSerde()))
        self._pipeline.run()
        result = self._pipeline.read(
                input.SequenceFile(local_file, serde=serde.IntSerde()))
        self.assertItemsEqual([1, 2, 3, 400, 5], result.get())
        # Reading the same file with a mismatched serde fails when the result is fetched.
        result_invalid = self._pipeline.read(
                input.SequenceFile(local_file, serde=serde.TupleSerde()))
        with self.assertRaises(error.BigflowRuntimeException):
            result_invalid.get()
Example #7
    def test_seq_file_new_api(self):
        """
        test sequence file new api
        """
        import os

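        # These serdes store each key as str(key + 1) and each value as str(value * 2), and invert that on read.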
        class KeySerde(serde.Serde):
            """value serde"""
            def serialize(self, obj):
                """serialize"""
                return str(obj + 1)

            def deserialize(self, buf):
                """deserialize"""
                return int(buf) - 1

        class ValueSerde(serde.Serde):
            """value serde"""
            def serialize(self, obj):
                """serialize"""
                return str(obj * 2)

            def deserialize(self, buf):
                """deserialize"""
                return int(buf) / 2

        tmp_file = "./.tmp/test_file_1"
        os.system("rm -rf " + tmp_file)
        input_data = [(2, 2), (1, 6)]
        d = self._pipeline.parallelize(input_data)
        self._pipeline.write(d, output.SequenceFile(
            tmp_file, key_serde=KeySerde(), value_serde=ValueSerde()))
        self._pipeline.run()

        read_data = self._pipeline.read(input.SequenceFile(
            tmp_file, key_serde=KeySerde(), value_serde=ValueSerde()))
        result_data = read_data.get()
        self.assertItemsEqual(input_data, result_data)