示例#1
0
 def __init__(self, options):
     """Initialise the writer and build the Flatson flattener it uses.

     Reads the ``schema`` and ``odo_uri`` options (passing ``None`` as the
     second argument to ``read_option``), keeps the URI on the instance and
     logs the destination.
     """
     super(ODOWriter, self).__init__(options)
     # Function-scope import: flatson is loaded when a writer is created.
     from flatson import Flatson
     flatson_schema = self.read_option('schema', None)
     destination = self.read_option('odo_uri', None)
     self.odo_uri = destination
     self.flatson = Flatson(flatson_schema)
     message = 'ODOWriter has been initiated. Writing to: {}'.format(
         destination)
     self.logger.info(message)
示例#2
0
 def __init__(self, *args, **kwargs):
     """Initialise the transform and build a Flatson from ``flatson_schema``.

     All arguments are forwarded unchanged to the parent constructor.
     """
     # Function-scope import: flatson is loaded when a transform is created.
     from flatson import Flatson
     super(FlatsonTransform, self).__init__(*args, **kwargs)
     schema = self.read_option('flatson_schema')
     self.flatson_schema = schema
     self.flatson = Flatson(schema)
     message = 'FlatsonTransform has been initiated. Schema: {!r}'.format(
         schema)
     self.logger.info(message)
示例#3
0
    def test_disallow_overwriting_official_serialization_methods(self):
        """Registering a method named 'extract_first' must raise ValueError."""
        # given: a schema whose 'list' property asks for a custom method
        sample = {'first': 'hello', 'list': ['one', 'two']}
        schema = skinfer.generate_schema(sample)
        schema['properties']['list']['flatson_serialize'] = {
            'method': 'always_one',
        }

        # when / then: the registration attempt is rejected
        flatson = Flatson(schema=schema)
        self.assertRaises(
            ValueError,
            flatson.register_serialization_method,
            'extract_first',
            lambda _v, **kw: _v[2],
        )
示例#4
0
    def test_convert_object_with_simple_list_with_default_serialization(self):
        """Scalar lists flatten to compact JSON strings when no method is set."""
        contain_list = {
            'first': 'hello',
            'list': [1, 2, 3, 4],
            'list2': ['one', 'two'],
        }
        schema = skinfer.generate_schema(contain_list)

        f = Flatson(schema=schema)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(['first', 'list', 'list2'], f.fieldnames)
        self.assertEqual(['hello', '[1,2,3,4]', '["one","two"]'],
                         f.flatten(contain_list))
示例#5
0
    def test_lists_with_objects_with_default_serialization(self):
        """A list of objects flattens to one compact JSON string by default."""
        # given:
        schema = skinfer.generate_schema(SAMPLE_WITH_LIST_OF_OBJECTS)
        f = Flatson(schema=schema)

        # when:
        result = f.flatten(SAMPLE_WITH_LIST_OF_OBJECTS)

        # then:
        expected = '[{"key1":"value1","key2":"value2"},{"key1":"value3","key2":"value4"}]'
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', expected], result)
示例#6
0
    def test_lists_with_objects_with_default_serialization(self):
        """With no serialization method set, a list of objects becomes JSON."""
        # given:
        schema = skinfer.generate_schema(SAMPLE_WITH_LIST_OF_OBJECTS)
        f = Flatson(schema=schema)

        # when:
        result = f.flatten(SAMPLE_WITH_LIST_OF_OBJECTS)

        # then:
        expected = '[{"key1":"value1","key2":"value2"},{"key1":"value3","key2":"value4"}]'
        # Use assertEqual: the assertEquals alias is deprecated and no longer
        # exists in Python 3.12+.
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', expected], result)
示例#7
0
    def test_convert_object_with_simple_list_with_default_serialization(self):
        """Lists of scalars are emitted as compact JSON strings by default."""
        contain_list = {
            'first': 'hello',
            'list': [1, 2, 3, 4],
            'list2': ['one', 'two'],
        }
        schema = skinfer.generate_schema(contain_list)

        f = Flatson(schema=schema)
        # Use assertEqual: the assertEquals alias is deprecated and no longer
        # exists in Python 3.12+.
        self.assertEqual(['first', 'list', 'list2'], f.fieldnames)
        self.assertEqual(['hello', '[1,2,3,4]', '["one","two"]'],
                         f.flatten(contain_list))
示例#8
0
 def test_convert_nested_objects(self):
     """Nested object properties become dotted field names when flattened."""
     contain_nested_object = {
         'first': 'hello',
         'second': {
             'one': 1,
             'two': 2,
         }
     }
     schema = skinfer.generate_schema(contain_nested_object)
     f = Flatson(schema=schema)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(['first', 'second.one', 'second.two'], f.fieldnames)
     self.assertEqual(['hello', 1, 2], f.flatten(contain_nested_object))
示例#9
0
 def test_flatten_dict(self):
     """flatten_dict returns a dict keyed by the dotted field names."""
     contain_nested_object = {
         'first': 'hello',
         'second': {
             'one': 1,
             'two': 2,
         }
     }
     schema = skinfer.generate_schema(contain_nested_object)
     f = Flatson(schema=schema)
     expected = {'first': 'hello', 'second.one': 1, 'second.two': 2}
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(expected, f.flatten_dict(contain_nested_object))
示例#10
0
 def test_convert_nested_objects(self):
     """Flattening a nested object yields 'parent.child' style field names."""
     contain_nested_object = {
         'first': 'hello',
         'second': {
             'one': 1,
             'two': 2,
         }
     }
     schema = skinfer.generate_schema(contain_nested_object)
     f = Flatson(schema=schema)
     # Use assertEqual: the assertEquals alias is deprecated and no longer
     # exists in Python 3.12+.
     self.assertEqual(['first', 'second.one', 'second.two'], f.fieldnames)
     self.assertEqual(['hello', 1, 2], f.flatten(contain_nested_object))
示例#11
0
    def test_disallow_overwriting_official_serialization_methods(self):
        """Expect ValueError when registering under the name 'extract_first'."""
        # given: a schema whose 'list' property references a custom method
        sample_record = {'first': 'hello', 'list': ['one', 'two']}
        inferred_schema = skinfer.generate_schema(sample_record)
        list_property = inferred_schema['properties']['list']
        list_property['flatson_serialize'] = {'method': 'always_one'}

        # when / then:
        flatson = Flatson(schema=inferred_schema)
        with self.assertRaises(ValueError):
            flatson.register_serialization_method(
                'extract_first', lambda _v, **kw: _v[2])
示例#12
0
 def test_flatten_dict(self):
     """flatten_dict maps each dotted field name to its flattened value."""
     contain_nested_object = {
         'first': 'hello',
         'second': {
             'one': 1,
             'two': 2,
         }
     }
     schema = skinfer.generate_schema(contain_nested_object)
     f = Flatson(schema=schema)
     expected = {'first': 'hello', 'second.one': 1, 'second.two': 2}
     # Use assertEqual: the assertEquals alias is deprecated and no longer
     # exists in Python 3.12+.
     self.assertEqual(expected, f.flatten_dict(contain_nested_object))
示例#13
0
    def test_convert_object_with_nested_simple_list_with_default_serialization(self):
        """A scalar list inside a nested object is flattened to a JSON string."""
        contain_list = {
            'first': 'hello',
            'second': {
                'list1': [1, 2, 3, 4],
                'word': 'world',

            },
        }
        schema = skinfer.generate_schema(contain_list)
        f = Flatson(schema=schema)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(['first', 'second.list1', 'second.word'], f.fieldnames)
        self.assertEqual(['hello', '[1,2,3,4]', 'world'], f.flatten(contain_list))
示例#14
0
    def test_array_serialization_with_extract_key_values(self):
        """'extract_key_values' renders objects as 'k:v' pairs joined by ',' and ';'."""
        # given:
        schema = skinfer.generate_schema(SAMPLE_WITH_LIST_OF_OBJECTS)
        serialize_options = dict(method='extract_key_values')

        # when:
        schema['properties']['list']['flatson_serialize'] = serialize_options
        f = Flatson(schema=schema)
        result = f.flatten(SAMPLE_WITH_LIST_OF_OBJECTS)

        # then:
        expected = 'key1:value1,key2:value2;key1:value3,key2:value4'
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', expected], result)
示例#15
0
    def test_array_serialization_with_extract_key_values(self):
        """The 'extract_key_values' method yields 'key:value' pairs per object."""
        # given:
        schema = skinfer.generate_schema(SAMPLE_WITH_LIST_OF_OBJECTS)
        serialize_options = dict(method='extract_key_values')

        # when:
        schema['properties']['list']['flatson_serialize'] = serialize_options
        f = Flatson(schema=schema)
        result = f.flatten(SAMPLE_WITH_LIST_OF_OBJECTS)

        # then:
        expected = 'key1:value1,key2:value2;key1:value3,key2:value4'
        # Use assertEqual: the assertEquals alias is deprecated and no longer
        # exists in Python 3.12+.
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', expected], result)
示例#16
0
 def test_convert_object_with_nested_simple_list_with_default_serialization(
         self):
     """A scalar list nested in an object flattens to a compact JSON string."""
     contain_list = {
         'first': 'hello',
         'second': {
             'list1': [1, 2, 3, 4],
             'word': 'world',
         },
     }
     schema = skinfer.generate_schema(contain_list)
     f = Flatson(schema=schema)
     # Use assertEqual: the assertEquals alias is deprecated and no longer
     # exists in Python 3.12+.
     self.assertEqual(['first', 'second.list1', 'second.word'],
                      f.fieldnames)
     self.assertEqual(['hello', '[1,2,3,4]', 'world'],
                      f.flatten(contain_list))
示例#17
0
    def test_register_custom_serialization_method(self):
        """A method registered by name is used for properties that request it."""
        # given:
        sample = {'first': 'hello', 'list': ['one', 'two']}
        schema = skinfer.generate_schema(sample)
        serialize_options = dict(method='always_one')
        schema['properties']['list']['flatson_serialize'] = serialize_options

        # when:
        f = Flatson(schema=schema)
        f.register_serialization_method('always_one', lambda _v, **kw: '1')
        result = f.flatten(sample)

        # then:
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', '1'], result)
示例#18
0
    def test_register_custom_serialization_method(self):
        """The custom 'always_one' method replaces the list value with '1'."""
        # given:
        sample = {'first': 'hello', 'list': ['one', 'two']}
        schema = skinfer.generate_schema(sample)
        serialize_options = dict(method='always_one')
        schema['properties']['list']['flatson_serialize'] = serialize_options

        # when:
        f = Flatson(schema=schema)
        f.register_serialization_method('always_one', lambda _v, **kw: '1')
        result = f.flatten(sample)

        # then:
        # Use assertEqual: the assertEquals alias is deprecated and no longer
        # exists in Python 3.12+.
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', '1'], result)
示例#19
0
class FlatsonTransform(BaseTransform):
    """
    Flattens a JSON-like dataset into flat CSV-like tables using the
    Flatson library. See the Flatson
    `official documentation
    <http://flatson.readthedocs.io/en/latest/readme.html>`_.

        - flatson_schema (dict)
            Valid Flatson schema
    """
    # Options accepted by this transform module
    supported_options = {'flatson_schema': {'type': dict}}

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent and build the flattener."""
        # Function-scope import: flatson is loaded when a transform is created.
        from flatson import Flatson
        super(FlatsonTransform, self).__init__(*args, **kwargs)
        schema = self.read_option('flatson_schema')
        self.flatson_schema = schema
        self.flatson = Flatson(schema)
        message = 'FlatsonTransform has been initiated. Schema: {!r}'.format(
            schema)
        self.logger.info(message)

    def transform_batch(self, batch):
        """Yield the ``flatten_dict`` result for every record in ``batch``."""
        flatten = self.flatson.flatten_dict
        for item in batch:
            yield flatten(item)
示例#20
0
class FlatsonTransform(BaseTransform):
    """
    Turns a JSON-like dataset into flat CSV-like tables by means of the
    Flatson library; for details see the Flatson
    `official documentation
    <http://flatson.readthedocs.io/en/latest/readme.html>`_.

        - flatson_schema (dict)
            Valid Flatson schema
    """
    # Options used to configure the transform module
    supported_options = {
        'flatson_schema': {'type': dict},
    }

    def __init__(self, *args, **kwargs):
        """Pass all arguments to the parent, then create the Flatson object."""
        # Function-scope import: flatson is loaded when a transform is created.
        from flatson import Flatson
        super(FlatsonTransform, self).__init__(*args, **kwargs)
        configured_schema = self.read_option('flatson_schema')
        self.flatson_schema = configured_schema
        self.flatson = Flatson(configured_schema)
        log_line = 'FlatsonTransform has been initiated. Schema: {!r}'.format(
            configured_schema)
        self.logger.info(log_line)

    def transform_batch(self, batch):
        """Generate one flattened dict for each record of ``batch``."""
        to_flat_dict = self.flatson.flatten_dict
        for entry in batch:
            yield to_flat_dict(entry)
示例#21
0
 def __init__(self, options):
     """Create the writer, reading ``schema`` and ``odo_uri`` from options.

     A Flatson instance is built from the schema, and the target URI is
     stored on the instance and written to the log.
     """
     super(ODOWriter, self).__init__(options)
     # Function-scope import: flatson is loaded when a writer is created.
     from flatson import Flatson
     configured_schema = self.read_option('schema', None)
     target_uri = self.read_option('odo_uri', None)
     self.odo_uri = target_uri
     self.flatson = Flatson(configured_schema)
     log_line = 'ODOWriter has been initiated. Writing to: {}'.format(
         target_uri)
     self.logger.info(log_line)
示例#22
0
 def __init__(self, *args, **kwargs):
     """Create the transform and its Flatson flattener.

     Arguments are passed through to the parent constructor; the schema is
     taken from the ``flatson_schema`` option.
     """
     # Function-scope import: flatson is loaded when a transform is created.
     from flatson import Flatson
     super(FlatsonTransform, self).__init__(*args, **kwargs)
     configured_schema = self.read_option('flatson_schema')
     self.flatson_schema = configured_schema
     self.flatson = Flatson(configured_schema)
     log_line = 'FlatsonTransform has been initiated. Schema: {!r}'.format(
         configured_schema)
     self.logger.info(log_line)
示例#23
0
    def test_create_from_schemafile(self):
        """Flatson.from_schemafile loads the schema stored in a JSON file."""
        fd, fname = tempfile.mkstemp()
        try:
            # mkstemp returns an already-open OS-level descriptor; wrapping it
            # with os.fdopen closes it with the file object instead of
            # leaking it.
            with os.fdopen(fd, 'w') as f:
                json.dump(SIMPLE_SCHEMA, f)

            obj = Flatson.from_schemafile(fname)
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed from unittest in Python 3.12.
            self.assertEqual(SIMPLE_SCHEMA, obj.schema)
        finally:
            os.remove(fname)
示例#24
0
    def test_create_from_schemafile(self):
        """A schema dumped to a temp file is read back by from_schemafile."""
        fd, fname = tempfile.mkstemp()
        try:
            # The descriptor from mkstemp is open on return; writing through
            # os.fdopen ensures it is closed rather than leaked.
            with os.fdopen(fd, 'w') as f:
                json.dump(SIMPLE_SCHEMA, f)

            obj = Flatson.from_schemafile(fname)
            # Use assertEqual: the assertEquals alias is deprecated and no
            # longer exists in Python 3.12+.
            self.assertEqual(SIMPLE_SCHEMA, obj.schema)
        finally:
            os.remove(fname)
示例#25
0
    def test_convert_object_with_simple_list_with_join_serialization(self):
        """'join_values' joins list items with ',' or a configured separator."""
        # given:
        contain_list = {
            'first': 'hello',
            'list': [1, 2, 3, 4],
            'list2': ['one', 'two'],
        }
        schema = skinfer.generate_schema(contain_list)
        serialize_options = dict(method='join_values')
        schema['properties']['list']['flatson_serialize'] = serialize_options

        # when:
        f = Flatson(schema=schema)

        # then:
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(['first', 'list', 'list2'], f.fieldnames)
        self.assertEqual(['hello', '1,2,3,4', '["one","two"]'],
                         f.flatten(contain_list))

        # and when:
        schema['properties']['list']['flatson_serialize']['separator'] = '+'
        f = Flatson(schema=schema)

        # then:
        self.assertEqual(['hello', '1+2+3+4', '["one","two"]'],
                         f.flatten(contain_list))
示例#26
0
    def test_array_serialization_with_extract_first(self):
        """'extract_first' yields the first list item, or None when empty."""
        # given:
        sample = {'first': 'hello', 'list': ['one', 'two']}
        schema = skinfer.generate_schema(sample)
        serialize_options = dict(method='extract_first')
        schema['properties']['list']['flatson_serialize'] = serialize_options

        # when:
        f = Flatson(schema=schema)
        result = f.flatten(sample)

        # then:
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', 'one'], result)

        # and when:
        sample2 = {'first': 'hello', 'list': []}
        result = f.flatten(sample2)

        # then:
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', None], result)
示例#27
0
    def test_array_serialization_with_extract_first(self):
        """With 'extract_first', a list flattens to its first item or None."""
        # given:
        sample = {'first': 'hello', 'list': ['one', 'two']}
        schema = skinfer.generate_schema(sample)
        serialize_options = dict(method='extract_first')
        schema['properties']['list']['flatson_serialize'] = serialize_options

        # when:
        f = Flatson(schema=schema)
        result = f.flatten(sample)

        # then:
        # Use assertEqual: the assertEquals alias is deprecated and no longer
        # exists in Python 3.12+.
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', 'one'], result)

        # and when:
        sample2 = {'first': 'hello', 'list': []}
        result = f.flatten(sample2)

        # then:
        self.assertEqual(['first', 'list'], f.fieldnames)
        self.assertEqual(['hello', None], result)
示例#28
0
    def test_convert_object_with_simple_list_with_join_serialization(self):
        """The 'join_values' method joins items with ',' unless a separator is set."""
        # given:
        contain_list = {
            'first': 'hello',
            'list': [1, 2, 3, 4],
            'list2': ['one', 'two'],
        }
        schema = skinfer.generate_schema(contain_list)
        serialize_options = dict(method='join_values')
        schema['properties']['list']['flatson_serialize'] = serialize_options

        # when:
        f = Flatson(schema=schema)

        # then:
        # Use assertEqual: the assertEquals alias is deprecated and no longer
        # exists in Python 3.12+.
        self.assertEqual(['first', 'list', 'list2'], f.fieldnames)
        self.assertEqual(['hello', '1,2,3,4', '["one","two"]'], f.flatten(contain_list))

        # and when:
        schema['properties']['list']['flatson_serialize']['separator'] = '+'
        f = Flatson(schema=schema)

        # then:
        self.assertEqual(['hello', '1+2+3+4', '["one","two"]'], f.flatten(contain_list))
示例#29
0
class ODOWriter(BaseWriter):
    """
    Writes items to a odo destination. https://odo.readthedocs.org/en/latest/

    Needed parameters:

        - schema (object)
            schema object.

        - odo_uri (str)
            ODO valid destination uri.
    """

    requirements = {
        'schema': {
            'type': object,
            'required': True
        },
        'odo_uri': {
            'type': six.string_types,
            'required': True
        }
    }

    def __init__(self, options):
        """Read the ``schema`` and ``odo_uri`` options and build the flattener."""
        super(ODOWriter, self).__init__(options)
        from flatson import Flatson
        schema = self.read_option('schema', None)
        self.odo_uri = self.read_option('odo_uri', None)
        self.flatson = Flatson(schema)
        self.logger.info('ODOWriter has been initiated. Writing to: {}'.format(
            self.odo_uri))

    @retry_long
    def write(self, dump_path, group_key=''):
        """Load a gzipped JSON-lines dump, flatten it and send it through odo.

        Each line of ``dump_path`` is parsed as one JSON document, flattened
        with the configured Flatson instance and collected into a pandas
        DataFrame that is handed to ``odo`` with ``self.odo_uri`` as target.

        ``group_key`` is accepted but not used by this method.
        """
        from odo import odo, resource, discover
        import pandas as pd
        with gzip.open(dump_path) as dump_file:
            # gzip.open defaults to binary mode, so lines are bytes on
            # Python 3 and calling .replace('\n', '') on them raises
            # TypeError. Decoding works on Python 2 and 3 alike, and
            # json.loads ignores the trailing newline, so no stripping is
            # needed. Iterating the file avoids the readlines() copy.
            flattened_lines = [
                self.flatson.flatten(json.loads(raw_line.decode('utf-8')))
                for raw_line in dump_file
            ]
        pf = pd.DataFrame(flattened_lines, columns=self.flatson.fieldnames)
        dshape = discover(pf)
        odo(pf, resource(self.odo_uri), dshape=dshape)
示例#30
0
class ODOWriter(BaseWriter):
    """
    Writes items to a odo destination. https://odo.readthedocs.org/en/latest/

    Needed parameters:

        - schema (object)
            schema object.

        - odo_uri (str)
            ODO valid destination uri.
    """

    # NOTE(review): ``basestring`` only exists on Python 2; switching to a
    # portable string type needs a file-level import, so it is left as is.
    requirements = {
        'schema': {'type': object, 'required': True},
        'odo_uri': {'type': basestring, 'required': True}
    }

    def __init__(self, options):
        """Read the ``schema`` and ``odo_uri`` options and build the flattener."""
        super(ODOWriter, self).__init__(options)
        from flatson import Flatson
        schema = self.read_option('schema', None)
        self.odo_uri = self.read_option('odo_uri', None)
        self.flatson = Flatson(schema)
        self.logger.info('ODOWriter has been initiated. Writing to: {}'.format(self.odo_uri))

    @retry_long
    def write(self, dump_path, group_key=''):
        """Load a gzipped JSON-lines dump, flatten it and send it through odo.

        Every line of ``dump_path`` is parsed as a JSON document, flattened
        with the configured Flatson instance and gathered into a pandas
        DataFrame, which is passed to ``odo`` with ``self.odo_uri`` as target.

        ``group_key`` is accepted but not used by this method.
        """
        from odo import odo, resource, discover
        import pandas as pd
        with gzip.open(dump_path) as dump_file:
            # gzip.open defaults to binary mode; decoding each line keeps the
            # parsing valid for bytes as well as Python 2 str. json.loads
            # ignores the trailing newline, so the former replace() call is
            # unnecessary, and iterating the file avoids the readlines() copy.
            flattened_lines = [
                self.flatson.flatten(json.loads(raw_line.decode('utf-8')))
                for raw_line in dump_file
            ]
        pf = pd.DataFrame(flattened_lines, columns=self.flatson.fieldnames)
        dshape = discover(pf)
        odo(pf, resource(self.odo_uri), dshape=dshape)
示例#31
0
 def test_convert_simple_objects(self):
     """A flat schema yields one field; a missing property flattens to None."""
     f = Flatson(schema=SIMPLE_SCHEMA)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(['a_prop'], f.fieldnames)
     self.assertEqual(['a_value'], f.flatten({'a_prop': 'a_value'}))
     self.assertEqual([None], f.flatten({}))
示例#32
0
 def test_when_no_declared_properties_flatten_empty_list(self):
     """With EMPTY_SCHEMA, flatten returns an empty list for any input."""
     f = Flatson(schema=EMPTY_SCHEMA)
     result = f.flatten({'a_prop': 'a_value'})
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual([], result)
示例#33
0
 def test_when_no_declared_properties_flatten_empty_list(self):
     """Flattening against EMPTY_SCHEMA produces no values at all."""
     f = Flatson(schema=EMPTY_SCHEMA)
     result = f.flatten({'a_prop': 'a_value'})
     # Use assertEqual: the assertEquals alias is deprecated and no longer
     # exists in Python 3.12+.
     self.assertEqual([], result)
示例#34
0
 def test_no_support_for_list_objects(self):
     """Constructing a Flatson from LIST_SCHEMA must raise ValueError."""
     self.assertRaises(ValueError, Flatson, schema=LIST_SCHEMA)
示例#35
0
 def test_create(self):
     """A Flatson instance exposes the schema it was created with."""
     f = Flatson(schema=SIMPLE_SCHEMA)
     # A bare assert is stripped when Python runs with -O; assertEqual always
     # executes and matches the assertion style of the sibling tests.
     self.assertEqual(SIMPLE_SCHEMA, f.schema)
示例#36
0
 def test_convert_simple_objects(self):
     """SIMPLE_SCHEMA gives the single field 'a_prop'; absent values are None."""
     f = Flatson(schema=SIMPLE_SCHEMA)
     # Use assertEqual: the assertEquals alias is deprecated and no longer
     # exists in Python 3.12+.
     self.assertEqual(['a_prop'], f.fieldnames)
     self.assertEqual(['a_value'], f.flatten({'a_prop': 'a_value'}))
     self.assertEqual([None], f.flatten({}))