class iterable_loader(DataStreamProcessor): def __init__(self, iterable, name=None): super(iterable_loader, self).__init__() self.iterable = iterable self.name = name def handle_iterable(self): mode = None for x in self.iterable: if mode is None: assert isinstance(x, (dict, list)) mode = dict if isinstance(x, dict) else list assert isinstance(x, mode) if mode == dict: yield x else: yield dict(zip(('col{}'.format(i) for i in range(len(x))), x)) def process_datapackage(self, dp: Package): name = self.name if name is None: name = 'res_{}'.format(len(dp.resources) + 1) self.res = Resource(dict(name=name, path='{}.csv'.format(name)), storage=iterable_storage(self.handle_iterable())) self.res.infer() dp.descriptor.setdefault('resources', []).append(self.res.descriptor) return dp def process_resources(self, resources): yield from super(iterable_loader, self).process_resources(resources) yield self.res.iter(keyed=True)
class load(DataStreamProcessor): def __init__(self, load_source, name=None, resources=None, validate=False, strip=True, **options): super(load, self).__init__() self.load_source = load_source self.options = options self.name = name self.resources = resources self.load_dp = None self.validate = validate self.strip = strip self.force_strings = options.get('force_strings') is True def process_datapackage(self, dp: Package): if isinstance(self.load_source, tuple): datapackage_descriptor, _ = self.load_source dp.descriptor.setdefault('resources', []) self.resource_matcher = ResourceMatcher(self.resources, datapackage_descriptor) for resource_descriptor in datapackage_descriptor['resources']: if self.resource_matcher.match(resource_descriptor['name']): dp.add_resource(resource_descriptor) else: # load_source is string: if self.load_source.startswith('env://'): env_var = self.load_source[6:] self.load_source = os.environ.get(env_var) if self.load_source is None: raise ValueError( f"Couldn't find value for env var '{env_var}'") if os.path.basename(self.load_source) == 'datapackage.json': self.load_dp = Package(self.load_source) self.resource_matcher = ResourceMatcher( self.resources, self.load_dp) dp.descriptor.setdefault('resources', []) for resource in self.load_dp.resources: if self.resource_matcher.match(resource.name): dp.add_resource(resource.descriptor) else: if os.path.exists(self.load_source): base_path = os.path.dirname(self.load_source) or '.' self.load_source = os.path.basename(self.load_source) else: base_path = None descriptor = dict(path=self.load_source, profile='tabular-data-resource') descriptor['format'] = self.options.get('format') if 'encoding' in self.options: descriptor['encoding'] = self.options['encoding'] if descriptor['format'] == 'xml' or self.load_source.endswith( '.xml'): self.options.setdefault('custom_parsers', {})['xml'] = XMLParser self.options.setdefault('ignore_blank_headers', True) self.options.setdefault('headers', 1) self.res = Resource(descriptor, base_path=base_path, **self.options) self.res.infer(confidence=1, limit=1000) if self.name is not None: self.res.descriptor['name'] = self.name if self.force_strings: for f in self.res.descriptor['schema']['fields']: f['type'] = 'string' self.res.commit() self.res.descriptor['path'] = '{name}.{format}'.format( **self.res.descriptor) dp.add_resource(self.res.descriptor) return dp def stripper(self, iterator): for r in iterator: yield dict((k, v.strip()) if isinstance(v, str) else (k, v) for k, v in r.items()) def process_resources(self, resources): yield from super(load, self).process_resources(resources) if isinstance(self.load_source, tuple): datapackage_descriptor, resources = self.load_source yield from (resource for resource, descriptor in zip( resources, datapackage_descriptor['resources']) if self.resource_matcher.match(descriptor['name'])) elif self.load_dp is not None: yield from (resource.iter(keyed=True) for resource in self.load_dp.resources if self.resource_matcher.match(resource.name)) else: it = self.res.iter(keyed=True, cast=False) if self.validate: it = schema_validator(self.res, it) if self.strip: it = self.stripper(it) yield it