def process_row(row, row_index, spec, resource_index, parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource_name'])
    if resource_matcher.match(spec['name']):
        clean_field_code = parameters['clean_field_code']
        clean_field_name = parameters['clean_field_name']
        raw_field = parameters['raw_field']
        raw_field_value = row[raw_field]
        if not raw_field_value:
            return
        clean_value_code = None
        clean_value_name = None
        ret = fw_process.extractOne(raw_field_value, all_country_names,
                                    score_cutoff=80)
        if ret is not None:
            country, score = ret
            if country in all_country_initials:
                country = all_country_initials[country]
            try:
                country = pycountry.countries.lookup(country)
                clean_value_code = country.alpha_3
                clean_value_name = country.name
            except LookupError:
                # Ignore values we don't know how to clean
                pass
        row[clean_field_code] = clean_value_code
        row[clean_field_name] = clean_value_name
    return row
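# A minimal sketch of the fuzzy-matching step used above, assuming
# fw_process is fuzzywuzzy's `process` module and all_country_names is
# built from pycountry (both are assumptions; the snippet above does
# not show its imports):
import pycountry
from fuzzywuzzy import process as fw_process

all_country_names = [c.name for c in pycountry.countries]

ret = fw_process.extractOne('Untied States', all_country_names, score_cutoff=80)
if ret is not None:
    matched_name, score = ret
    country = pycountry.countries.lookup(matched_name)
    print(country.alpha_3, country.name)  # USA United States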
def __call__(self):
    url = self.parameters['url']
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency
    resource = self.parameters['resource']
    stream = self.parameters.get('stream', True)
    name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
    resource_index = resource if isinstance(resource, int) else None

    selected_resources = []
    found = False
    dp = datapackage.DataPackage(url)
    dp = self.process_datapackage(dp)
    for i, orig_res in enumerate(dp.resources):
        if resource_index == i or \
                (name_matcher is not None and
                 name_matcher.match(orig_res.descriptor.get('name'))):
            found = True
            orig_res.descriptor[PROP_STREAMED_FROM] = orig_res.source
            self.dp['resources'].append(orig_res.descriptor)
            if tabular(orig_res.descriptor) and stream:
                orig_res.descriptor[PROP_STREAMING] = True
                selected_resources.append(orig_res.iter(keyed=True))
            else:
                orig_res.descriptor[PROP_STREAMING] = False
    assert found, "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, selected_resources))
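# How the name_matcher / resource_index pair above behaves, assuming
# the datapackage-pipelines ResourceMatcher (where a None filter
# matches every resource and comma-separated names are accepted):
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

matcher = ResourceMatcher('sales,inventory')
assert matcher.match('sales')
assert not matcher.match('customers')
assert ResourceMatcher(None).match('anything')  # no filter matches all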
def process_row(row, row_index, spec, resource_index, parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource-name'])
    if resource_matcher.match(spec['name']):
        fingerprint_field = parameters['fingerprint-field']
        name_field = parameters['name-field']
        row[fingerprint_field] = slugify(row[name_field], to_lower=True)
    return row
def modify_datapackage(dp, parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource-name'])
    for res in dp['resources']:
        if resource_matcher.match(res['name']):
            res['schema']['fields'].extend([
                {
                    'name': parameters['fingerprint-field'],
                    'type': 'string'
                },
            ])
    return dp
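# What the fingerprint values look like, assuming slugify here is the
# awesome-slugify function re-exported by datapackage_pipelines
# (the input strings are hypothetical):
from datapackage_pipelines.generators import slugify

print(slugify('Acme Corp.', to_lower=True))  # e.g. 'acme-corp'
print(slugify('ACME  CORP', to_lower=True))  # same slug, so both names
                                             # share one fingerprint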
def __call__(self):
    url = self.parameters['url']
    limit_rows = self.parameters.get('limit-rows')
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency
    resource = self.parameters['resource']
    stream = self.parameters.get('stream', True)
    name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
    resource_index = resource if isinstance(resource, int) else None

    selected_resources = []
    found = False
    dp = datapackage.DataPackage(url)
    dp = self.process_datapackage(dp)
    for i, orig_res in enumerate(dp.resources):
        if resource_index == i or \
                (name_matcher is not None and
                 name_matcher.match(orig_res.descriptor.get('name'))):
            found = True
            desc = copy.deepcopy(orig_res.descriptor)
            if 'primaryKey' in desc.get('schema', {}):
                # Avoid duplication checks
                del orig_res.descriptor['schema']['primaryKey']
                orig_res.commit()
            desc[PROP_STREAMED_FROM] = orig_res.source
            self.dp['resources'].append(desc)
            if tabular(desc) and stream:
                desc[PROP_STREAMING] = True
                orig_res_iter = orig_res.iter(keyed=True)
                if limit_rows:
                    orig_res_iter = itertools.islice(orig_res_iter, limit_rows)
                selected_resources.append(orig_res_iter)
            else:
                desc[PROP_STREAMING] = False
    assert found, "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, selected_resources))
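# The limit-rows behaviour above in isolation: itertools.islice caps
# how many rows are drawn from the keyed row iterator.
import itertools

rows = iter({'id': i} for i in range(10))
assert list(itertools.islice(rows, 3)) == [{'id': 0}, {'id': 1}, {'id': 2}]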
def __init__(self, ingest_response=None,
             default_input_resource=None,
             default_output_resource=None,
             default_replace_resource=True,
             table_schema=None,
             resource_filter=None):
    if not ingest_response:
        ingest_response = ingest()
    self.parameters, self.datapackage, self.resource_iterator = ingest_response
    self.set_default_parameters(default_input_resource,
                                default_output_resource,
                                default_replace_resource)
    self._resource_filter_param = resource_filter
    self.input_resource_matcher = ResourceMatcher(
        self.parameters["input_resource"])
    self.output_resource_name = self.parameters["output_resource"]
    self.output_resource_descriptor = {
        "name": self.output_resource_name,
        PROP_STREAMING: True,
        "path": "data/{}.csv".format(self.output_resource_name),
        "schema": table_schema
    }
import itertools

import datapackage

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher
from datapackage_pipelines.utilities.resources import tabular, PROP_STREAMING

parameters, dp, res_iter = ingest()

url = parameters['url']
resource = parameters['resource']

name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
resource_index = resource if isinstance(resource, int) else None

selected_resources = []
found = False
source_dp = datapackage.DataPackage(url)
for i, orig_res in enumerate(source_dp.resources):
    if resource_index == i or \
            (name_matcher is not None and
             name_matcher.match(orig_res.descriptor.get('name'))):
        found = True
        dp['resources'].append(orig_res.descriptor)
        if tabular(orig_res.descriptor):
            orig_res.descriptor[PROP_STREAMING] = True
            selected_resources.append(orig_res.iter(keyed=True))

assert found, "Failed to find resource with index or name matching %r" % resource

spew(dp, itertools.chain(res_iter, selected_resources))
from datetime import date
from decimal import Decimal

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

parameters, dp, res_iter = ingest()

resource_matcher = ResourceMatcher(parameters.get('resource'))
key = parameters['key']
collated_field_name = parameters['collated-field-name']

assert isinstance(key, list)

for res in dp['resources']:
    if resource_matcher.match(res['name']):
        # Split the schema into the key fields (kept at the top level)
        # and the remaining fields (nested under the collated field).
        outer_fields = []
        inner_fields = []
        for field in res['schema']['fields']:
            if field['name'] in key:
                outer_fields.append(field)
            else:
                inner_fields.append(field)
        outer_fields.append({
            'name': collated_field_name,
            'type': 'object',
            'es:schema': {
                'fields': inner_fields
            }
        })
        schema = {
            'fields': outer_fields,
            'primaryKey': key,  # assumed continuation; the source snippet is truncated here
        }
import copy
import re

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

parameters, datapackage, resource_iterator = ingest()

resources = ResourceMatcher(parameters.get('resources'))
unpivot_fields = parameters.get('unpivot')
extra_keys = parameters.get('extraKeyFields')
extra_value = parameters.get('extraValueField')


def match_fields(field_name_re, expected):
    def filt(field):
        return (field_name_re.fullmatch(field['name']) is not None) is expected
    return filt


def process_datapackage(datapackage_):
    unpivot_fields_without_regex = []
    for resource in datapackage_['resources']:
        name = resource['name']
        if not resources.match(name):
            continue
        if 'schema' not in resource:
            continue
        fields = resource['schema'].get('fields', [])
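# Illustrative use of match_fields from the snippet above (the field
# names are hypothetical): the returned predicate keeps, or drops,
# fields whose names fully match the compiled pattern.
import re

keep_years = match_fields(re.compile(r'\d{4}'), expected=True)
assert keep_years({'name': '2019'}) is True
assert keep_years({'name': 'country'}) is False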
def __call__(self):
    self.parameters['resource'] = self.parameters['resource-name']
    kv_cache = self.parameters.get('kv-cache', False)
    kv_path = self.parameters['kv-path']
    url = self.parameters['url']
    limit_rows = self.parameters.get('limit-rows')
    log_progress_rows = self.parameters.get('log-progress-rows')
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency
    stream = self.parameters.get('stream', True)
    required = self.parameters.get('required', True)
    resource = self.parameters.get('resource')
    resources = self.parameters.get('resources')
    if resource is not None:
        assert not resources
        resource_index = resource if isinstance(resource, int) else None
    else:
        assert resources
        resource_index = None
        resource = list(resources.keys())
    name_matcher = (ResourceMatcher(resource)
                    if isinstance(resource, (str, list)) else None)

    selected_resources = []
    found = False
    try:
        dp = datapackage.DataPackage(url)
    except Exception:
        if required:
            raise
        else:
            dp = None
    if dp:
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and
                     name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                desc = copy.deepcopy(orig_res.descriptor)
                if 'primaryKey' in desc.get('schema', {}):
                    # Avoid duplication checks
                    del orig_res.descriptor['schema']['primaryKey']
                    orig_res.commit()
                desc[PROP_STREAMED_FROM] = orig_res.source
                if resources:
                    desc.update(resources[desc['name']])
                self.dp['resources'].append(desc)
                if tabular(desc) and stream:
                    desc[PROP_STREAMING] = True
                    if kv_cache and os.path.exists(kv_path):
                        # A cache file already exists: serve rows from it.
                        kv = PersistentKVFile(kv_path, concurrent=True)
                        orig_res_iter = kv_res_iter(
                            kv, kv_key=self.parameters.get('kv-key'))
                    else:
                        # No usable cache: stream the source rows through
                        # the key-value file so later runs can reuse them.
                        kv = PersistentKVFile(kv_path, concurrent=True)
                        orig_res_iter = kv_res_iter(
                            kv, orig_res.iter(keyed=True),
                            kv_key=self.parameters.get('kv-key'))
                    if limit_rows:
                        orig_res_iter = itertools.islice(
                            orig_res_iter, limit_rows)
                    if log_progress_rows:
                        orig_res_iter = progress_logger(
                            orig_res_iter, log_progress_rows)
                    selected_resources.append(orig_res_iter)
                else:
                    desc[PROP_STREAMING] = False
    assert found or not required, \
        "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, selected_resources))
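# Illustrative shape of the 'resources' parameter consumed above (the
# names and overrides are hypothetical): keys select resources by
# name, values are descriptor overrides merged via desc.update(...).
parameters = {
    'url': 'dependency://./prepare-data',
    'resources': {
        'sales': {'path': 'data/sales-clean.csv'},
        'inventory': {},
    },
}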
import collections
import logging

from datapackage_pipelines.wrapper import spew, ingest
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

log = logging.getLogger(__name__)

parameters, datapackage, res_iter = ingest()

resource_name = parameters['name']
resources_matcher = ResourceMatcher(resource_name)

datapackage['resources'] = [
    res for res in datapackage['resources']
    if not resources_matcher.match(res['name'])
]


def process_resources(res_iter_):
    # Iterate with a for loop rather than `while True: next(...)`, which
    # would leak StopIteration out of the generator (an error under PEP 479).
    for resource_ in res_iter_:
        if resources_matcher.match(resource_.spec['name']):
            # This is the one we're deleting, empty the iterator.
            collections.deque(resource_, maxlen=0)
        else:
            yield resource_


spew(datapackage, process_resources(res_iter))
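# The deque(..., maxlen=0) call above is the stdlib idiom for draining
# an iterator without keeping any of its items:
import collections

it = iter(range(5))
collections.deque(it, maxlen=0)
assert next(it, None) is None  # the iterator is exhausted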
    _resource['schema'] = schema

    close()
    del stream

    return itertools \
        .islice(
            _reader(
                get_opener(_url, _resource),
                _url),
            1, None)


parameters, datapackage, resource_iterator = ingest()

resources = ResourceMatcher(parameters.get('resources'))
ignore_missing = parameters.get('ignore-missing', False)

new_resource_iterator = []
for resource in datapackage['resources']:
    if streamable(resource):
        url = resource[PROP_STREAMED_FROM]

        name = resource['name']
        if not resources.match(name):
            continue

        path = get_path(resource)
        if path is None or path == PATH_PLACEHOLDER:
            path = os.path.join('data', name + '.csv')
import itertools

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

parameters, datapackage, resource_iterator = ingest()

sources = ResourceMatcher(parameters.get('sources'))

target = parameters.get('target', {})
if 'name' not in target:
    target['name'] = 'concat'
if 'path' not in target:
    target['path'] = 'data/' + target['name'] + '.csv'
target.update(dict(
    mediatype='text/csv',
    schema=dict(fields=[], primaryKey=[]),
))

fields = parameters['fields']

# Create a mapping from source field names to target field names
field_mapping = {}
for target_field, source_fields in fields.items():
    if source_fields is not None:
        for source_field in source_fields:
            if source_field in field_mapping:
                raise RuntimeError('Duplicate appearance of %s (%r)' %
                                   (source_field, field_mapping))
            field_mapping[source_field] = target_field

    if target_field in field_mapping:
        raise RuntimeError('Duplicate appearance of %s' % target_field)
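# A worked example of the mapping loop above (the 'fields' values are
# hypothetical): each target field maps from its listed source
# spellings, and None means only the identically-named source field
# is expected.
fields = {
    'country': ['country_name', 'nation'],
    'year': None,
}
# Running the loop above over this dict yields:
# field_mapping == {'country_name': 'country', 'nation': 'country'}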
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.kvstore import KVStore
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher


class KeyCalc(object):

    def __init__(self, key_spec):
        self.key_spec = key_spec

    def __call__(self, row):
        return self.key_spec.format(**row)


parameters, datapackage, resource_iterator = ingest()

resources = ResourceMatcher(parameters['resources'])
key_calc = KeyCalc(parameters['sort-by'])


def sorter(resource):
    db = KVStore()
    for row_num, row in enumerate(resource):
        # Suffix the key with the row number so equal keys stay unique
        # and the sort remains stable.
        key = key_calc(row) + "{:08x}".format(row_num)
        db[key] = row
    for key in db.keys():
        yield db[key]


def new_resource_iterator(resource_iterator_):
    for resource in resource_iterator_:
        if resources.match(resource.spec['name']):
            yield sorter(resource)  # assumed continuation; the source snippet is truncated here
        else:
            yield resource
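# KeyCalc in isolation: the 'sort-by' parameter is a str.format
# template rendered against each row, so rows sort by the rendered key
# (the row below is hypothetical).
key_calc = KeyCalc('{year}-{month:02d}')
assert key_calc({'year': 2020, 'month': 3}) == '2020-03'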
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.generators import slugify
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

parameters, dp, res_iter = ingest()

resource_name = parameters['resource-name']
resource_matcher = ResourceMatcher(resource_name)
source_fields = parameters['source-fields']
name_field = parameters['name-field']
fingerprint_field = parameters['fingerprint-field']


def process_resource(res):
    all_fingerprints = set()
    for row in res:
        name = None
        for src_field in source_fields:
            src_value = row[src_field]
            if src_value:
                # The first non-empty source value becomes the canonical name.
                if name is None:
                    name = src_value
                fingerprint = slugify(src_value, to_lower=True)
                if fingerprint in all_fingerprints:
                    continue
                all_fingerprints.add(fingerprint)
                yield {name_field: name, fingerprint_field: fingerprint}


def process_resources(resources):
    for res in resources:
        # Assumed continuation (the source snippet is truncated here),
        # following the pattern of the other processors in this file:
        if resource_matcher.match(res.spec['name']):
            yield process_resource(res)
        else:
            yield res
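# A small sketch of the deduplication above (rows, field names, and the
# exact slug value are hypothetical): both rows slugify to the same
# fingerprint, so only the first produces an output row.
#
#   source_fields = ['title']
#   rows = [{'title': 'Acme Corp.'}, {'title': 'ACME corp'}]
#   -> {name_field: 'Acme Corp.', fingerprint_field: 'acme-corp'}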
            yield process_resource(res, afield, tfield)
        else:
            yield res


def modify_datapackage(dp, resource_matcher, afield, tfield):
    for res in dp['resources']:
        if not resource_matcher.match(res['name']):
            continue
        field = [
            f for f in res['schema']['fields']
            if f['name'] == afield
        ][0]
        fields = [
            f for f in res['schema']['fields']
            if f['name'] != afield
        ]
        fields.append({
            'name': tfield,
            'type': field.get('es:itemType', 'string'),
        })
        res['schema']['fields'] = fields
    return dp


if __name__ == '__main__':
    parameters, dp, res_iter = ingest()
    resource_matcher = ResourceMatcher(parameters.get('resource'))
    afield, tfield = parameters['array-field'], parameters['unwound-field']
    spew(modify_datapackage(dp, resource_matcher, afield, tfield),
         process_resources(res_iter, resource_matcher, afield, tfield))
class ResourceFilterProcessor(object):

    def __init__(self, ingest_response=None,
                 default_input_resource=None,
                 default_output_resource=None,
                 default_replace_resource=True,
                 table_schema=None,
                 resource_filter=None):
        if not ingest_response:
            ingest_response = ingest()
        self.parameters, self.datapackage, self.resource_iterator = ingest_response
        self.set_default_parameters(default_input_resource,
                                    default_output_resource,
                                    default_replace_resource)
        self._resource_filter_param = resource_filter
        self.input_resource_matcher = ResourceMatcher(
            self.parameters["input_resource"])
        self.output_resource_name = self.parameters["output_resource"]
        self.output_resource_descriptor = {
            "name": self.output_resource_name,
            PROP_STREAMING: True,
            "path": "data/{}.csv".format(self.output_resource_name),
            "schema": table_schema
        }

    def set_default_parameters(self,
                               default_input_resource,
                               default_output_resource,
                               default_replace_resource):
        self.parameters.setdefault("input_resource", default_input_resource)
        self.parameters.setdefault("output_resource", default_output_resource)
        self.parameters.setdefault("replace_resource", default_replace_resource)

    def filter_data(self):
        for resource_descriptor in self.datapackage["resources"]:
            resource_data = next(self.resource_iterator)
            if self._is_matching_resource(resource_descriptor):
                yield self.filter_resource_data(resource_data, self.parameters)
            else:
                yield resource_data

    def filter_datapackage(self):
        if self.parameters["replace_resource"]:
            for resource in self.datapackage["resources"]:
                if self.input_resource_matcher.match(resource["name"]):
                    resource.update(self.output_resource_descriptor)
        else:
            self.datapackage["resources"].append(
                self.output_resource_descriptor)
        return self.datapackage

    def filter_resource_data(self, data, parameters):
        return self._resource_filter_param(data, parameters)

    def spew(self):
        spew(*self._get_spew_params())

    def get_stats(self):
        return {}

    @classmethod
    def main(cls, **kwargs):
        cls(ingest_response=ingest(), **kwargs).spew()

    def _get_spew_params(self):
        datapackage = self.filter_datapackage()
        return datapackage, self.filter_data(), self.get_stats()

    def _is_matching_resource(self, resource_descriptor):
        return resource_descriptor["name"] == self.parameters["output_resource"]
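# A minimal sketch of wiring up the class above (the filter function
# and resource names are hypothetical): main() ingests, applies the
# filter to the matching resource's rows, and spews the result.
def drop_empty_rows(rows, parameters):
    for row in rows:
        if any(v is not None for v in row.values()):
            yield row


if __name__ == '__main__':
    ResourceFilterProcessor.main(
        default_input_resource='raw',
        default_output_resource='clean',
        table_schema={'fields': []},
        resource_filter=drop_empty_rows)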