示例#1
0
from dataflows import Flow, update_resource
from datapackage_pipelines.wrapper import ingest
from datapackage_pipelines.utilities.flow_utils import spew_flow


def flow(parameters):
    """Build a single-step Flow that applies metadata updates to resources.

    ``metadata`` is popped from *parameters* so it is not forwarded further;
    ``resources`` selects which resources to update (None = default selection,
    per dataflows' update_resource).
    """
    metadata = parameters.pop('metadata', {})
    target_resources = parameters.get('resources', None)
    return Flow(update_resource(target_resources, **metadata))


if __name__ == '__main__':
    # Standard datapackage-pipelines entry point: ingest the pipeline
    # context, build the flow from its parameters, and spew the result.
    with ingest() as pipeline_ctx:
        spew_flow(flow(pipeline_ctx.parameters), pipeline_ctx)
示例#2
0
import sys
from importlib import import_module
from datapackage_pipelines.wrapper import ingest
from datapackage_pipelines.utilities.flow_utils import spew_flow

# Dynamically load and run a user-supplied flow module. The framework passes
# the module location via the reserved '__path' / '__flow' parameters, which
# are popped here so the target flow never sees them.
with ingest() as ctx:
    parameters, datapackage, resources = ctx

    # Make the flow's directory importable, then import it by module name.
    sys.path.append(parameters.pop('__path'))
    flow_module = import_module(parameters.pop('__flow'))
    # The loaded module must expose flow(parameters, datapackage, resources, stats).
    flow = flow_module.flow(parameters, datapackage, resources, ctx.stats)

    spew_flow(flow, ctx)
        self.stats = {}

    def process_resource(self, resource):
        """Stream rows through unchanged while dumping them to a JSON file.

        The output file mirrors the resource's own path under
        ``self.out_path``, with the original extension replaced by ``.json``.
        Rows are yielded as-is, so this processor is transparent to the
        pipeline.
        """
        # NOTE(review): assumes resource.res.infer() returns a descriptor
        # dict — verify against the datapackage-pipelines resource wrapper.
        resource_path = resource.res.infer().get('path', '.')

        out_file = os.path.join(self.out_path, resource_path)
        out_file, _ = os.path.splitext(out_file)
        out_dir = os.path.dirname(out_file)
        if out_dir:
            # exist_ok=True replaces the previous exists()/makedirs/except-
            # OSError-pass sequence: it closes the check-then-create race and
            # no longer swallows genuine errors (e.g. permission denied).
            os.makedirs(out_dir, exist_ok=True)

        with RollingJSONFile(out_file + '.json', self.max_rows) as f:
            for row in resource:
                f.write(row)
                yield row


def flow(parameters: dict, stats: dict):
    """Create a Flow that dumps rows as JSON files under ``out-path``.

    Records the expected ``datapackage.json`` location under the DPP stats
    key. ``out-path`` is popped from *parameters*; ``max-rows`` is read but
    left in place.
    """
    out_path = parameters.pop('out-path', '.')
    max_rows = parameters.get('max-rows', 0)
    dpp_stats = stats.setdefault(STATS_DPP_KEY, {})
    dpp_stats[STATS_OUT_DP_URL_KEY] = os.path.join(out_path, 'datapackage.json')
    return Flow(DumpToJson(out_path, max_rows))


if __name__ == '__main__':
    # Pipeline entry point: build the dump flow from the ingested context.
    with ingest() as context:
        spew_flow(flow(context.parameters, context.stats), context)
示例#4
0
from dataflows import Flow
from datapackage_pipelines.wrapper import ingest
from datapackage_pipelines.utilities.flow_utils import spew_flow

from datapackage_pipelines_budgetkey.processors.data_gov_il_resource import flow
from datapackage_pipelines_budgetkey.common.google_chrome import google_chrome_driver


def batch_flow(parameters):
    """Chain the data_gov_il_resource flow over every entry in
    ``parameters['batch']``, sharing one google_chrome_driver instance
    across all of them."""
    driver = google_chrome_driver()
    steps = [flow(dict(**entry, gcd=driver)) for entry in parameters['batch']]
    return Flow(*steps)


if __name__ == '__main__':
    # Pipeline entry point: build the batched flow from the ingested context.
    with ingest() as pipeline_ctx:
        spew_flow(batch_flow(pipeline_ctx.parameters), pipeline_ctx)