def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}, "sysnum_images": {}}
    fields = [{"name": "manifest_label", "type": "string"},
              {"name": "manifest_sysnum", "type": "string"},
              {"name": "resource_id", "type": "string"},
              {"name": "resource_type", "type": "string"},
              {"name": "resource_format", "type": "string"},
              {"name": "resource_width", "type": "number"},
              {"name": "resource_height", "type": "number"},
              {"name": "resource_filepath", "type": "string"},
              {"name": "url", "type": "string"},
              {"name": "downloaded", "type": "boolean"}]
    output_resources = []
    output_descriptors = []
    for resource, descriptor in zip(resources, datapackage["resources"]):
        logging.info("creating images archive for collection {}".format(descriptor["name"]))
        output_resources.append(get_resource(resource, aggregations, descriptor["name"]))
        output_descriptors.append({PROP_STREAMING: True,
                                   "name": descriptor["name"],
                                   "path": "{}.csv".format(descriptor["name"]),
                                   "schema": {"fields": fields}})
    datapackage["resources"] = output_descriptors
    spew(datapackage, output_resources, aggregations["stats"])
def main():
    parameters, datapackage, resources = ingest()
    for resource in datapackage["resources"]:
        if resource["name"] == "manifests":
            for field in resource["schema"]["fields"]:
                if field["name"] in ["attribution", "subject", "alternative_title", "title",
                                     "the_creator", "publisher", "label", "description"]:
                    field["es:type"] = "text"
                elif field["name"] in ["map", "sysnum", "language", "collection", "base"]:
                    field["es:type"] = "keyword"
                else:
                    field["es:type"] = "text"
    spew(datapackage, resources)
def main():
    parameters, datapackage, resources, stats = ingest() + (defaultdict(int),)
    max_year = parameters.get('max-year')
    file_path_template = parameters.get('file-path-template')
    missing_image = parameters.get('missing-image')
    datapackage['resources'] = []
    for resource in resources:
        for rownum, row in enumerate(resource):
            if max_year and row['year'] > max_year:
                stats['invalid year'] += 1
                continue
            if parameters.get('download-thumbnails'):
                if not row['thumb_url']:
                    stats['missing thumb_url'] += 1
                    continue
                name = 'rownum_{}'.format(rownum)
                if file_path_template:
                    photo_filename = file_path_template.format(rownum=rownum)
                    if not path.exists(photo_filename):
                        stats['full size photo missing'] += 1
                        continue
                    if missing_image:
                        if filecmp.cmp(photo_filename, missing_image, shallow=False):
                            stats['photo is missing_image photo'] += 1
                            continue
                stats['valid thumbnail'] += 1
                url = row['thumb_url']
                datapackage['resources'].append({
                    PROP_STREAMED_FROM: url,
                    'name': name,
                    'path': ['files/' + name + '.jpg'],
                })
            else:
                if row['image_url']:
                    url = parameters['image_url_prefix'] + row['image_url']
                    name = 'rownum_{}'.format(rownum)
                    datapackage['resources'].append({
                        PROP_STREAMED_FROM: url,
                        'name': name,
                        'path': ['files/' + name + '.png'],
                    })
    spew(datapackage, [], stats)
def __call__(self):
    url = self.parameters['url']
    limit_rows = self.parameters.get('limit-rows')
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency
    resource = self.parameters['resource']
    stream = self.parameters.get('stream', True)
    name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
    resource_index = resource if isinstance(resource, int) else None

    selected_resources = []
    found = False
    dp = datapackage.DataPackage(url)
    dp = self.process_datapackage(dp)
    for i, orig_res in enumerate(dp.resources):
        if resource_index == i or \
                (name_matcher is not None and
                 name_matcher.match(orig_res.descriptor.get('name'))):
            found = True
            desc = copy.deepcopy(orig_res.descriptor)
            if 'primaryKey' in desc.get('schema', {}):
                # Avoid duplication checks
                del orig_res.descriptor['schema']['primaryKey']
                orig_res.commit()
            desc[PROP_STREAMED_FROM] = orig_res.source
            self.dp['resources'].append(desc)
            if tabular(desc) and stream:
                desc[PROP_STREAMING] = True
                orig_res_iter = orig_res.iter(keyed=True)
                if limit_rows:
                    orig_res_iter = itertools.islice(orig_res_iter, limit_rows)
                selected_resources.append(orig_res_iter)
            else:
                desc[PROP_STREAMING] = False
    assert found, "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, selected_resources))
def test_spew_finalizer_runs_before_we_signal_that_were_done(self):
    '''Assert that the finalizer param is executed before spew is finished.

    We signal to other processors that we're done by writing an empty line
    to STDOUT. The finalizer parameter to spew() must be executed before
    that, as there can be processors that depend on us finishing our
    processing before they're able to run. For example, a processor that
    depends on `dump_to_zip` must wait until it has finished writing to the
    local filesystem.
    '''
    datapackage = {}
    resources_iterator = iter([])

    with mock.patch('datapackage_pipelines.wrapper.wrapper.stdout') as stdout_mock:
        def finalizer():
            last_call_args = stdout_mock.write.call_args_list[-1]
            assert last_call_args != mock.call('\n')

        spew(datapackage, resources_iterator, finalizer=finalizer)
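# Hedged usage sketch for the behaviour the test above documents; it is not
# part of the test suite. The processor body and the name `notify_downstream`
# are illustrative assumptions; only spew()'s `finalizer` keyword comes from
# the source.
import logging

from datapackage_pipelines.wrapper import ingest, spew

parameters, datapackage, resources = ingest()


def notify_downstream():
    # Runs after all rows have been flushed but before spew() writes the
    # empty line that tells dependent processors we are done.
    logging.info('processing finished; dependent pipelines may now run')


spew(datapackage, resources, finalizer=notify_downstream)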
def main():
    parameters, datapackage, resources = ingest()
    resources = list(resources)
    stats = {}
    mk_individuals = _get_resource_from_datapackage(datapackage, resources, 'mk_individual')
    votes = _get_resource_from_datapackage(datapackage, resources, 'vote_rslts_kmmbr_shadow')
    mk_individuals = list(mk_individuals)
    stats["total votes"] = 0
    datapackage["resources"][1]["schema"]["fields"].append({
        "name": "mk_individual_id",
        "type": "integer"
    })
    spew(datapackage,
         [mk_individuals, get_resource(votes, mk_individuals, stats)],
         stats)
def main():
    params, datapackage, res_iter = ingest()
    key = params['key']
    url_key = params['url-key']
    resource_name = params['resource-name']
    resource = {
        'name': resource_name,
        PROP_STREAMING: True,
        'path': 'data/{}.csv'.format(resource_name),
        'schema': {
            'fields': [
                {'name': '{}_Number'.format(key), 'type': 'string'},
                {'name': '{}_Name'.format(key), 'type': 'string'},
                {'name': '{}_Registration_Date'.format(key), 'type': 'string'},
            ]
        }
    }
    datapackage['resources'].append(resource)
    spew(datapackage, [get_entities(url_key)])
def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}, "sysnum_images": {}}
    resources = list(resources)
    for descriptor in datapackage["resources"]:
        descriptor["schema"] = get_resource_row_image_schema()

    def get_resource(resource, descriptor):
        init_resource_stats(aggregations["stats"], descriptor)
        bucket = get_bucket(*list(map(os.environ.get, [
            "GCS_SERVICE_ACCOUNT_B64_KEY", "GCS_IMAGES_BUCKET", "GCS_PROJECT"
        ])))
        queue, threads = None, None
        if not os.environ.get("GCS_DISABLE_DOWNLOAD"):
            numthreads = int(os.environ.get("DOWNLOAD_IMAGES_NUM_THREADS", "5"))
            poolsize = 20 if numthreads < 50 else int(numthreads / 2)
            logging.info("poolsize={}, numthreads={}".format(poolsize, numthreads))
            queue, threads = start_downloader(
                poolsize, numthreads,
                worker=partial(download_blob, bucket, aggregations, descriptor["name"]),
                max_retries=5)
        yield from get_images(resource, aggregations, descriptor["name"], bucket, queue)
        if queue:
            stop_downloader(queue, threads,
                            int(os.environ.get("DOWNLOAD_IMAGES_NUM_THREADS", "5")))

    def get_resources():
        for resource, descriptor in zip(resources, datapackage["resources"]):
            yield get_resource(resource, descriptor)

    spew(datapackage, get_resources(), aggregations["stats"])
def main():
    parameters, datapackage, resources, stats = ingest() + ({},)
    datapackage['resources'] = [{
        PROP_STREAMING: True,
        "name": "zio",
        "path": "zio.csv",
        "schema": {
            "fields": [{"name": "description", "type": "string"},
                       {"name": "year", "type": "year"},
                       {"name": "id", "type": "string"},
                       {"name": "thumb_url", "type": "string"},
                       {"name": "details_url", "type": "string"},
                       {"name": "scrape_year", "type": "year"},
                       {"name": "page_number", "type": "integer"},
                       {"name": "rownum", "type": "integer"},
                       {"name": "error", "type": "string"}]
        }
    }]
    spew(datapackage, [get_resource(parameters)], stats)
def main():
    parameters, datapackage, resources = ingest()
    stats = {}
    aggregations = {"stats": stats}
    jinja_env = get_jinja_env()
    committees = {}
    committees_descriptor = None
    for descriptor, resource in zip(datapackage["resources"], resources):
        if descriptor["name"] == "kns_committee":
            committees_descriptor = descriptor
            for committee in resource:
                committees[int(committee["CommitteeID"])] = committee
        elif descriptor["name"] == "kns_committeesession":
            build_meeting_templates(resource, committees, jinja_env, descriptor,
                                    committees_descriptor, aggregations)
    build_committee_templates(jinja_env, committees, committees_descriptor, aggregations)
    build_committee_knessets_list_template(jinja_env, committees, aggregations)
    build_committees_index_template(jinja_env, committees, aggregations)
    spew({}, [], stats)
def main():
    params, dp, res_iter = ingest()
    dp['name'] = 'category-explanations'
    dp['resources'] = [{
        'name': 'category-explanations',
        'path': 'data/category-explanations.csv',
        PROP_STREAMING: True,
        'schema': {
            'fields': [
                {'name': 'budget_code', 'type': 'string'},
                {'name': 'explanation', 'type': 'string'},
                {'name': 'explanation_short', 'type': 'string'},
                {'name': 'source', 'type': 'string'},
            ]
        }
    }]
    spew(dp, [
        itertools.chain(
            process_file('category-explanations.md', 'explanation'),
            process_file('category-explanations-short.md', 'explanation_short'),
        )
    ])
def main():
    parameters, datapackage, resources, stats = ingest() + ({},)
    bills = {}
    israel_law_bill_ids = {}
    for bill in next(resources):
        bill['law_ministry_ids'] = []
        bills[bill['BillID']] = bill
        if bill['IsraelLawID']:
            for israel_law_id in bill['IsraelLawID']:
                israel_law_bill_ids.setdefault(israel_law_id, [])
                israel_law_bill_ids[israel_law_id].append(bill['BillID'])
    for law_ministry in next(resources):
        for bill_id in israel_law_bill_ids.get(law_ministry['IsraelLawID'], []):
            if law_ministry['GovMinistryID'] not in bills[bill_id]['law_ministry_ids']:
                bills[bill_id]['law_ministry_ids'].append(law_ministry['GovMinistryID'])
    gov_ministries = {}
    for gov_ministry in next(resources):
        gov_ministries[gov_ministry['GovMinistryID']] = gov_ministry['Name']
    for bill in bills.values():
        ministry_names = set()
        for ministry_id in bill['law_ministry_ids']:
            ministry_names.add(gov_ministries[ministry_id])
        bill['law_ministry_names'] = ', '.join(ministry_names)
    datapackage["resources"] = [datapackage['resources'][0]]
    fields = [{'name': 'law_ministry_ids', 'type': 'array'},
              {'name': 'law_ministry_names', 'type': 'string'}]
    datapackage["resources"][0]['schema']['fields'] += fields
    spew(datapackage, [bills.values()], stats)
def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}}
    collections = {}
    for descriptor, resource in zip(datapackage["resources"], resources):
        if descriptor["name"] == "collections":
            collections = list(resource)
        else:
            list(resource)
    datapackage["resources"] = []
    for collection in collections:
        datapackage["resources"].append({
            PROP_STREAMING: True,
            "name": collection["id"],
            "path": "{}.csv".format(collection["id"]),
            "schema": {
                "fields": [{"name": "label", "type": "string"},
                           {"name": "manifest", "type": "string"}]
            }
        })

    def get_resource(collection):
        for member in json.loads(requests.get(collection["json"]).content)["members"]:
            yield {"label": member["label"], "manifest": member["@id"]}

    spew(datapackage,
         (get_resource(collection) for collection in collections),
         aggregations["stats"])
def main():
    parameters, datapackage, resources, stats = ingest() + (defaultdict(int),)
    resource_names = [r['name'] for r in datapackage['resources']]
    datapackage['resources'] = [r for r in datapackage['resources']
                                if r['name'] == 'foi_offices']
    datapackage['resources'][0]['schema']['fields'] += [
        {'name': 'update_type', 'type': 'string'},
        {'name': 'update_title', 'type': 'string'},
        {'name': 'entity_id', 'type': 'string'},
    ]

    def get_resources():
        existing_entities = {}
        for resource_name, resource in zip(resource_names, resources):
            if resource_name == 'existing_entities':
                for row in get_existing_entities(resource, existing_entities, stats):
                    pass
            elif resource_name == 'foi-groups-matching':
                for row in get_foi_groups_matching(resource, existing_entities, stats):
                    pass
            elif resource_name == 'foi_offices':
                yield get_foi_offices_resource(resource, existing_entities, stats,
                                               parameters.get('dry-run'))
            else:
                for row in resource:
                    pass

    spew(datapackage, get_resources(), stats)
def main():
    params, dp, res_iter = ingest()
    os.makedirs('/var/datapackages/sitemaps', exist_ok=True)
    kind = params['kind']
    db_table = params['db-table']
    doc_id = params['doc-id']
    page_title = params['page-title']
    if not dp.get('resources'):
        dp['resources'] = [{
            'name': 'sitemaps',
            'path': 'sitemaps.csv',
            PROP_STREAMING: True,
            'schema': {
                'fields': [{'name': 'filename', 'type': 'string'}]
            }
        }]
    spew(dp, [process_rows(res_iter, kind, db_table, doc_id, page_title)])
def main():
    parameters, dp, res_iter = ingest()
    connection_string = get_connection_string()
    existing_ids = None
    resource_name = parameters['resource-name']
    input_key_fields = parameters['key-fields']
    input_hash_fields = parameters.get('hash-fields')
    for res in dp['resources']:
        if resource_name == res['name']:
            if input_hash_fields is None:
                input_hash_fields = set(f['name'] for f in res['schema']['fields'])
            input_hash_fields = set(input_hash_fields) - set(input_key_fields)
            if len(input_hash_fields.intersection(STATUS_FIELD_NAMES)) == 0:
                res['schema']['fields'].extend(STATUS_FIELDS)
            db_key_fields = parameters.get('db-key-fields', input_key_fields)
            db_hash_fields = parameters.get('db-hash-fields', input_hash_fields)
            existing_ids = get_all_existing_ids(connection_string,
                                                parameters['db-table'],
                                                db_key_fields, db_hash_fields)
            break
    assert existing_ids is not None
    logging.info('Found %d ids', len(list(existing_ids.keys())))
    spew(dp, process_resources(res_iter, resource_name, input_key_fields,
                               input_hash_fields, existing_ids))
def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}}
    resources = list(resources)
    for descriptor in datapackage["resources"]:
        descriptor["schema"]["fields"] = [{"name": "doc_id", "type": "string"},
                                          {"name": "system_number", "type": "string"},
                                          {"name": "manifest_url", "type": "string"},
                                          {"name": "manifest_file", "type": "string"}]

    def get_resources():
        for resource, descriptor in zip(resources, datapackage["resources"]):
            yield (parse_row(row) for row in resource)

    spew(datapackage, get_resources(), aggregations["stats"])
    def process_single(resource):
        counter = 0
        nones = dict((c, 0) for c in threshold_columns)
        for row in resource:
            counter += 1
            for column in threshold_columns:
                value = row.get(column)
                if is_empty(value):
                    nones[column] += 1
            for column in allowed_value_columns:
                value = row.get(column)
                if not is_empty(value) and value != 'unknown':
                    if value not in allowed_values[column]:
                        raise ValueError(
                            '%s: Got %r whereas allowed values for this column are %r' %
                            (column, value, allowed_values[column]))
            yield row
        for column in threshold_columns:
            ratio_percent = 100 - (100 * nones[column]) // counter
            if ratio_percent < thresholds[column]:
                raise ValueError(
                    '%s: Got %d good values (out of %d), which is %d%% (below the threshold of %d%%)' %
                    (column, counter - nones[column], counter, ratio_percent, thresholds[column]))

    for resource_ in resources:
        yield process_single(resource_)


spew(datapackage_, process(resources_))
import json
import logging

from datapackage_pipelines.wrapper import spew, ingest
# assumed import path for LazyJsonLine; adjust to wherever it lives in your setup
from datapackage_pipelines.utilities.extended_json import LazyJsonLine

parameters, datapackage, res_iter = ingest()

res_name = parameters.get('resource', datapackage['resources'][0]['name'])


def show_sample(res):
    # log the first 10 rows of the matched resource, passing every row through
    logging.info('SAMPLE OF LINES from %s', res.spec['name'])
    for i, row in enumerate(res):
        if i < 10:
            if isinstance(row, LazyJsonLine):
                logging.info('#%s: %s', i, row._evaluate())
            else:
                logging.info('#%s: %r', i, row)
        yield row


def process_resources(res_iter_):
    for res in res_iter_:
        logging.info('? from %s', res.spec['name'])
        if res.spec['name'] == res_name:
            yield show_sample(res)
        else:
            yield res


logging.info(json.dumps(datapackage, indent=2))
spew(datapackage, process_resources(res_iter))
def run_shell_command(command_line_args):
    logging.info('Subprocess: "' + ' '.join(command_line_args) + '"')
    try:
        command_line_process = subprocess.Popen(
            command_line_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        with command_line_process.stdout:
            log_subprocess_output(command_line_process.stdout)
    except (OSError, subprocess.CalledProcessError) as exception:
        logging.info('Exception occurred: ' + str(exception))
        logging.info('Subprocess failed')
        raise exception
    else:
        # no exception was raised
        logging.info('Subprocess finished')
        return True


parameters, datapackage, res_iter = ingest()
run_shell_command(parameters["arguments"])
spew(datapackage, res_iter)
    fields_to_keep = [f['name'] for f in fields]
    fields.extend(extra_keys)
    fields.append(extra_value)
    resource['schema']['fields'] = fields
    return unpivot_fields_without_regex, fields_to_keep


def unpivot(rows, fields_to_unpivot_, fields_to_keep_):
    for row in rows:
        for unpivot_field in fields_to_unpivot_:
            new_row = copy.deepcopy(unpivot_field['keys'])
            for field in fields_to_keep_:
                new_row[field] = row[field]
            new_row[extra_value['name']] = row.get(unpivot_field['name'])
            yield new_row


def process_resources(resource_iterator_, fields_to_unpivot, fields_to_keep):
    for resource in resource_iterator_:
        spec = resource.spec
        if not resources.match(spec['name']):
            yield resource
        else:
            yield unpivot(resource, fields_to_unpivot, fields_to_keep)


old_fields, keep_fields = process_datapackage(datapackage)

spew(datapackage, process_resources(resource_iterator, old_fields, keep_fields))
import logging
import time

from datapackage_pipelines.wrapper import spew, ingest


def filter_resource(resource, sleep_seconds):
    yield from resource
    time.sleep(sleep_seconds)


def filter_resources(datapackage, resources, parameters):
    input_resource_name = parameters.get("resource")
    # sleep 2 seconds between resources by default
    sleep_seconds = float(parameters.get("sleep-seconds", 2))
    for resource_descriptor, resource in zip(datapackage["resources"], resources):
        if not input_resource_name or input_resource_name == resource_descriptor["name"]:
            logging.info("throttling resource {}: sleep_seconds={}".format(
                resource_descriptor["name"], sleep_seconds))
            yield filter_resource(resource, sleep_seconds)
        else:
            yield resource


parameters, datapackage, resources = ingest()
spew(datapackage, filter_resources(datapackage, resources, parameters))
userid = gobble.user.User().id
for dirpath, dirnames, filenames in os.walk('.'):
    if dirpath == '.':
        continue
    if FILENAME in filenames:
        pipeline = yaml.load(open(os.path.join(dirpath, FILENAME)))
        dataset_name = \
            pipeline[list(pipeline.keys())[0]]['pipeline'][0]['parameters']['datapackage']['name']
        url_base = 'http://datastore.openspending.org/{}/{}'.format(userid, dataset_name)
        resp = requests.get(url_base + '/datapackage.json')
        if resp.status_code == 200:
            datapackage_json = resp.json()
            if len(country) > 0:
                if datapackage_json.get('geo', {}).get('country_code', 'xx').lower() != country:
                    continue
            resource = datapackage_json['resources'][0]
            resource_url = '{}/{}'.format(url_base, resource['path'])
            resources.append({
                PROP_STREAMED_FROM: resource_url,
                'path': PATH_PLACEHOLDER,
                'name': dataset_name,
                'encoding': 'utf-8',
                'delimiter': ',',
                'doublequote': True,
                'quotechar': '"',
                'skipinitialspace': False
            })
            logging.error(resource_url)

spew(datapackage, [])
def main(cls):
    from datapackage_pipelines.wrapper import ingest, spew
    spew(*cls(*ingest()).spew())
"""Map the raw columns names to fiscal fields where indicated.""" import logging from datapackage_pipelines.wrapper import ingest, spew from common.utilities import get_fiscal_field_names def update_datapackage(datapackage): valid_fiscal_fields = get_fiscal_field_names() for resource in datapackage['resources']: for field in resource['schema']['fields']: if field['maps_to'] in valid_fiscal_fields: field['name'] = field.pop('maps_to') else: logging.info('Unmapped = %s', field['name']) return _, datapackage_, resources_ = ingest() spew(update_datapackage(datapackage_), resources_)
    return html


def filter_resource(descriptor, data, stats):
    for row in data:
        stats[descriptor["name"]] += 1
        yield row


def filter_resources(datapackage, resources, parameters, stats):
    tables = []
    for resource_descriptor, resource_data in zip(datapackage["resources"], resources):
        schema = resource_descriptor["schema"]
        stats[resource_descriptor["name"]] = 0
        tables.append(_get_schema_table(resource_descriptor["name"],
                                        schema["fields"], schema["primaryKey"]))
        yield filter_resource(resource_descriptor, resource_data, stats)
    html = """<html><head><meta charset="UTF-8"></head><body>{tables}</body></html>""".format(
        tables="".join(tables))
    save_schema = parameters.get("save-schema", DEFAULT_SAVE_SCHEMA)
    if save_schema:
        save_schema_html = DEFAULT_SAVE_SCHEMA.format(table_name=datapackage["name"], ext="html")
        save_schema_json = DEFAULT_SAVE_SCHEMA.format(table_name=datapackage["name"], ext="json")
        s3 = object_storage.get_s3()
        object_storage.write(s3, parameters["bucket"], save_schema_html, html,
                             public_bucket=True)
        object_storage.write(s3, parameters["bucket"], save_schema_json,
                             json.dumps(datapackage["resources"], indent=2, ensure_ascii=False),
                             public_bucket=True)


stats = {}
spew(datapackage, filter_resources(datapackage, resources, parameters, stats), stats)
    for key, lookup in lookup_tables.items():
        if row[key] in lookup:
            row[key] = lookup[row[key]]
        else:
            warning('%s mapped to None because no alias was found', row[key])
            row[key] = None
    return row


def build_lookup_tables(mappings):
    """Build the lookup tables."""

    def lookup_table(mapping):
        for key, aliases in mapping.items():
            for alias in aliases:
                yield alias, key

    return {
        mapping['field']: dict(lookup_table(mapping['mapping']))
        for mapping in mappings
    }


if __name__ == '__main__':
    parameters, _, resources = ingest()
    lookup_tables_ = build_lookup_tables(parameters['mappings'])
    new_resources = process(resources, map_aliases, lookup_tables=lookup_tables_)
    spew(_, new_resources)
            mk_attendance.append(mk_aggs)
        except Exception:
            logging.exception("Failed to process mk_individual name {}".format(mk_name))
            raise
    else:
        raise Exception("Failed to find mk_individual name for mk_individual id {}".format(
            mk_individual["mk_individual_id"]))

meeting_aggs_fields = [
    {"name": "knesset_num", "type": "integer"},
    {"name": "committee_id", "type": "integer"},
    {"name": "committee_name", "type": "string"},
    {"name": "meeting_start_date", "type": "datetime"},
    {"name": "meeting_topics", "type": "string"},
]

datapackage["resources"] = []
datapackage["resources"] += [{
    "name": "errors",
    "path": "errors.csv",
    PROP_STREAMING: True,
    "schema": {"fields": [{"name": "error", "type": "string"}]},
}]
datapackage["resources"] += [{
    PROP_STREAMING: True,
    "name": "mk_attendance",
    "path": "mk_attendance.csv",
    "schema": {"fields": meeting_aggs_fields + [
        {"name": "mk_id", "type": "integer"},
        {"name": "mk_name", "type": "string"},
        {"name": "mk_membership_committee_names", "type": "string"},
        {"name": "mk_faction_id", "type": "integer"},
        {"name": "mk_faction_name", "type": "string"},
    ]},
}]

spew(datapackage, [errors, mk_attendance])
def get_project_urls():
    """Return the complete list of project URLs."""
    counter = 0
    paths = []
    while True:
        counter += 1
        project = PAGINATION_URL.format(counter=counter)
        response = session.get(project)
        if response.text:
            doc = fromstring(response.content)
            more_links = doc.findall(PROJECT_URLS_XPATH)
            more_paths = list(map(lambda x: x.get('href'), more_links))
            paths.extend(more_paths)
            info('Collected %s urls on page %s', len(more_paths), counter)
        else:
            return paths


if __name__ == '__main__':
    _, datapackage, _ = ingest()
    project_paths = get_project_urls()
    project_rows = scrape_projects(project_paths)
    spew(datapackage, [project_rows])
"""A processor to inject constant values into the data.""" from datapackage_pipelines.wrapper import ingest, spew row_count = 0 def process_rows(prefix, rows): global row_count for row in rows: row['internal_id'] = '{}-{}'.format(prefix, row_count) yield row row_count += 1 def process(prefix, resources): for resource in resources: yield process_rows(prefix, resource) if __name__ == '__main__': """Ingest, process and spew out.""" parameters_, datapackage_, resources_ = ingest() spew(datapackage_, process(parameters_['prefix'], resources_))
"""Grab the source description and convert it into a datapackage""" import json import logging from datapackage_pipelines.wrapper import ingest, spew from common.config import JSON_FORMAT from common.utilities import get_fiscal_datapackage if __name__ == '__main__': _, datapackage, resources = ingest() fiscal_datapackage = get_fiscal_datapackage(source=datapackage) fiscal_datapackage_as_json = json.dumps(fiscal_datapackage, **JSON_FORMAT) logging.debug('Loaded fiscal datapackage:\n%s', fiscal_datapackage_as_json) spew(fiscal_datapackage, resources)
"""The template for writing PDF and web scrapers.""" from datapackage_pipelines.wrapper import ingest, spew from logging import debug def scrape_beneficiaries(**params): """Return a generator of beneficiaries. Each beneficiary is a dictionary whose keys match the fields described in source.description.yaml. Parameters come from pipeline-specs.yaml. """ debug('%s', **params) beneficiaries = [ {'field1': 'foo', 'field2': 'spam'}, {'field1': 'bar', 'field2': 'eggs'}, ] for beneficiary in beneficiaries: yield beneficiary if __name__ == '__main__': parameters, datapackage, _ = ingest() rows = scrape_beneficiaries(**parameters) spew(datapackage, [rows])
import logging

import arrow
from arrow.parser import ParserError

from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process


def parse_currencies(row):
    """Parse the date columns into dates; unparseable values become None."""
    date_columns = (
        'Datum van laatste bijwerking',
        'Einddatum',
        'Begindatum'
    )
    for key in date_columns:
        try:
            row[key] = arrow.get(row[key], 'DD.MM.YYYY HH:mm')
        except ParserError:
            if row[key] != '0000-00-00 00:00:00':
                message = 'Could not parse %s to a date, returning None'
                logging.warning(message, row[key])
            row[key] = None
    return row


if __name__ == '__main__':
    parameters, datapackage_, resources = ingest()
    new_resources_ = process(resources, parse_currencies)
    spew(datapackage_, new_resources_)
        'fields': [{
            'name': h,
            'type': 'string'
        } for h in headers]
    }
    datapackage['resources'].append(resource)


def process_resources(res_iter, datapackage, domain):

    def get_latest_row(first):
        latest_row = None
        my_rows = []
        for row in first:
            if row['domain'] == domain and row['source'] == 'discourse':
                latest_row = row
            my_rows.append(row)
        return latest_row, iter(my_rows)

    if len(datapackage['resources']):
        if datapackage['resources'][0]['name'] == 'latest-project-entries':
            latest_row, latest_iter = get_latest_row(next(res_iter))
            yield latest_iter
        else:
            latest_row = None
            yield from res_iter
    yield discourse_collector(domain, latest_row)


spew(datapackage, process_resources(res_iter, datapackage, domain))
    yield field_['name'], converters[field_['type']]


converter = dict(get_fiscal_types())
dump = {k: v.__name__ for k, v in converter.items()}
logging.debug('Fiscal type casting: \n%s', json.dumps(dump, indent=4))


def cast_values(row):
    """Cast values to fiscal types."""
    for key, value in row.items():
        if value:
            try:
                if value is None or (type(value) is str and len(value.strip()) == 0):
                    row[key] = None
                else:
                    row[key] = converter[key](value)
            except (ValueError, arrow.parser.ParserError):
                message = 'Could not cast %s = %s to %s, returning None' % \
                    (key, row[key], converter[key])
                logging.warning(message)
                assert False, message
    return row


if __name__ == '__main__':
    _, datapackage, resources = ingest()
    new_resources = process(resources, cast_values)
    spew(datapackage, new_resources)
        all_attendee_names = set()
        for attendee_names in (meeting["mks"], meeting["invitees"],
                               meeting["legal_advisors"], meeting["manager"]):
            if attendee_names and len(attendee_names) > 0:
                for attendee_name in attendee_names:
                    if type(attendee_name) == str:
                        all_attendee_names.add(attendee_name)
                    else:
                        all_attendee_names.add(attendee_name["name"])
        attended_mk_individual_ids = set()
        for attendee_name in all_attendee_names:
            for mk_individual in filter(
                    lambda mk: meeting["KnessetNum"] in mk["knesset_nums"],
                    map(get_mk_individual, mk_individuals)):
                if meeting["KnessetNum"] in mk_individual["knesset_nums"]:
                    name_equals, name_in = False, False
                    for name in mk_individual["mk_names"]:
                        if name == attendee_name:
                            name_equals += 1
                        if name in attendee_name:
                            name_in += 1
                    if name_equals or name_in:
                        attended_mk_individual_ids.add(mk_individual["mk_individual_id"])
        meeting["attended_mk_individual_ids"] = list(attended_mk_individual_ids)
        yield meeting


datapackage["resources"] = [datapackage["resources"][1]]
datapackage["resources"][0]["schema"]["fields"] += [
    {"name": "attended_mk_individual_ids", "type": "array"}]

spew(datapackage, [get_resource()])
        args = filename, format_data_sample(stream)
        info('Concatenated %s:\n%s', *args)

    info('Done concatenating %s files', nb_files)


def assemble_fiscal_datapackage():
    """Assemble the fiscal datapackage for the concatenated dataset."""
    with open(FISCAL_METADATA_FILE) as stream:
        fdp = yaml.load(stream.read())
    with open(FISCAL_MODEL_FILE) as stream:
        fdp['model'] = yaml.load(stream.read())
    with open(FISCAL_SCHEMA_FILE) as stream:
        fdp['resources'][0]['schema'] = yaml.load(stream.read())
    message = 'Fiscal datapackage: \n%s'
    info(message, format_to_json(fdp))
    return fdp


if __name__ == '__main__':
    parameters, datapackage, _ = ingest()
    datapackage = assemble_fiscal_datapackage()
    datasets = collect_local_datasets(**parameters)
    resource = concatenate(datasets, **parameters)
    spew(datapackage, [resource])
        collated_field_name)] = {
            'fields': inner_fields
        }


def val(v):
    if isinstance(v, Decimal):
        v = float(v)
    elif isinstance(v, date):
        v = v.isoformat()
    return v


def process_resource(res):
    for row in res:
        inner = dict((k, val(v)) for k, v in row.items() if k not in key)
        outer = dict((k, v) for k, v in row.items() if k in key)
        outer[collated_field_name] = inner
        yield outer


def process_resources(res_iter_):
    for res in res_iter_:
        if resource_matcher.match(res.spec['name']):
            yield process_resource(res)
        else:
            yield res


spew(dp, process_resources(res_iter))
            if index != 1:
                yield index, headers, values

    @staticmethod
    def _fixed_points(rows):
        """Convert floats to 2-digit fixed precision strings"""
        for index, headers, values in rows:
            values = [
                '%.2f' % value if type(value) is float else value
                for value in values
            ]
            yield index, headers, values


XLSXIngestor = XLSIngestor


def ingest_resources(datapackage):
    """Ingest each resource one by one into the pipeline."""
    for resource in datapackage['resources']:
        ingestor = BaseIngestor.load(resource)
        yield ingestor.rows


if __name__ == '__main__':
    _, datapackage_, _ = ingest()
    resources = list(ingest_resources(datapackage_))
    spew(datapackage_, resources)
    headers = [hdr_num, hdr_name, hdr_reg_date]
    for data in datums:
        yield dict(zip(headers, treat(data)))


resource = {
    'name': resource_name,
    PROP_STREAMING: True,
    'path': 'data/{}.csv'.format(resource_name),
    'schema': {
        'fields': [
            {'name': hdr_num, 'type': 'string'},
            {'name': hdr_name, 'type': 'string'},
            {'name': hdr_reg_date, 'type': 'string'},
        ]
    }
}

datapackage['resources'].append(resource)

spew(datapackage, [get_entities()])
"""A processor to concatenate resources that have a common set of fields.""" from datapackage_pipelines.wrapper import ingest, spew def concatenate(resources): """Concatenate multiple resources.""" for resource in resources: for row in resource: yield row if __name__ == '__main__': _, datapackage, resources_ = ingest() single_resource = concatenate(resources_) datapackage['resources'] = [datapackage['resources'][0]] spew(datapackage, [single_resource])
resources = ResourceMatcher(parameters.get('resources'), datapackage)
ignore_missing = parameters.get('ignore-missing', False)
limit_rows = parameters.get('limit-rows', -1)

new_resource_iterator = []
for resource in datapackage['resources']:
    if streamable(resource):
        url = resource[PROP_STREAMED_FROM]

        name = resource['name']
        if not resources.match(name):
            continue

        path = get_path(resource)
        if path is None or path == PATH_PLACEHOLDER:
            path = os.path.join('data', name + '.csv')
            resource['path'] = path

        resource[PROP_STREAMING] = True

        rows = stream_reader(resource, url, ignore_missing or url == "",
                             limit_rows, resource.pop('http_headers', None))

        new_resource_iterator.append(rows)

    elif streaming(resource):
        new_resource_iterator.append(next(resource_iterator))

spew(datapackage, new_resource_iterator)
"UnitText1_En": doc["_source"]["UnitText1"].get("En") if doc["_source"].get("UnitText1") else "", "UnitText1_He": doc["_source"]["UnitText1"].get("He") if doc["_source"].get("UnitText1") else "", "Header_En": doc["_source"]["Header"].get("En") if doc["_source"].get("Header") else "", "Header_He": doc["_source"]["Header"].get("He") if doc["_source"].get("Header") else "", }) yield from filtered_row else: break datapackage = {"name": "_", "resources": [{"name": "es_data", "path": "es_data.csv", PROP_STREAMING: True, "schema": {"fields": [{"name": "index", "type": "string"}, {"name": "doc_type", "type": "string"}, {"name": "doc_id", "type": "string"}, {"name": "UnitId", "type": "string"}, {"name": "RightsCode", "type": "string"}, {"name": "RightsDesc", "type": "string"}, {"name": "StatusDesc", "type": "string"}, {"name": "DisplayStatusDesc", "type": "string"}, {"name": "UnitType", "type": "string"}, {"name": "Slug_En", "type": "string"}, {"name": "Slug_He", "type": "string"}, {"name": "UnitText1_En", "type": "string"}, {"name": "UnitText1_He", "type": "string"}, {"name": "Header_En", "type": "string"}, {"name": "Header_He", "type": "string"}]}}]} spew(datapackage, get_resources(), stats)
def main():
    parameters, dp, res_iter = ingest()
    connection_string = get_connection_string()
    existing_ids = None
    resource_name = parameters['resource-name']
    input_key_fields = parameters['key-fields']
    input_hash_fields = parameters.get('hash-fields')
    prefix = parameters.get('prefix', '')
    STATUS_FIELDS = [
        {'name': prefix + '__last_updated_at', 'type': 'datetime'},
        {'name': prefix + '__last_modified_at', 'type': 'datetime'},
        {'name': prefix + '__created_at', 'type': 'datetime'},
        {'name': prefix + '__is_new', 'type': 'boolean'},
        {'name': prefix + '__is_stale', 'type': 'boolean'},
        {'name': prefix + '__staleness', 'type': 'integer'},
        {'name': prefix + '__next_update_days', 'type': 'integer'},
        {'name': prefix + '__hash', 'type': 'string'},
    ]
    STATUS_FIELD_NAMES = list(f['name'] for f in STATUS_FIELDS)
    for res in dp['resources']:
        if resource_name == res['name']:
            if input_hash_fields is None:
                input_hash_fields = set(f['name'] for f in res['schema']['fields'])
            input_hash_fields = set(input_hash_fields) - set(input_key_fields)
            if len(input_hash_fields.intersection(STATUS_FIELD_NAMES)) == 0:
                res['schema']['fields'].extend(STATUS_FIELDS)
            input_hash_fields = set(input_hash_fields) - set(STATUS_FIELD_NAMES)
            db_key_fields = parameters.get('db-key-fields', input_key_fields)
            db_hash_fields = parameters.get('db-hash-fields', input_hash_fields)
            existing_ids = get_all_existing_ids(
                connection_string, parameters['db-table'], db_key_fields,
                [
                    prefix + '__last_updated_at',
                    prefix + '__next_update_days',
                    prefix + '__hash',
                    prefix + '__created_at',
                ]
            )
            break
    assert existing_ids is not None
    logging.info('Found %d ids', len(list(existing_ids.keys())))
    spew(dp,
         process_resources(res_iter, resource_name, input_key_fields,
                           input_hash_fields, existing_ids, prefix))
def process(resources):

    def process_single(resource):
        counter = 0
        nones = dict((c, 0) for c in threshold_columns)
        for row in resource:
            counter += 1
            for column in threshold_columns:
                value = row.get(column)
                if is_empty(value):
                    nones[column] += 1
            for column in allowed_value_columns:
                value = row.get(column)
                if not is_empty(value) and value != 'unknown':
                    if value not in allowed_values[column]:
                        raise ValueError(
                            '%s: Got %r whereas allowed values for this column are %r' %
                            (column, value, allowed_values[column]))
            yield row
        for column in threshold_columns:
            ratio_percent = 100 - (100 * nones[column]) // counter
            if ratio_percent < thresholds[column]:
                raise ValueError(
                    '%s: Got %d good values (out of %d), which is %d%% (below the threshold of %d%%)' %
                    (column, counter - nones[column], counter, ratio_percent, thresholds[column]))

    for resource_ in resources:
        yield process_single(resource_)


spew(datapackage_, process(resources_))
# ISSUES & PR COUNTS base_issue_url = '{}/search/issues?q=repo:{}'.format(base_url, current_repo_name) row['open_prs'] = _get_issue_count_for_request( '{}%20state:open%20is:pr'.format(base_issue_url)) row['closed_prs'] = _get_issue_count_for_request( '{}%20state:closed%20is:pr'.format(base_issue_url)) row['open_issues'] = _get_issue_count_for_request( '{}%20state:open%20is:issue'.format(base_issue_url)) row['closed_issues'] = _get_issue_count_for_request( '{}%20state:closed%20is:issue'.format(base_issue_url)) resource = {'name': name, 'path': 'data/{}.csv'.format(name)} # Temporarily set all types to string, will use `set_types` processor in # pipeline to assign correct types resource['schema'] = { 'fields': [{ 'name': h, 'type': 'string' } for h in row.keys()] } datapackage['resources'].append(resource) resource_content.append(row) spew(datapackage, itertools.chain(res_iter, [resource_content]))
def main(cls):
    # can be used like this in datapackage processor files:
    # if __name__ == '__main__':
    #     Processor.main()
    spew(*cls(*ingest()).spew())
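# Hedged illustration of the classmethod pattern above; the class name, the
# `source` parameter and the added column are invented for the example. It only
# assumes what the pattern itself implies: the constructor receives the
# ingest() triple and spew() returns the arguments for the wrapper's spew().
from datapackage_pipelines.wrapper import ingest, spew


class AddSourceColumn:

    def __init__(self, parameters, datapackage, resources):
        self.parameters = parameters
        self.datapackage = datapackage
        self.resources = resources

    def process_resources(self):
        # pass every resource through, adding a constant 'source' value to each row
        for resource in self.resources:
            yield (dict(row, source=self.parameters.get('source', '')) for row in resource)

    def spew(self):
        # datapackage, resource iterator and stats, unpacked into the wrapper's spew()
        return self.datapackage, self.process_resources(), {}

    @classmethod
    def main(cls):
        spew(*cls(*ingest()).spew())


if __name__ == '__main__':
    AddSourceColumn.main()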