def main(workspace):
    """Mirror each dataset's resources to S3 (under the "gp-survey" prefix)
    and rewrite the resource URLs in metadata.json."""
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    u = Uploader("gp-survey")

    for dataset in datasets:
        print "Processing", dataset['name']

        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url

    u.close()
    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))

    return 0
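
# The snippets in this file share a few helpers that are defined elsewhere.
# The sketch below only illustrates the assumed behaviour, not the project's
# actual implementation: filename_for_resource() picks a local filename for a
# resource dict, and download_file() streams a URL to disk (the use of
# requests here is an assumption).
import requests


def filename_for_resource(resource):
    # Hypothetical: use the last path segment of the source URL so the file
    # gets a stable name inside DATA_DIR.
    return resource['url'].rstrip('/').split('/')[-1] or 'resource'


def download_file(url, path):
    # Hypothetical: stream the remote file to `path` in chunks.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(str(path), 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
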
def add_metadata_to_pp_datasets():
    """Tag each dataset "GP"/"Population", mirror its source files to S3 and
    rewrite dataset.metadata.json in place."""
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ["GP", "Population"]
        title = metadata['title']
        #begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        #ends = datetime.date(begins.year + 1, 3, 31)
        #metadata['coverage_start_date'] = begins.isoformat()
        #metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'Quarterly'

        print metadata['title']
        u = Uploader("pp")

        for resource in metadata['sources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
            resource['name'] = resource['url'].split('/')[-1]
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return
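
# Several helpers above and below iterate over datasets() and unpack
# (directory, metadata_file, metadata) tuples. The generator is not shown in
# these snippets; this is only a guess at its shape, assuming one
# sub-directory per dataset under the module-level DATA_DIR, each holding a
# dataset.metadata.json file.
import json
import os


def datasets():
    for name in os.listdir(str(DATA_DIR)):
        directory = DATA_DIR / name
        metadata_file = directory / 'dataset.metadata.json'
        if not os.path.isfile(str(metadata_file)):
            continue
        with open(str(metadata_file)) as f:
            metadata = json.load(f)
        yield directory, metadata_file, metadata
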
def add_metadata_to_ascof_datasets():
    """Tag the ASCOF datasets, derive financial-year coverage dates from the
    title, mirror the resources to S3 and rewrite the metadata."""
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['ASCOF', 'Adult Social Care Outcomes Framework']
        title = metadata['title']
        # Titles carry a financial year such as "2013-14"; coverage runs from
        # 1 April of the first year to 31 March of the following year.
        match = re.search(r'(\d{4})-(\d{2})', title)
        begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        ends = datetime.date(begins.year + 1, 3, 31)
        metadata['coverage_start_date'] = begins.isoformat()
        metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'yearly'
        metadata['title'] = 'ASCOF - Adult Social Care Outcomes Framework, England -{0}-{1}'.format(match.group(1), match.group(2))

        u = Uploader("ascof")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return
def add_metadata_to_phof_datasets():
    """Set the PHOF metadata (tags, title, coverage, source), mirror the
    resources to S3 and rewrite dataset.metadata.json."""
    metadata_file = DATA_DIR / 'dataset.metadata.json'
    metadata = metadata_file.json_load()

    metadata['tags'] = ['PHOF', 'Public Health Outcomes Framework']
    metadata['title'] = 'PHOF - Public Health Outcomes Framework'
    metadata['frequency'] = 'yearly'
    metadata['summary'] = PHOF_SUMMARY
    metadata['source'] = 'http://www.phoutcomes.info/public-health-outcomes-framework'

    metadata['coverage_start_date'] = '2000-01-01'
    metadata['coverage_end_date'] = '2013-12-31'

    u = Uploader("phof")
    for resource in metadata['resources']:
        filename = filename_for_resource(resource)
        path = DATA_DIR / filename

        download_file(resource['url'], path)
        print "Uploading to S3"
        url = u.upload(path)
        resource['url'] = url
    u.close()


    metadata_file.truncate()
    metadata_file << json.dumps(metadata, indent=2)
    return
def add_metadata_to_qof_datasets():
    """Tag the indicator datasets "QOF", mirror their source files to S3 and
    write the updated metadata back to nhsof_metadata_indicators.json."""
    u = Uploader("nshof")

    f = os.path.join(DATA_DIR, "nhsof_metadata_indicators.json")
    datasets = json.load(open(f))

    for metadata in datasets:
        metadata['tags'] = ['QOF', 'Quality Outcomes Framework']
        title = metadata['title']
        #metadata['frequency'] = 'yearly'
        #metadata['title'] = 'QOF - National Quality Outcomes Framework - {0}-{1}'.format(match.group(1), match.group(2))

        resources = []
        for resource in metadata['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]
            resource['url_type'] = ''

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resources.append(resource)
        metadata['resources'] = resources

    u.close()

    json.dump(
        datasets,
        open(os.path.join(DATA_DIR, "nhsof_metadata_indicators.json"), "w"))
    return
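
# Every snippet uses the same Uploader(prefix) / upload(path) / close()
# pattern. The class itself is not included here; the sketch below shows one
# plausible shape using boto's S3 API (the bucket name and key layout are
# assumptions, not the project's real values).
import os

import boto


class Uploader(object):
    def __init__(self, prefix, bucket_name='example-datasets'):
        self.prefix = prefix
        self.conn = boto.connect_s3()
        self.bucket = self.conn.get_bucket(bucket_name)

    def upload(self, path):
        # Store the file under <prefix>/<filename> and return a public URL.
        key_name = '{0}/{1}'.format(self.prefix, os.path.basename(str(path)))
        key = self.bucket.new_key(key_name)
        key.set_contents_from_filename(str(path))
        key.make_public()
        return key.generate_url(expires_in=0, query_auth=False)

    def close(self):
        # Nothing to release in this sketch; the real uploader may clean up
        # connections or flush queued work here.
        pass
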
def main(workspace):
    """Mirror the ODS resources to S3, unpack any ZIP resources, and write
    the updated dataset.metadata.json."""
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    metadata_file = DATA_DIR / 'dataset.metadata.json'
    datasets = json.load(open(metadata_file, 'r'))

    u = Uploader("ods")
    unzipper = Unzipper()
    for dataset in datasets:
        has_zip = False

        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url

            if resource['format'].upper() == 'ZIP':
                has_zip = True

        if has_zip:
            print "Processing ZIP files in dataset"
            print '*' * 30
            unzipper.unzip(dataset)
            print '*' * 30
    u.close()

    json.dump(datasets, open(metadata_file, 'w'))
def add_metadata_to_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['Mental Health']

        u = Uploader("mhmds")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return
def retrieve_qof_datasets(datasets):
    results = []

    u = Uploader("qof")
    for dataset in datasets:
        print dataset['title']
        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
        results.append(dataset)

    u.close()

    metadata_file = DATA_DIR / 'dataset.metadata.json'
    if metadata_file:
        metadata_file.truncate()
    metadata_file << json.dumps(results, indent=2)
def main(workspace):
    DATA_DIR = ffs.Path(workspace)
    datasets = json.load(open(DATA_DIR / 'ccgois_indicators.json'))

    u = Uploader("ccgois")
    for dataset in datasets:
        resources = []
        for resource in dataset['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resources.append(resource)
        dataset['resources'] = resources
    u.close()

    json.dump(datasets, open(DATA_DIR / 'ccgois_indicators.json', 'w'))
def main(workspace):
    """Add the "Statistics" tag to every dataset, mirror the downloadable
    resources to S3 and rewrite metadata.json."""
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    tag_list = ["Statistics"]
    u = Uploader("stats")

    for dataset in datasets:
        print "Processing", dataset['name']

        print "..adding tags"
        tags = dataset.get('tags', [])
        for t in tag_list:
            if not t in tags:
                tags.append(t)
        dataset['tags'] = tags

        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            try:
                download_file(resource['url'], path)
            except Exception:
                # Skip resources that cannot be downloaded.
                continue
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resource['url_type'] = ''  # make sure we zap historical uploads

    u.close()
    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))

    return 0
def download_and_hash_file(dataset_name, url):
    folder = DATA_DIR / dataset_name
    folder.mkdir()

    hash_of_url = hashlib.sha224(url).hexdigest()
    download_file(url, os.path.join(folder, hash_of_url))
def main(workspace):
    """Copy each dataset from the DGU source organisation into the target
    catalogue: mirror the resources to S3, then create or update the dataset
    records with coverage and frequency extras."""
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'

    org = dgu.action.organization_show(id=TARGET_ORGANISATION)

    if not _org_existsp(TARGET_ORGANISATION):
        catalogue.action.organization_create(
            name=org['name'],
            title=org['title'],
            description=org['description'],
            image_url=org['image_display_url'])

    print "Found {0} datasets on source".format(len(org['packages']))

    for package in org['packages']:
        print 'uploading', package['title'].encode('utf8')
        dataset_dir = DATA_DIR / package['name']
        # Get the dataset from DGU
        dataset = dgu.action.package_show(id=package['name'])
        del dataset['id']

        # Set the new owning organisation
        dataset['owner_org'] = org['name']

        u = Uploader("hqip")
        for resource in dataset['resources']:
            resource['name'] = resource['description']
            if resource['format'] == "HTML":
                continue
            if resource['url'].startswith('hhttps'):
                resource['url'] = resource['url'].replace('hhttps', 'https')

            if 'cache_filepath' in resource:
                del resource['cache_filepath']
            if 'tracking_summary' in resource:
                del resource['tracking_summary']

            filename = filename_for_resource(resource)

            datafile = dataset_dir / filename
            print 'downloading', resource['url'], 'as', datafile

            try:
                download_file(resource['url'], datafile)
                print "Uploading to S3"
                url = u.upload(datafile)
                resource['url'] = url
            except Exception:
                print '***' * 30
                print "Failed to download: ", resource['url']
        u.close()

        # Add a nice tag so we can find them all again
        dataset['tags'].append({'name': 'HQIP'})
        print 'Owner org is', org['name']
        try:
            extras = []
            if 'temporal_coverage-from' in dataset:
                extras.append(
                    dict(key='coverage_start_date',
                         value=format_date(dataset['temporal_coverage-from'])))
            if 'temporal_coverage-to' in dataset:
                extras.append(
                    dict(key='coverage_end_date',
                         value=format_date(dataset['temporal_coverage-to'])))
            if 'update_frequency' in dataset:
                extras.append(
                    dict(key='frequency', value=dataset['update_frequency']))

            new_dataset = Dataset.create_or_update(
                name=dataset['name'],
                title=dataset['title'],
                state='active',
                visibility='private',
                license_id='uk-ogl',
                notes=dataset['notes'],
                origin=dataset['url'],
                tags=dataset['tags'],
                resources=dataset['resources'],
                owner_org=org['name'],
                extras=extras)
            print "Created {}".format(dataset['name'])
        except ValueError as e:
            print 'skipping because error', e
            continue
        except ValidationError:
            print "Failed to upload {}".format(dataset['name'])
            raise