Example No. 1
def publish_indicators(start_from=0):
    indicatorfile = DATA_DIR / 'indicators.json'
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    logging.info('Processing {} indicators'.format(len(indicators)))
    logging.info('Starting from record {}'.format(start_from))
    for indicator in indicators[start_from:]:
        logging.info('Processing {}'.format(indicator['title']))
        logging.info('ID: {}'.format(indicator['unique identifier'].lower()))
        try:
            resources = [
                dict(description=s['description'],
                     name=s['url'].split('/')[-1],
                     format=s['filetype'],
                     upload=dc.fh_for_url(s['url']))
                for s in indicator['sources']
            ]
            dc.Dataset.create_or_update(
                name=indicator['unique identifier'].lower(),
                title=indicator['title'],
                state='active',
                licence_id='ogl',
                notes=indicator['definition'],
                url='https://indicators.ic.nhs.uk/webview/',
                tags=dc.tags(*indicator['keyword(s)']),
                resources=resources,
                owner_org='hscic')
        except Exception as ex:
            logging.error(ex)
    return
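Note: `dc.fh_for_url` is not defined on this page. A minimal sketch of what such a helper might look like, assuming it simply streams the remote file with `requests` and returns a file-like object for CKAN's upload field (the real `dc` module may differ):

import requests

def fh_for_url(url):
    # Hypothetical stand-in for dc.fh_for_url: stream the remote file and
    # return a file-like object that the CKAN upload field can read from.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    response.raw.decode_content = True  # transparently decompress gzip/deflate
    return response.raw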
Example No. 2
def load_statistic(dataset, directory):
    if '2015' not in dataset['title']:
        print 'Skipping', dataset['title'].encode('utf8'), dataset['name'].encode('utf8')
        return
    print 'Creating', dataset['title'].encode('utf8'), dataset['name'].encode('utf8')
    try:
        extras = []
        if dataset.get('coverage_start_date', ''):
            extras.append(dict(key='coverage_start_date', value=dataset['coverage_start_date']))
        if dataset.get('coverage_end_date', ''):
            extras.append(dict(key='coverage_end_date', value=dataset['coverage_end_date']))
        if dataset.get('frequency', ''):
            extras.append(dict(key='frequency', value=dataset['frequency']))

        dc.Dataset.create_or_update(
            name=dataset['name'],
            title=dataset['title'],
            state='active',
            license_id='uk-ogl',
            notes=dataset['notes'],
            origin=dataset['origin'],
            tags=dc.tags(*dataset['tags']),
            resources=dataset["resources"],
            owner_org='nhs-england',
            extras=extras,
        )
        return True
    except Exception as e:
        print "ERROR: Problem updating/creating dataset - {}".format(dataset['name'])
        print e
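Example No. 2 repeats the same guard once per optional field. The pattern generalises to a small helper; a sketch, using the key names from the code above:

def build_extras(dataset,
                 keys=('coverage_start_date', 'coverage_end_date', 'frequency')):
    # Collect CKAN-style extras for whichever optional keys are present
    # and non-empty on the scraped dataset record.
    return [dict(key=k, value=dataset[k]) for k in keys if dataset.get(k)]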
Example No. 3
def publish_ods():
    """
    Publish the ODS metadata and its resources to CKAN.
    """
    metadatafile = DATA_DIR/'ods.json'
    metadata = metadatafile.json_load()
    for dataset in metadata:
        resources = [
            dict(
                description=s['description'],
                name=s['url'].split('/')[-1],
                format=dc.filetype(s['url']),
                upload=dc.disk_fh_for_url(s['url'])          
            )
            for s in dataset['resources']
        ]
        print resources
        dc.Dataset.create_or_update(
            name=dataset['title'].lower().replace(' ', '-'),
            title=dataset['title'],
            state='active',
            licence_id='ogl',
            notes=dataset['description'],
            url='http://systems.hscic.gov.uk/data/ods',
            tags=dc.tags('ODS', 'Organisation', 'Organization'),
            resources=resources,
            owner_org='hscic-ods'
        )
        break
    return 
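`dc.filetype` presumably derives a format label from the URL; a minimal sketch, assuming only the path's extension is inspected:

import posixpath
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

def filetype(url):
    # Hypothetical stand-in for dc.filetype:
    # 'http://host/data/file.csv?v=1' -> 'CSV'
    ext = posixpath.splitext(urlparse(url).path)[1].lstrip('.')
    return ext.upper() or 'UNKNOWN'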
Example No. 4
def load_ascof():
    for directory, metadata_file, metadata in datasets():
        resources = [
            dict(description=r['description'],
                 name=r['url'].split('/')[-1],
                 format=r['format'],
                 url=r['url']
                 #upload=open(str(directory/r['url'].split('/')[-1]), 'r')
                 ) for r in metadata['resources']
        ]
        slug = slugify.slugify(metadata['title']).lower()
        print 'Creating', metadata['title'], slug
        dc.Dataset.create_or_update(
            name=slug,
            title=metadata['title'],
            state='active',
            license_id='uk-ogl',
            notes=metadata['summary'],
            origin=metadata['source'],
            tags=dc.tags(*metadata['tags']),
            resources=resources,
            owner_org='hscic',
            extras=[
                dict(key='coverage_start_date',
                     value=metadata['coverage_start_date']),
                dict(key='coverage_end_date',
                     value=metadata['coverage_end_date']),
                dict(key='frequency', value=metadata['frequency']),
                dict(key='publication_date',
                     value=metadata['publication_date'])
            ])
    return
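The CKAN `name` here is a slug derived from the title. A quick usage example of the `slugify.slugify(...).lower()` pattern used above (the title literal is just an illustration):

import slugify

title = 'Adult Social Care Outcomes Framework (ASCOF) 2013-14'
print(slugify.slugify(title).lower())
# -> adult-social-care-outcomes-framework-ascof-2013-14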
def load_dataset(dataset, directory):
    print 'Creating', dataset['title'], dataset['name']
    try:
        extras = []
        if dataset.get('coverage_start_date', ''):
            extras.append(dict(key='coverage_start_date', value=dataset['coverage_start_date']))
        if dataset.get('coverage_end_date', ''):
            extras.append(dict(key='coverage_end_date', value=dataset['coverage_end_date']))
        if dataset.get('frequency', ''):
            extras.append(dict(key='frequency', value=dataset['frequency']))

        dc.Dataset.create_or_update(
            name=dataset['name'],
            title=dataset['title'],
            state='active',
            license_id='uk-ogl',
            notes=dataset['notes'],
            origin=dataset['origin'],
            tags=dc.tags(*dataset['tags']),
            resources=dataset["resources"],
            owner_org='hscic',
            extras=extras,
            coverage_start_date=dataset.get('coverage_start_date', ''),
            coverage_end_date=dataset.get('coverage_end_date', ''),
        )
    except Exception as e:
        print "ERROR: Problem updating/creating dataset - {}".format(dataset['name'])
        import traceback
        traceback.print_exc()
        print ".{}.{}.".format(dataset['coverage_start_date'], dataset['coverage_end_date'])
        sys.exit(1)
def publish_ods():
    """
    Publish the ODS metadata and its resources to CKAN.
    """
    metadatafile = DATA_DIR / 'ods.json'
    metadata = metadatafile.json_load()
    for dataset in metadata:
        resources = [
            dict(description=s['description'],
                 name=s['url'].split('/')[-1],
                 format=dc.filetype(s['url']),
                 upload=dc.disk_fh_for_url(s['url']))
            for s in dataset['resources']
        ]
        print resources
        dc.Dataset.create_or_update(name=dataset['title'].lower().replace(
            ' ', '-'),
                                    title=dataset['title'],
                                    state='active',
                                    licence_id='ogl',
                                    notes=dataset['description'],
                                    url='http://systems.hscic.gov.uk/data/ods',
                                    tags=dc.tags('ODS', 'Organisation',
                                                 'Organization'),
                                    resources=resources,
                                    owner_org='hscic-ods')
        break
    return
Example No. 7
def load_dataset(dataset, directory):
    print 'Creating', dataset['title'].encode('utf8'), dataset['name'].encode(
        'utf8')
    try:
        extras = []
        if dataset.get('coverage_start_date', ''):
            extras.append(
                dict(key='coverage_start_date',
                     value=dataset['coverage_start_date']))
        if dataset.get('coverage_end_date', ''):
            extras.append(
                dict(key='coverage_end_date',
                     value=dataset['coverage_end_date']))

        dc.Dataset.create_or_update(name=dataset['name'],
                                    title=dataset['title'],
                                    state='active',
                                    license_id='uk-ogl',
                                    notes=dataset['notes'],
                                    origin=dataset['origin'],
                                    tags=dc.tags(*dataset['tags']),
                                    resources=dataset["resources"],
                                    owner_org='nhs-england',
                                    extras=extras,
                                    frequency='Annually')
        return True
    except Exception as e:
        print "ERROR: Problem updating/creating dataset - {}".format(
            dataset['name'])
        print e
Example No. 8
def load_pp():
    for directory, metadata_file, metadata in datasets():
        resources = [
            dict(
                description=r['description'],
                name=r['name'],
                format=r['filetype'].upper(),
                url=r['url'],
                url_type='',
            )
            for r in metadata['sources']
        ]
        slug = slugify.slugify(metadata['title']).lower()
        print 'Creating', metadata['title'], slug
        dc.Dataset.create_or_update(
            name=slug,
            title=metadata['title'],
            state='active',
            license_id='uk-ogl',
            notes=metadata['summary'],
            origin=metadata['source'],
            tags=dc.tags(*metadata['tags']),
            resources=resources,
            owner_org='hscic',
            frequency=metadata['frequency'],
            extras=[
                #dict(key='coverage_start_date', value=metadata['coverage_start_date']),
                #dict(key='coverage_end_date', value=metadata['coverage_end_date']),
                #dict(key='frequency', value=metadata['frequency']),
                dict(key='publication_date', value=metadata['publication_date'])
            ]
        )
    return
Example No. 9
def load_qof():
    for metadata in datasets():
        resources = [
            dict(
                description=r['description'],
                name=r['name'],
                format=r['format'],
                url=r['url']
            )
            for r in metadata['resources']
        ]
        print 'Creating', metadata['title'], "with {} resources".format(len(metadata['resources']))
        dc.Dataset.create_or_update(
            name=slugify.slugify(metadata['title']).lower(),
            title=metadata['title'],
            state='active',
            license_id='uk-ogl',
            notes=metadata['summary'],
            origin=metadata['source'],
            tags=dc.tags(*metadata['tags']),
            resources=resources,
            owner_org='hscic',
            frequency=metadata['frequency'],
            extras=[
                dict(key='coverage_start_date', value=metadata['coverage_start_date']),
                dict(key='coverage_end_date', value=metadata['coverage_end_date']),
                dict(key='publication_date', value=metadata['publication_date'])
            ]
        )
        print "... done"
    return
Example No. 10
def publish_choose_and_book():
    """
    Publish the Choose and Book reports to CKAN.
    """
    # Assumed metadata file name; the original never defines `metadata`,
    # so this follows the pattern of the other publish_* loaders.
    metadatafile = DATA_DIR / 'choose_and_book.json'
    metadata = metadatafile.json_load()
    for dataset in metadata:
        resources = [
            dict(
                description=s['description'],
                name=s['description'],
                format=s['filetype'],
                upload=dc.disk_fh_for_url(s['url'])
            )
            for s in dataset['resources']
        ]
        dc.Dataset.create_or_update(
            name=dataset['title'].lower().replace(' ', '-'),
            title=dataset['title'],
            state='active',
            licence_id='ogl',
            notes=dataset['description'],
            url='http://www.chooseandbook.nhs.uk/staff/bau/reports',
            tags=dc.tags(*dataset['tags']),
            resources=resources,
            owner_org='choose-and-book'
        )
    return 
Example No. 11
def load_ascof():
    for directory, metadata_file, metadata in datasets():
        resources = [
            dict(
                description=r['description'],
                name=r['url'].split('/')[-1],
                format=r['format'],
                url=r['url']
                #upload=open(str(directory/r['url'].split('/')[-1]), 'r')
            )
            for r in metadata['resources']
        ]
        slug = slugify.slugify(metadata['title']).lower()
        print 'Creating', metadata['title'], slug
        dc.Dataset.create_or_update(
            name=slug,
            title=metadata['title'],
            state='active',
            license_id='uk-ogl',
            notes=metadata['summary'],
            origin=metadata['source'],
            tags=dc.tags(*metadata['tags']),
            resources=resources,
            owner_org='hscic',
            extras=[
                dict(key='coverage_start_date', value=metadata['coverage_start_date']),
                dict(key='coverage_end_date', value=metadata['coverage_end_date']),
                dict(key='frequency', value=metadata['frequency']),
                dict(key='publication_date', value=metadata['publication_date'])
            ]
        )
    return
def publish_indicators(start_from=0):
    indicatorfile = DATA_DIR/'indicators.json'
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    logging.info('Processing {} indicators'.format(len(indicators)))
    logging.info('Starting from record {}'.format(start_from))
    for indicator in indicators[start_from:]:
        logging.info('Processing {}'.format(indicator['title']))
        logging.info('ID: {}'.format(indicator['unique identifier'].lower()))
        try:
            resources = [
                dict(
                    description=s['description'],
                    name=s['url'].split('/')[-1],
                    format=s['filetype'],
                    upload=dc.fh_for_url(s['url'])
                )
                for s in indicator['sources']
            ]
            dc.Dataset.create_or_update(
                name=indicator['unique identifier'].lower(),
                title=indicator['title'],
                state='active',
                licence_id='ogl',
                notes=indicator['definition'],
                url='https://indicators.ic.nhs.uk/webview/',
                tags=dc.tags(*indicator['keyword(s)']),
                resources=resources,
                owner_org='hscic'
            )
        except Exception as ex:
            logging.error(ex)
    return
Example No. 13
def load_pp():
    for directory, metadata_file, metadata in datasets():
        resources = [
            dict(
                description=r['description'],
                name=r['name'],
                format=r['filetype'].upper(),
                url=r['url'],
                url_type='',
            ) for r in metadata['sources']
        ]
        slug = slugify.slugify(metadata['title']).lower()
        print 'Creating', metadata['title'], slug
        dc.Dataset.create_or_update(
            name=slug,
            title=metadata['title'],
            state='active',
            license_id='uk-ogl',
            notes=metadata['summary'],
            origin=metadata['source'],
            tags=dc.tags(*metadata['tags']),
            resources=resources,
            owner_org='hscic',
            frequency=metadata['frequency'],
            extras=[
                #dict(key='coverage_start_date', value=metadata['coverage_start_date']),
                #dict(key='coverage_end_date', value=metadata['coverage_end_date']),
                #dict(key='frequency', value=metadata['frequency']),
                dict(key='publication_date',
                     value=metadata['publication_date'])
            ])
    return
Example No. 14
def publish_datasets(start_from=0):
    datasetfile = DATA_DIR / 'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} datasets'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))
    for dataset in datasets[start_from:]:
        logging.info('Processing {}'.format(dataset['title']))
        logging.info('ID: {}'.format(dataset['id']))
        try:
            resources = [
                dict(description=s['description'],
                     name=s['url'].split('/')[-1],
                     format=s['filetype'],
                     upload=dc.fh_for_url(s['url']))
                for s in dataset['sources']
            ]
            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\nKEY FACTS:\n==========\n\n' + dataset['key_facts']
            name = 'hscic_dataset_{}'.format(dataset['id'])
            dc.Dataset.create_or_update(name=name,
                                        title=dataset['title'],
                                        state='active',
                                        licence_id='ogl',
                                        notes=notes,
                                        url=dataset['source'],
                                        tags=dc.tags(*dataset['keywords']),
                                        resources=resources,
                                        owner_org='hscic')
        except Exception as ex:
            logging.error(ex)
    return
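`DATA_DIR` behaves like an `ffs.Path`: the `/` operator joins path segments and `json_load()` parses the file as JSON. A standard-library equivalent, for readers without `ffs`:

import io
import json
import os

def json_load(path):
    # Equivalent of ffs.Path.json_load(): read the file as UTF-8 JSON.
    with io.open(path, encoding='utf-8') as fh:
        return json.load(fh)

datasets = json_load(os.path.join('data', 'datasets.json'))  # path is illustrative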
Example No. 15
def load_nhsof(datasets):
    counter = 0

    # There are only 35 datasets from the scrape, so why were we skipping the first 43?
    for metadata in datasets:  #[43:]:
        counter += 1
        resources = []
        for r in metadata['sources']:
            resources.append({
                'description': r['description'],
                'name': r['name'],
                'format': r['format'],
                'url': r['url'],
            })

        print "Resources ready for upload"
        metadata['title'] = 'NHSOF - ' + metadata['title']
        name = slugify.slugify(metadata['title']).lower()[:99]
        print u'Creating dataset: {}'.format(name)
        try:
            dc.Dataset.create_or_update(
                name=name,
                title=metadata['title'],
                state='active',
                license_id='uk-ogl',
                notes=metadata['description'],
                origin='https://indicators.ic.nhs.uk/webview/',
                tags=dc.tags(*metadata['keyword(s)']),
                resources=resources,
                #frequency=['Other', ],
                owner_org='hscic',
                extras=[
                    dict(key='frequency', value='Other'),
                    dict(key='coverage_start_date',
                         value=metadata['coverage_start_date']),
                    dict(key='coverage_end_date',
                         value=metadata['coverage_end_date']),
                    dict(key='domain', value=metadata['domain']),
                    dict(key='origin', value='HSCIC'),
                    dict(key='next_version_due',
                         value=metadata['next version due']),
                    dict(key='HSCIC_unique_id',
                         value=metadata['unique identifier']),
                    dict(key='homepage', value=metadata['homepage']),
                    dict(key='status', value=metadata['status']),
                    dict(key='language', value=metadata['language']),
                    dict(key='release_date',
                         value=metadata['current version uploaded'])
                ])
        except Exception:
            print u"Failed to create {}".format(
                slugify.slugify(metadata['title']).lower()[:99])
    return counter
Example No. 16
def publish_indicators(start_from=0):
    indicatorfile = DATA_DIR/'indicators.json'
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    amount = len(indicators)
    logging.info('Processing {} indicators'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
    for indicator in indicators[start_from:10]:  # NB: hard-capped at the first 10 records
        logging.info('{} of {}'.format(start_from, amount))
        start_from += 1
        try:
            logging.info('Processing {}'.format(indicator['title']))
            logging.info('ID: {}'.format(indicator['unique identifier'].lower()))
            resources = [
                dict(
                    description=s['description'],
                    name=s['url'].split('/')[-1],
                    format=s['filetype'],
                    upload=dc.fh_for_url(s['url'])
                )
                for s in indicator['sources']
            ]
            name = 'hscic_indicator_{}'.format(indicator['unique identifier'].lower())
            # Metadata specified by NHSEngland identified in comments...
            dc.Dataset.create_or_update(
                name=name, # Unique ID
                title=indicator['title'], #title
                notes=indicator['definition'], # description
                tags=dc.tags(*indicator['keyword(s)']), # tags
                extras=[
                    {'key': 'Public Access Level',
                     'value': 'Public',},
                    {'key': 'Data Quality Assurance',
                     'value': 'False'},
                    {'key': 'Release Date',
                     'value': indicator['current version uploaded'],},
                    {'key': 'Status',
                     'value': 'Live',},
                ],
                state='active',
                licence_id='ogl',
                url='https://indicators.ic.nhs.uk/webview/',
                resources=resources,
                groups=[
                    {'name': 'indicators'},
                ],
                owner_org='hscic' # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
Example No. 17
def load_nhsof(datasets):
    counter = 0

    # There are only 35 datasets from the scrape, so why were we skipping the first 43?
    for metadata in datasets: #[43:]:
        counter += 1
        resources = []
        for r in metadata['sources']:
            resources.append({
                'description': r['description'],
                'name': r['name'],
                'format': r['format'],
                'url': r['url'],
            })

        print "Resources ready for upload"
        metadata['title'] = 'NHSOF - ' + metadata['title']
        name = slugify.slugify(metadata['title']).lower()[:99]
        print u'Creating dataset: {}'.format(name)
        try:
            dc.Dataset.create_or_update(
                name=name,
                title=metadata['title'],
                state='active',
                license_id='uk-ogl',
                notes=metadata['description'],
                origin='https://indicators.ic.nhs.uk/webview/',
                tags=dc.tags(*metadata['keyword(s)']),
                resources=resources,
                #frequency=['Other', ],
                owner_org='hscic',
                extras=[
                    dict(key='frequency', value='Other'),
                    dict(key='coverage_start_date', value=metadata['coverage_start_date']),
                    dict(key='coverage_end_date', value=metadata['coverage_end_date']),
                    dict(key='domain', value=metadata['domain']),
                    dict(key='origin', value='HSCIC'),
                    dict(key='next_version_due', value=metadata['next version due']),
                    dict(key='HSCIC_unique_id', value=metadata['unique identifier']),
                    dict(key='homepage', value=metadata['homepage']),
                    dict(key='status', value=metadata['status']),
                    dict(key='language', value=metadata['language']),
                    dict(key='release_date', value=metadata['current version uploaded'])

                ]
            )
        except Exception:
            print u"Failed to create {}".format(slugify.slugify(metadata['title']).lower()[:99])
    return counter
Example No. 18
def load_ccgois(datasets):
    for metadata in datasets:
        resources = [
            dict(description=r['description'],
                 name=r['name'],
                 format=r['filetype'],
                 url=r['url']) for r in metadata['resources']
        ]

        print [r['name'] for r in metadata['resources']]

        metadata['title'] = u'CCGOIS - {}'.format(metadata['title'])
        metadata['name'] = make_name_from_title(metadata['title'])
        print u'Creating {}'.format(metadata['name'])
        dc.Dataset.create_or_update(
            name=metadata['name'],
            title=metadata['title'],
            state='active',
            license_id='uk-ogl',
            notes=metadata['description'],
            origin='https://indicators.ic.nhs.uk/webview/',
            tags=dc.tags(*metadata['keyword(s)']),
            resources=resources,
            #frequency=[metadata['frequency'], ],
            owner_org='hscic',
            extras=[
                dict(key='frequency', value=metadata.get('frequency', '')),
                dict(key='coverage_start_date',
                     value=metadata['coverage_start_date']),
                dict(key='coverage_end_date',
                     value=metadata['coverage_end_date']),
                dict(key='domain', value=metadata['domain']),
                dict(key='origin', value='HSCIC'),
                dict(key='next_version_due',
                     value=metadata['next version due']),
                dict(key='nhs_OF_indicators',
                     value=metadata['nhs_of_indicators']),
                dict(key='HSCIC_unique_id',
                     value=metadata['unique identifier']),
                dict(key='homepage', value=metadata['homepage']),
                dict(key='status', value=metadata['status']),
                dict(key='language', value=metadata['language']),
                dict(key='assurance_level', value=metadata['assurance_level']),
                dict(key='release_date',
                     value=metadata['current version uploaded'])
            ])
    return
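`make_name_from_title` is not shown on this page; given how the result is used, a plausible sketch mirrors the slugify-and-truncate pattern from the other examples (hypothetical, the real helper may differ):

import slugify

def make_name_from_title(title):
    # CKAN dataset names must be lowercase and at most 100 characters.
    return slugify.slugify(title).lower()[:99]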
Example No. 19
def load_phe():

    metadata = json.load(open(DATA_DIR / 'dataset.metadata.json'))

    resources = [
        dict(description=r['description'],
             name=r['name'],
             format=r['format'],
             url=r['url']) for r in metadata['resources']
    ]

    extras = [
        dict(key='frequency', value=metadata['frequency']),
    ]
    if 'publication_date' in metadata:
        extras.append(
            dict(key='publication_date', value=metadata['publication_date']))
    extras.append(
        dict(key='coverage_start_date',
             value=metadata.get('coverage_start_date', '')))
    extras.append(
        dict(key='coverage_end_date', value=metadata.get('coverage_end_date')))

    print extras

    print 'Creating', metadata['title']
    dc.Dataset.create_or_update(
        name=slugify.slugify(metadata['title']).lower(),
        title=metadata['title'],
        state='active',
        license_id='uk-ogl',
        notes=metadata['summary'],
        origin=metadata['source'],
        tags=dc.tags(*metadata['tags']),
        resources=resources,
        owner_org='hscic',
        #extras=extras
    )
    return
Example No. 20
def load_phe():

    metadata = json.load(open(DATA_DIR/'dataset.metadata.json'))

    resources = [
        dict(
            description=r['description'],
            name=r['name'],
            format=r['format'],
            url=r['url']
        )
        for r in metadata['resources']
    ]

    extras = [
            dict(key='frequency', value=metadata['frequency']),
        ]
    if 'publication_date' in metadata:
        extras.append(dict(key='publication_date', value=metadata['publication_date']))
    extras.append(dict(key='coverage_start_date', value=metadata.get('coverage_start_date','')))
    extras.append(dict(key='coverage_end_date', value=metadata.get('coverage_end_date')))

    print extras

    print 'Creating', metadata['title']
    dc.Dataset.create_or_update(
        name=slugify.slugify(metadata['title']).lower(),
        title=metadata['title'],
        state='active',
        license_id='uk-ogl',
        notes=metadata['summary'],
        origin=metadata['source'],
        tags=dc.tags(*metadata['tags']),
        resources=resources,
        owner_org='hscic',
        #extras=extras
    )
    return
def publish_datasets(start_from=0):
    datasetfile = DATA_DIR/'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} datasets'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))
    for dataset in datasets[start_from:]:
        logging.info('Processing {}'.format(dataset['title']))
        logging.info('ID: {}'.format(dataset['id']))
        try:
            resources = [
                dict(
                    description=s['description'],
                    name=s['url'].split('/')[-1],
                    format=s['filetype'],
                    upload=dc.fh_for_url(s['url'])
                )
                for s in dataset['sources']
            ]
            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\nKEY FACTS:\n==========\n\n' + dataset['key_facts']
            name = 'hscic_dataset_{}'.format(dataset['id'])
            dc.Dataset.create_or_update(
                name=name,
                title=dataset['title'],
                state='active',
                licence_id='ogl',
                notes=notes,
                url=dataset['source'],
                tags=dc.tags(*dataset['keywords']),
                resources=resources,
                owner_org='hscic'
            )
        except Exception as ex:
            logging.error(ex)
    return
def load_ods():
    for directory, metadata_file, metadata in datasets():
        print 'Processing', metadata['title'], metadata['name']
        try:
            dc.Dataset.create_or_update(
                name=metadata['name'],
                title=metadata['title'],
                state='active',
                license_id='uk-ogl',
                notes=metadata['notes'],
                origin=metadata['origin'],
                tags=dc.tags(*metadata['tags']),
                resources=metadata["resources"],
                owner_org='hscic',
                frequency=metadata['frequency'],
                extras=[
                    dict(key='coverage_start_date', value=metadata['coverage_start_date']),
                    dict(key='coverage_end_date', value=metadata['coverage_end_date']),
                 #   dict(key='publication_date', value=metadata['publication_date'])
                ]
            )
        except Exception:
            print "Failed to process", metadata['name']
    return
Example No. 23
def publish_datasets(start_from=0):
    datasetfile = DATA_DIR / 'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    amount = len(datasets)
    logging.info('Processing {} datasets'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
    for dataset in datasets[start_from:10]:  # NB: hard-capped at the first 10 records
        try:
            logging.info('{} of {}'.format(start_from, amount))
            start_from += 1
            logging.info('Processing {}'.format(dataset['title']))
            logging.info('ID: {}'.format(dataset['id']))
            resources = [
                dict(description=s['description'],
                     name=s['url'].split('/')[-1],
                     format=s['filetype'],
                     upload=dc.fh_for_url(s['url']))
                for s in dataset['sources']
            ]
            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\nKEY FACTS:\n==========\n\n' + dataset['key_facts']
            extras = [
                {
                    'key': 'Public Access Level',
                    'value': 'Public',
                },
                {
                    'key': 'Data Quality Assurance',
                    'value': 'False'
                },
                {
                    'key': 'Status',
                    'value': 'Live',
                },
            ]
            if 'date_range' in dataset:
                extras.append({
                    'key': 'Time period',
                    'value': dataset['date_range'],
                })
            if 'publication_date' in dataset:
                extras.append({
                    'key': 'Release date',
                    'value': dataset['publication_date'],
                })
            if 'geographical_coverage' in dataset:
                extras.append({
                    'key':
                    'Geographical coverage',
                    'value':
                    ', '.join(dataset['geographical_coverage'])
                })
            # groups
            groups = []
            for item in dataset['topics']:
                groups.append(item)
            for item in dataset['information_types']:
                groups.append(item)
            group_faff = []
            for g in groups:
                group_name = dc.ensure_group(g, 'HSCIC')
                group_faff.append({
                    'name': group_name,
                })
            name = 'hscic_dataset_{}'.format(dataset['id'])
            # NHSEngland metadata as comments...
            dc.Dataset.create_or_update(
                name=name,  # Unique ID
                title=dataset['title'],  # title
                notes=notes,  # description
                tags=dc.tags(*dataset['keywords']),  # tags
                extras=extras,
                state='active',
                licence_id='ogl',
                url=dataset['source'],
                resources=resources,
                groups=group_faff,
                owner_org='hscic'  # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
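`dc.ensure_group(g, 'HSCIC')` is also not shown. Given the `dc.ckan.action.*` calls used elsewhere on this page, a plausible get-or-create sketch against the CKAN group API (hypothetical; `dc.ckan` is the client already used above):

import slugify

def ensure_group(title, description=''):
    # Hypothetical: return the group's slug, creating the group through
    # the CKAN action API only if it does not exist yet.
    name = slugify.slugify(title).lower()
    try:
        dc.ckan.action.group_show(id=name)
    except Exception:
        dc.ckan.action.group_create(name=name, title=title,
                                    description=description)
    return name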
Example No. 24
def publish_indicators(start_from=0):
    indicatorfile = DATA_DIR / 'indicators.json'
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    amount = len(indicators)
    logging.info('Processing {} indicators'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
    for indicator in indicators[start_from:10]:
        logging.info('{} of {}'.format(start_from, amount))
        start_from += 1
        try:
            logging.info('Processing {}'.format(indicator['title']))
            logging.info('ID: {}'.format(
                indicator['unique identifier'].lower()))
            resources = [
                dict(description=s['description'],
                     name=s['url'].split('/')[-1],
                     format=s['filetype'],
                     upload=dc.fh_for_url(s['url']))
                for s in indicator['sources']
            ]
            name = 'hscic_indicator_{}'.format(
                indicator['unique identifier'].lower())
            # Metadata specified by NHSEngland identified in comments...
            dc.Dataset.create_or_update(
                name=name,  # Unique ID
                title=indicator['title'],  #title
                notes=indicator['definition'],  # description
                tags=dc.tags(*indicator['keyword(s)']),  # tags
                extras=[
                    {
                        'key': 'Public Access Level',
                        'value': 'Public',
                    },
                    {
                        'key': 'Data Quality Assurance',
                        'value': 'False'
                    },
                    {
                        'key': 'Release Date',
                        'value': indicator['current version uploaded'],
                    },
                    {
                        'key': 'Status',
                        'value': 'Live',
                    },
                ],
                state='active',
                licence_id='ogl',
                url='https://indicators.ic.nhs.uk/webview/',
                resources=resources,
                groups=[
                    {
                        'name': 'indicators'
                    },
                ],
                owner_org='hscic'  # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
Example No. 25
def load_ccgois(datasets):
    for metadata in datasets:
        resources = [
            dict(description=r['description'],
                 name=r['name'],
                 format=r['filetype'],
                 url=r['url'])
            for r in metadata['resources']
        ]

        print [r['name'] for r in metadata['resources']]

        metadata['title'] = u'CCGOIS - {}'.format(metadata['title'])
        metadata['name'] = make_name_from_title(metadata['title'])
        print u'Creating {}'.format(metadata['name'])
        dc.Dataset.create_or_update(
            name=metadata['name'],
            title=metadata['title'],
            state='active',
            license_id='uk-ogl',
            notes=metadata['description'],
            origin='https://indicators.ic.nhs.uk/webview/',
            tags=dc.tags(*metadata['keyword(s)']),
            resources=resources,
            #frequency=[metadata['frequency'], ],
            owner_org='hscic',
            extras=[
                dict(key='frequency', value=metadata.get('frequency', '')),
                dict(key='coverage_start_date', value=metadata['coverage_start_date']),
                dict(key='coverage_end_date', value=metadata['coverage_end_date']),
                dict(key='domain', value=metadata['domain']),
                dict(key='origin', value='HSCIC'),
                dict(key='next_version_due', value=metadata['next version due']),
                dict(key='nhs_OF_indicators', value=metadata['nhs_of_indicators']),
                dict(key='HSCIC_unique_id', value=metadata['unique identifier']),
                dict(key='homepage', value=metadata['homepage']),
                dict(key='status', value=metadata['status']),
                dict(key='language', value=metadata['language']),
                dict(key='assurance_level', value=metadata['assurance_level']),
                dict(key='release_date', value=metadata['current version uploaded'])
            ])
    return
Example No. 26
def publish_datasets(start_from=0):
    datasetfile = DATA_DIR/'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    amount = len(datasets)
    logging.info('Processing {} datasets'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
    for dataset in datasets[start_from:10]:  # NB: hard-capped at the first 10 records
        try:
            logging.info('{} of {}'.format(start_from, amount))
            start_from += 1
            logging.info('Processing {}'.format(dataset['title']))
            logging.info('ID: {}'.format(dataset['id']))
            resources = [
                dict(
                    description=s['description'],
                    name=s['url'].split('/')[-1],
                    format=s['filetype'],
                    upload=dc.fh_for_url(s['url'])
                )
                for s in dataset['sources']
            ]
            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\nKEY FACTS:\n==========\n\n' + dataset['key_facts']
            extras = [
                {'key': 'Public Access Level',
                 'value': 'Public',},
                {'key': 'Data Quality Assurance',
                 'value': 'False'},
                {'key': 'Status',
                 'value': 'Live',},
            ]
            if 'date_range' in dataset:
                extras.append({
                    'key': 'Time period',
                    'value': dataset['date_range'],
                })
            if 'publication_date' in dataset:
                extras.append({
                    'key': 'Release date',
                    'value': dataset['publication_date'],
                })
            if 'geographical_coverage' in dataset:
                extras.append({
                    'key': 'Geographical coverage',
                    'value': ', '.join(dataset['geographical_coverage'])
                })
            # groups
            groups = []
            for item in dataset['topics']:
                groups.append(item)
            for item in dataset['information_types']:
                groups.append(item)
            group_faff = []
            for g in groups:
                group_name = dc.ensure_group(g, 'HSCIC')
                group_faff.append({
                    'name': group_name,
                })
            name = 'hscic_dataset_{}'.format(dataset['id'])
            # NHSEngland metadata as comments...
            dc.Dataset.create_or_update(
                name=name, # Unique ID
                title=dataset['title'], # title
                notes=notes, # description
                tags=dc.tags(*dataset['keywords']), # tags
                extras=extras,
                state='active',
                licence_id='ogl',
                url=dataset['source'],
                resources=resources,
                groups=group_faff,
                owner_org='hscic' # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
def publish_datasets(start_from=0):
    global DATA_DIR

    u = Uploader("hscic-datasets")

    datasetfile = ffs.Path(get_resource_path('datasets.json'))
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} datasets'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))

    import random
    total = len(datasets) - start_from
    current = 1

    for dataset in datasets[start_from:]:
        print "STATUS: {}/{}".format(current, total)
        current += 1

        #print u'Processing {}'.format(dataset['title'])
        #print '  ID: {}'.format(dataset['id'])
        try:
            resources = []
            for s in dataset['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'],
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                resource['url'] = u.upload(path)
                """
                resources.append(resource)

            if not resources:
                print "Dataset {} does not have any resources".format(
                    dataset['id'])
                continue

            title = dataset['title']

            c = Curator(dataset)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)
            name = slugify.slugify(title).lower()[0:99]

            # Call clean_tag on each keyword and expect back a list, which is then flattened

            tags = []
            if 'keywords' in dataset:
                dataset['keywords'] = sum([
                    clean_tag(k)
                    for k in dataset.get('keywords', []) if len(k) > 2
                ], [])
                tags = dc.tags(*dataset['keywords'])

            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\n<h2>KEY FACTS:</h2>\n' + ''.join(
                    dataset['key_facts'])
            notes = to_markdown(notes)

            name = 'hscic_dataset_{}'.format(dataset['id'])

            dc.Dataset.create_or_update(name=name,
                                        title=title,
                                        state='active',
                                        licence_id='ogl',
                                        notes=notes,
                                        url=dataset['source'],
                                        tags=tags,
                                        resources=resources,
                                        owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(id=name)
                except Exception:
                    continue

                for group in groups:
                    group = group.lower()

                    if any(g['name'] == group
                           for g in dataset.get('groups', [])):
                        print 'Already in group', group
                    else:
                        dc.ensure_group(group)
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception as ex:
            import traceback
            traceback.print_exc()

    u.close()
    return
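`clean_tag` is referenced but never defined here. CKAN tags must be alphanumeric (dashes and underscores allowed), and the `sum(..., [])` call above flattens per-keyword lists, so a plausible sketch returns a list of zero or one cleaned tags (hypothetical):

import re

def clean_tag(keyword):
    # Hypothetical: normalise a scraped keyword into zero or more
    # CKAN-safe tags (word characters, dashes and spaces, max 100 chars).
    cleaned = re.sub(r'[^\w\- ]', '', keyword).strip()[:100]
    return [cleaned] if cleaned else []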
def publish_indicators(start_from=0):
    global DATA_DIR
    u = Uploader("hscic-indicators")

    indicatorfile = ffs.Path(get_resource_path('indicators.json'))
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    logging.info('Processing {} indicators'.format(len(indicators)))
    logging.info('Starting from record {}'.format(start_from))
    for indicator in indicators[start_from:]:
        try:
            resources = []
            for s in indicator['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'].upper(),
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                print "Uploading to S3"
                url = u.upload(path)
                resource['url'] = url
                """
                resources.append(resource)

            if 'indicators' not in indicator['keyword(s)']:
                indicator['keyword(s)'].append('indicators')

            title = indicator['title']

            c = Curator(indicator)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)

            tags = []
            if 'keyword(s)' in indicator:
                indicator['keyword(s)'] = sum([
                    clean_tag(k)
                    for k in indicator.get('keyword(s)', []) if len(k) > 2
                ], [])
                tags = dc.tags(*indicator['keyword(s)'])

            print '+ Create/Update dataset {}'.format(indicator['title'])
            dc.Dataset.create_or_update(
                name=slugify.slugify(title).lower()[:99],
                title=title,
                state='active',
                licence_id='ogl',
                notes=to_markdown(indicator['definition'].encode('utf8')),
                url='https://indicators.ic.nhs.uk/webview/',
                tags=tags,
                resources=resources,
                owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(
                        id=slugify.slugify(title)[:99].lower())
                except Exception:
                    continue

                for group in groups:
                    group = group.lower()

                    if any(g['name'] == group
                           for g in dataset.get('groups', [])):
                        print 'Already in group', group
                    else:
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')

        except Exception as ex:
            import traceback
            traceback.print_exc()
            import sys
            sys.exit(1)

    u.close()
    return
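`Uploader` is constructed with a key prefix and exposes `upload(path)` (returning a URL, per the commented-out blocks above) and `close()`. The "Uploading to S3" message suggests an S3 backend; a minimal sketch with boto3, in which the bucket name and URL format are assumptions:

import os

import boto3

class Uploader(object):
    # Hypothetical stand-in: pushes files under a key prefix in S3 and
    # returns a public URL, matching how u.upload(path) is used above.
    def __init__(self, prefix, bucket='nhs-ckan-uploads'):  # bucket name assumed
        self.prefix = prefix
        self.bucket = bucket
        self.s3 = boto3.client('s3')

    def upload(self, path):
        key = '{}/{}'.format(self.prefix, os.path.basename(str(path)))
        self.s3.upload_file(str(path), self.bucket, key)
        return 'https://{}.s3.amazonaws.com/{}'.format(self.bucket, key)

    def close(self):
        pass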