Example #1
0
    def validator(self):
        """Fetch a user-supplied URL, check it is a valid data.json file,
        and render the validator results page.

        Errors (and a "No Errors" success entry) are accumulated on
        ``c.errors`` as ``(title, [messages])`` tuples for the template.
        """
        has_url = (request.method == "POST"
                   and "url" in request.POST
                   and request.POST["url"].strip() != "")
        if has_url:
            import urllib
            import json
            from datajsonvalidator import do_validation

            c.source_url = request.POST["url"]
            c.errors = []

            parsed = None
            try:
                parsed = json.load(urllib.urlopen(c.source_url))
            except IOError as err:
                c.errors.append(("Error Loading File",
                                 ["The address could not be loaded: " + unicode(err)]))
            except ValueError as err:
                c.errors.append(("Invalid JSON",
                                 ["The file does not meet basic JSON syntax requirements: "
                                  + unicode(err) + ". Try using JSONLint.com."]))
            except Exception as err:
                c.errors.append(("Internal Error",
                                 ["Something bad happened while trying to load and parse the file: "
                                  + unicode(err)]))

            # Only run schema validation when the document actually parsed.
            if parsed:
                try:
                    do_validation(parsed, c.errors)
                except Exception as err:
                    c.errors.append(("Internal Error", ["Something bad happened: " + unicode(err)]))
                if not c.errors:
                    c.errors.append(("No Errors", ["Great job!"]))

        return render('datajsonvalidator.html')
Example #2
0
    def validate(pkg, dataset_dict):
        """Validate *dataset_dict* against the Project Open Data schema.

        Returns the (normalized) dataset dict on success, or an OrderedDict
        describing the validation errors (id/name/title/organization/errors)
        on failure. Unexpected exceptions are logged with their location and
        re-raised.
        """
        import sys, os

        global currentPackageOrg

        try:
            # When saved from UI DataQuality value is stored as "on" instead of True.
            # Normalize the string variants to real booleans.
            dataset_dict = OrderedDict(dataset_dict)
            data_quality = dataset_dict.get('dataQuality')
            if data_quality in ("on", "true", "True"):
                dataset_dict['dataQuality'] = True
            elif data_quality in ("false", "False"):
                dataset_dict['dataQuality'] = False

            errors = []
            try:
                from datajsonvalidator import do_validation
                do_validation([dict(dataset_dict)], errors,
                              Package2Pod.seen_identifiers)
            except Exception as e:
                errors.append(("Internal Error",
                               ["Something bad happened: " + unicode(e)]))
            if len(errors) > 0:
                for error in errors:
                    log.warn(error)

                # currentPackageOrg may never have been assigned by the exporter.
                try:
                    currentPackageOrg
                except NameError:
                    currentPackageOrg = 'unknown'

                errors_dict = OrderedDict([
                    ('id', pkg.get('id')),
                    ('name', Package2Pod.filter(pkg.get('name'))),
                    ('title', Package2Pod.filter(pkg.get('title'))),
                    ('organization', Package2Pod.filter(currentPackageOrg)),
                    ('errors', errors),
                ])

                return errors_dict

            return dataset_dict
        except Exception:
            # Log where the failure occurred, then re-raise with a bare
            # `raise` so the ORIGINAL traceback is preserved (`raise e`
            # resets the traceback on Python 2).
            exc_type, _, exc_tb = sys.exc_info()
            filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            log.error("%s : %s : %s", exc_type, filename, exc_tb.tb_lineno)
            raise
Example #3
0
    def validate(pkg, dataset_dict):
        """Validate *dataset_dict* against the Project Open Data schema.

        Returns the (normalized) dataset dict when validation passes;
        otherwise returns an OrderedDict of error details
        (id/name/title/organization/errors). Unexpected exceptions are
        logged with their location and re-raised.
        """
        import sys, os

        global currentPackageOrg

        try:
            # When saved from UI DataQuality value is stored as "on" instead of True.
            # Normalize the string variants to real booleans.
            dataset_dict = OrderedDict(dataset_dict)
            dq = dataset_dict.get('dataQuality')
            if dq in ("on", "true", "True"):
                dataset_dict['dataQuality'] = True
            elif dq in ("false", "False"):
                dataset_dict['dataQuality'] = False

            errors = []
            try:
                from datajsonvalidator import do_validation
                do_validation([dict(dataset_dict)], errors, Package2Pod.seen_identifiers)
            except Exception as e:
                errors.append(("Internal Error", ["Something bad happened: " + unicode(e)]))
            if len(errors) > 0:
                for error in errors:
                    log.warn(error)

                # currentPackageOrg may never have been assigned by the exporter.
                try:
                    currentPackageOrg
                except NameError:
                    currentPackageOrg = 'unknown'

                errors_dict = OrderedDict([
                    ('id', pkg.get('id')),
                    ('name', Package2Pod.filter(pkg.get('name'))),
                    ('title', Package2Pod.filter(pkg.get('title'))),
                    ('organization', Package2Pod.filter(currentPackageOrg)),
                    ('errors', errors),
                ])

                return errors_dict

            return dataset_dict
        except Exception:
            # Bare `raise` preserves the original traceback; `raise e`
            # would reset it on Python 2.
            exc_type, _, exc_tb = sys.exc_info()
            filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            log.error("%s : %s : %s", exc_type, filename, exc_tb.tb_lineno)
            raise
    def make_datajson_export_entry(package, seen_identifiers):
        """Build and validate a Project Open Data (data.json) entry for one
        CKAN *package*.

        *seen_identifiers* is passed through to ``do_validation`` so duplicate
        dataset identifiers across the export can be detected. Returns an
        OrderedDict of the dataset's POD fields on success, or an OrderedDict
        describing the errors when a required field is missing or validation
        fails.
        """
        global currentPackageOrg
        currentPackageOrg = None
        # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict
        extras = dict([(x['key'], x['value']) for x in package['extras']])

        # A child dataset names its parent by CKAN id; prefer the parent's
        # POD unique_id when one exists.
        parent_dataset_id = extras.get('parent_dataset')
        if parent_dataset_id:
            parent = model.Package.get(parent_dataset_id)
            parent_uid = parent.extras.col.target['unique_id'].value
            if parent_uid:
                parent_dataset_id = parent_uid

        # if resource format is CSV then convert it to text/csv
        # Resource format has to be in 'csv' format for automatic datastore push.
        for r in package["resources"]:
            if r["format"].lower() == "csv":
                r["format"] = "text/csv"
            if r["format"].lower() == "json":
                r["format"] = "application/json"
            if r["format"].lower() == "pdf":
                r["format"] = "application/pdf"

        try:
            retlist = [
                ("@type", "dcat:Dataset"),  # optional

                ("title", JsonExportBuilder.strip_if_string(package["title"])),  # required

                # ("accessLevel", 'public'),  # required
                ("accessLevel", JsonExportBuilder.strip_if_string(extras.get('public_access_level'))),  # required

                # ("accrualPeriodicity", "R/P1Y"),  # optional
                # ('accrualPeriodicity', 'accrual_periodicity'),
                ('accrualPeriodicity', JsonExportBuilder.get_accrual_periodicity(extras.get('accrual_periodicity'))),
                # optional

                ("conformsTo", JsonExportBuilder.strip_if_string(extras.get('conforms_to'))),  # optional

                # ('contactPoint', OrderedDict([
                # ("@type", "vcard:Contact"),
                # ("fn", "Jane Doe"),
                # ("hasEmail", "mailto:[email protected]")
                # ])),  # required
                ('contactPoint', JsonExportBuilder.get_contact_point(extras)),  # required

                ("dataQuality", JsonExportBuilder.strip_if_string(extras.get('data_quality'))),
                # required-if-applicable

                ("describedBy", JsonExportBuilder.strip_if_string(extras.get('data_dictionary'))),  # optional
                ("describedByType", JsonExportBuilder.strip_if_string(extras.get('data_dictionary_type'))),  # optional

                ("description", JsonExportBuilder.strip_if_string(package["notes"])),  # required

                # ("description", 'asdfasdf'),  # required

                ("identifier", JsonExportBuilder.strip_if_string(extras.get('unique_id'))),  # required
                # ("identifier", 'asdfasdfasdf'),  # required

                ("isPartOf", parent_dataset_id),  # optional
                ("issued", JsonExportBuilder.strip_if_string(extras.get('release_date'))),  # optional

                # ("keyword", ['a', 'b']),  # required
                ("keyword", [t["display_name"] for t in package["tags"]]),  # required

                ("landingPage", JsonExportBuilder.strip_if_string(extras.get('homepage_url'))),  # optional

                ("license", JsonExportBuilder.strip_if_string(extras.get("license_new"))),  # required-if-applicable

                ("modified",
                 JsonExportBuilder.strip_if_string(extras.get("modified", package.get("metadata_modified")))),
                # required

                ("primaryITInvestmentUII", JsonExportBuilder.strip_if_string(extras.get('primary_it_investment_uii'))),
                # optional

                # ('publisher', OrderedDict([
                # ("@type", "org:Organization"),
                # ("name", "Widget Services")
                # ])),  # required
                # ("publisher", get_publisher_tree(extras)),  # required
                ("publisher", JsonExportBuilder.get_publisher_tree_wrong_order(extras)),  # required

                ("rights", JsonExportBuilder.strip_if_string(extras.get('access_level_comment'))),  # required

                ("spatial", JsonExportBuilder.strip_if_string(package.get("spatial"))),  # required-if-applicable

                ('systemOfRecords', JsonExportBuilder.strip_if_string(extras.get('system_of_records'))),  # optional

                ("temporal", JsonExportBuilder.strip_if_string(extras.get('temporal'))),  # required-if-applicable

                ("distribution", JsonExportBuilder.generate_distribution(package)),  # required-if-applicable

                # ("distribution",
                # #TODO distribution should hide any key/value pairs where value is "" or None (e.g. format)
                # [
                # OrderedDict([
                # ("downloadURL", r["url"]),
                # ("mediaType", r["formatReadable"]),
                # ])
                # for r in package["resources"]
                # ])
            ]

            for pair in [
                #('bureauCode', 'bureau_code'),  # required
                ('language', 'language'),  # optional
                #('programCode', 'program_code'),  # required
                ('references', 'related_documents'),  # optional
                ('theme', 'category'),  # optional
            ]:
                JsonExportBuilder.split_multiple_entries(retlist, extras, pair)

        except KeyError as e:
            log.warn("Missing Required Field for package with id=[%s], title=['%s'], organization=['%s']: '%s'" % (
                package.get('id'), package.get('title'), currentPackageOrg, e))

            # NOTE(review): unlike the validation path below, this is a flat
            # [title, [messages]] list rather than a list of tuples — confirm
            # downstream consumers accept both shapes.
            errors = ['Missing Required Field', ["%s" % e]]
            errors_dict = OrderedDict([
                ('id', package.get('id')),
                ('name', package.get('name')),
                ('title', package.get('title')),
                ('organization', currentPackageOrg),
                ('errors', errors),
            ])

            return errors_dict

        # Remove entries where value is None, "", or empty list []
        striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []]

        # When saved from UI DataQuality value is stored as "on" instead of True.
        # Check if value is "on" and replace it with True.
        striped_retlist_dict = OrderedDict(striped_retlist)
        if striped_retlist_dict.get('dataQuality') == "on" \
                or striped_retlist_dict.get('dataQuality') == "true" \
                or striped_retlist_dict.get('dataQuality') == "True":
            striped_retlist_dict['dataQuality'] = True
        elif striped_retlist_dict.get('dataQuality') == "false" \
                or striped_retlist_dict.get('dataQuality') == "False":
            striped_retlist_dict['dataQuality'] = False

        # Fix: import do_validation before use, matching the sibling variant
        # of this function; the visible function body otherwise has no
        # binding for the name (idempotent if already imported at module level).
        from datajsonvalidator import do_validation

        errors = []
        try:
            do_validation([dict(striped_retlist_dict)], errors, seen_identifiers)
        except Exception as e:
            errors.append(("Internal Error", ["Something bad happened: " + unicode(e)]))
        if len(errors) > 0:
            for error in errors:
                log.warn(error)

            errors_dict = OrderedDict([
                ('id', package.get('id')),
                ('name', package.get('name')),
                ('title', package.get('title')),
                ('organization', currentPackageOrg),
                ('errors', errors),
            ])

            return errors_dict

        return striped_retlist_dict
    def make_datajson_export_entry(package):
        """Build and validate a Project Open Data (data.json) entry for one
        CKAN *package*.

        Returns an OrderedDict of the dataset's POD fields on success, or an
        OrderedDict describing the errors when a required field is missing or
        schema validation fails.
        """
        global currentPackageOrg
        currentPackageOrg = None
        # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict
        extras = dict([(x['key'], x['value']) for x in package['extras']])

        # A child dataset names its parent by CKAN id; prefer the parent's
        # POD unique_id when one exists.
        parent_dataset_id = extras.get('parent_dataset')
        if parent_dataset_id:
            parent = model.Package.get(parent_dataset_id)
            parent_uid = parent.extras.col.target['unique_id'].value
            if parent_uid:
                parent_dataset_id = parent_uid

        # if resource format is CSV then convert it to text/csv
        # Resource format has to be in 'csv' format for automatic datastore push.
        for r in package["resources"]:
            if r["format"].lower() == "csv":
                r["format"] = "text/csv"
            if r["format"].lower() == "json":
                r["format"] = "application/json"
            if r["format"].lower() == "pdf":
                r["format"] = "application/pdf"

        # Assemble the POD field list; a KeyError from a missing required
        # package field is turned into an errors dict below.
        try:
            retlist = [
                ("@type", "dcat:Dataset"),  # optional

                ("title", JsonExportBuilder.strip_if_string(package["title"])),  # required

                # ("accessLevel", 'public'),  # required
                ("accessLevel", JsonExportBuilder.strip_if_string(extras.get('public_access_level'))),  # required

                # ("accrualPeriodicity", "R/P1Y"),  # optional
                # ('accrualPeriodicity', 'accrual_periodicity'),
                ('accrualPeriodicity', JsonExportBuilder.get_accrual_periodicity(extras.get('accrual_periodicity'))),
                # optional

                ("conformsTo", JsonExportBuilder.strip_if_string(extras.get('conforms_to'))),  # optional

                # ('contactPoint', OrderedDict([
                # ("@type", "vcard:Contact"),
                # ("fn", "Jane Doe"),
                # ("hasEmail", "mailto:[email protected]")
                # ])),  # required
                ('contactPoint', JsonExportBuilder.get_contact_point(extras)),  # required

                ("dataQuality", JsonExportBuilder.strip_if_string(extras.get('data_quality'))),
                # required-if-applicable

                ("describedBy", JsonExportBuilder.strip_if_string(extras.get('data_dictionary'))),  # optional
                ("describedByType", JsonExportBuilder.strip_if_string(extras.get('data_dictionary_type'))),  # optional

                ("description", JsonExportBuilder.strip_if_string(package["notes"])),  # required

                # ("description", 'asdfasdf'),  # required

                ("identifier", JsonExportBuilder.strip_if_string(extras.get('unique_id'))),  # required
                # ("identifier", 'asdfasdfasdf'),  # required

                ("isPartOf", parent_dataset_id),  # optional
                ("issued", JsonExportBuilder.strip_if_string(extras.get('release_date'))),  # optional

                # ("keyword", ['a', 'b']),  # required
                ("keyword", [t["display_name"] for t in package["tags"]]),  # required

                ("landingPage", JsonExportBuilder.strip_if_string(extras.get('homepage_url'))),  # optional

                ("license", JsonExportBuilder.strip_if_string(extras.get("license_new"))),  # required-if-applicable

                ("modified",
                 JsonExportBuilder.strip_if_string(extras.get("modified", package.get("metadata_modified")))),
                # required

                ("primaryITInvestmentUII", JsonExportBuilder.strip_if_string(extras.get('primary_it_investment_uii'))),
                # optional

                # ('publisher', OrderedDict([
                # ("@type", "org:Organization"),
                # ("name", "Widget Services")
                # ])),  # required
                # ("publisher", get_publisher_tree(extras)),  # required
                ("publisher", JsonExportBuilder.get_publisher_tree_wrong_order(extras)),  # required

                ("rights", JsonExportBuilder.strip_if_string(extras.get('access_level_comment'))),  # required

                ("spatial", JsonExportBuilder.strip_if_string(package.get("spatial"))),  # required-if-applicable

                ('systemOfRecords', JsonExportBuilder.strip_if_string(extras.get('system_of_records'))),  # optional

                ("temporal", JsonExportBuilder.strip_if_string(extras.get('temporal'))),  # required-if-applicable

                ("distribution", JsonExportBuilder.generate_distribution(package)),  # required-if-applicable

                # ("distribution",
                # #TODO distribution should hide any key/value pairs where value is "" or None (e.g. format)
                # [
                # OrderedDict([
                # ("downloadURL", r["url"]),
                # ("mediaType", r["formatReadable"]),
                # ])
                # for r in package["resources"]
                # ])
            ]

            # Comma-separated extras become repeated POD entries.
            for pair in [
                ('bureauCode', 'bureau_code'),  # required
                ('language', 'language'),  # optional
                ('programCode', 'program_code'),  # required
                ('references', 'related_documents'),  # optional
                ('theme', 'category'),  # optional
            ]:
                JsonExportBuilder.split_multiple_entries(retlist, extras, pair)

        except KeyError as e:
            log.warn("Missing Required Field for package with id=[%s], title=['%s'], organization=['%s']: '%s'" % (
                package.get('id'), package.get('title'), currentPackageOrg, e))

            # NOTE(review): this is a flat [title, [messages]] list, while the
            # validation path below appends (title, [messages]) tuples —
            # confirm downstream consumers accept both shapes.
            errors = ['Missing Required Field', ["%s" % e]]
            errors_dict = OrderedDict([
                ('id', package.get('id')),
                ('name', package.get('name')),
                ('title', package.get('title')),
                ('organization', currentPackageOrg),
                ('errors', errors),
            ])

            return errors_dict

        # Remove entries where value is None, "", or empty list []
        striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []]

        # When saved from UI DataQuality value is stored as "on" instead of True.
        # Check if value is "on" and replace it with True.
        striped_retlist_dict = OrderedDict(striped_retlist)
        if striped_retlist_dict.get('dataQuality') == "on" \
                or striped_retlist_dict.get('dataQuality') == "true" \
                or striped_retlist_dict.get('dataQuality') == "True":
            striped_retlist_dict['dataQuality'] = True
        elif striped_retlist_dict.get('dataQuality') == "false" \
                or striped_retlist_dict.get('dataQuality') == "False":
            striped_retlist_dict['dataQuality'] = False

        from datajsonvalidator import do_validation

        # Validate the assembled entry; any errors (including validator
        # crashes) are logged and returned as an errors dict.
        errors = []
        try:
            do_validation([dict(striped_retlist_dict)], errors)
        except Exception as e:
            errors.append(("Internal Error", ["Something bad happened: " + unicode(e)]))
        if len(errors) > 0:
            for error in errors:
                log.warn(error)

            errors_dict = OrderedDict([
                ('id', package.get('id')),
                ('name', package.get('name')),
                ('title', package.get('title')),
                ('organization', currentPackageOrg),
                ('errors', errors),
            ])

            return errors_dict

        return striped_retlist_dict
def make_datajson_entry(package):
    """Build and validate a Project Open Data (data.json) entry for one CKAN
    *package*.

    Returns an OrderedDict of the dataset's POD fields on success, or None
    when a required field raises KeyError or validation reports errors.
    """
    # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict
    extras = dict([(x['key'], x['value']) for x in package['extras']])

    # A child dataset names its parent by CKAN id; prefer the parent's
    # POD unique_id when one exists.
    parent_dataset_id = extras.get('parent_dataset')
    if parent_dataset_id:
        parent = model.Package.get(parent_dataset_id)
        parent_uid = parent.extras.col.target['unique_id'].value
        if parent_uid:
            parent_dataset_id = parent_uid

    # if resource format is CSV then convert it to text/csv
    # Resource format has to be in 'csv' format for automatic datastore push.
    for r in package["resources"]:
        if r["format"].lower() == "csv":
            r["format"] = "text/csv"
        if r["format"].lower() == "json":
            r["format"] = "application/json"
        if r["format"].lower() == "pdf":
            r["format"] = "application/pdf"

    # Assemble the POD field list; a KeyError from a missing package field
    # aborts the export for this dataset (returns None below).
    try:
        retlist = [
            ("@type", "dcat:Dataset"),  # optional

            ("title", strip_if_string(package["title"])),  # required

            # ("accessLevel", 'public'),  # required
            ("accessLevel", strip_if_string(extras.get('public_access_level'))),  # required

            # ("accrualPeriodicity", "R/P1Y"),  # optional
            # ('accrualPeriodicity', 'accrual_periodicity'),
            ('accrualPeriodicity', get_accrual_periodicity(extras.get('accrual_periodicity'))), # optional

            ("conformsTo", strip_if_string(extras.get('conforms_to'))),  # optional

            # ('contactPoint', OrderedDict([
            # ("@type", "vcard:Contact"),
            # ("fn", "Jane Doe"),
            # ("hasEmail", "mailto:[email protected]")
            # ])),  # required
            ('contactPoint', get_contact_point(extras, package)),  # required

            ("dataQuality", strip_if_string(extras.get('data_quality'))),  # required-if-applicable

            ("describedBy", strip_if_string(extras.get('data_dictionary'))),  # optional
            ("describedByType", strip_if_string(extras.get('data_dictionary_type'))),  # optional

            ("description", strip_if_string(package["notes"])),  # required

            # ("description", 'asdfasdf'),  # required

            ("identifier", strip_if_string(extras.get('unique_id'))),  # required
            # ("identifier", 'asdfasdfasdf'),  # required

            ("isPartOf", parent_dataset_id),  # optional
            ("issued", strip_if_string(extras.get('release_date'))),  # optional

            # ("keyword", ['a', 'b']),  # required
            ("keyword", [t["display_name"] for t in package["tags"]]),  # required

            ("landingPage", strip_if_string(extras.get('homepage_url'))),   # optional

            ("license", strip_if_string(extras.get("license_new"))),    # required-if-applicable

            ("modified", strip_if_string(extras.get("modified"))),  # required

            ("primaryITInvestmentUII", strip_if_string(extras.get('primary_it_investment_uii'))),  # optional

            # ('publisher', OrderedDict([
            # ("@type", "org:Organization"),
            # ("name", "Widget Services")
            # ])),  # required
            ("publisher", get_publisher_tree(extras)),  # required

            ("rights", strip_if_string(extras.get('access_level_comment'))),  # required

            ("spatial", strip_if_string(package.get("spatial"))),  # required-if-applicable

            ('systemOfRecords', strip_if_string(extras.get('system_of_records'))),  # optional

            ("temporal", strip_if_string(extras.get('temporal'))),  # required-if-applicable

            ("distribution", generate_distribution(package)),   # required-if-applicable

            # ("distribution",
            # #TODO distribution should hide any key/value pairs where value is "" or None (e.g. format)
            # [
            # OrderedDict([
            # ("downloadURL", r["url"]),
            # ("mediaType", r["formatReadable"]),
            # ])
            #      for r in package["resources"]
            #  ])
        ]

        # Comma-separated extras become repeated POD entries.
        for pair in [
            ('bureauCode', 'bureau_code'),  # required
            ('language', 'language'),   # optional
            ('programCode', 'program_code'),    # required
            ('references', 'related_documents'),    # optional
            ('theme', 'category'),  # optional
        ]:
            split_multiple_entries(retlist, extras, pair)

    except KeyError as e:
        log.warn("Invalid field detected for package with id=[%s], title=['%s']: '%s'", package.get('id'),
                 package.get('title'), e)
        return

    # # TODO this is a lazy hack to make sure we don't have redundant fields when the free form key/value pairs are added
    # extras_to_filter_out = ['publisher', 'contact_name', 'contact_email', 'unique_id', 'public_access_level',
    # 'data_dictionary', 'bureau_code', 'program_code', 'access_level_comment', 'license_title',
    # 'spatial', 'temporal', 'release_date', 'accrual_periodicity', 'language', 'granularity',
    # 'data_quality', 'size', 'homepage_url', 'rss_feed', 'category', 'related_documents',
    # 'system_of_records', 'system_of_records_none_related_to_this_dataset', 'tags',
    # 'extrasRollup', 'format', 'accessURL', 'notes', 'publisher_1', 'publisher_2', 'publisher_3',
    # 'publisher_4', 'publisher_5']
    #
    # # Append any free extras (key/value pairs) that aren't part of common core but have been associated with the dataset
    # # TODO really hackey, short on time, had to hardcode a lot of the names to remove. there's much better ways, maybe
    # # generate a list of keys to ignore by calling a specific function to get the extras
    # retlist_keys = [x for x, y in retlist]
    # extras_keys = set(extras.keys()) - set(extras_to_filter_out)
    #
    # for key in extras_keys:
    # convertedKey = underscore_to_camelcase(key)
    # if convertedKey not in retlist_keys:
    # retlist.append((convertedKey, extras[key]))

    # Remove entries where value is None, "", or empty list []
    striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []]
    striped_retlist_keys = [x for x, y in striped_retlist]


    # If a required metadata field was removed, return empty string
    # for required_field in ["accessLevel", "bureauCode", "contactPoint", "description", "identifier", "keyword",
    #                        "modified", "programCode", "publisher", "title"]:
    #     if required_field not in striped_retlist_keys:
    #         log.warn("Missing required field detected for package with id=[%s], title=['%s']: '%s'",
    #                  package.get('id'), package.get('title'), required_field)
    #         return

    # When saved from UI DataQuality value is stored as "on" instead of True.
    # Check if value is "on" and replace it with True.
    striped_retlist_dict = OrderedDict(striped_retlist)
    if striped_retlist_dict.get('dataQuality') == "on" \
            or striped_retlist_dict.get('dataQuality') == "true" \
            or striped_retlist_dict.get('dataQuality') == "True":
        striped_retlist_dict['dataQuality'] = True
    elif striped_retlist_dict.get('dataQuality') == "false" \
            or striped_retlist_dict.get('dataQuality') == "False":
        striped_retlist_dict['dataQuality'] = False

    # Validate the assembled entry; errors (including validator crashes) are
    # logged and the dataset is dropped from the export (returns None).
    from datajsonvalidator import do_validation
    errors = []
    try:
        do_validation([dict(striped_retlist_dict)], errors)
    except Exception as e:
        errors.append(("Internal Error", ["Something bad happened: " + unicode(e)]))
    if len(errors) > 0:
        for error in errors:
            log.warn(error)
        return

    return striped_retlist_dict