Exemplo n.º 1
0
def test_sub_sheet_list_like():
    """A Sheet can be appended to and iterated like a list of field names."""
    sheet = Sheet()

    # Nothing has been added yet, so iterating yields no fields.
    assert list(sheet) == []

    # .append() is the call made by json_input.py, see
    # https://github.com/OpenDataServices/flatten-tool/blob/master/flattentool/json_input.py#L33
    for name in ('a', 'b'):
        sheet.append(name)
    assert list(sheet) == ['a', 'b']

    # add_field() with no options also appends to the end ...
    sheet.add_field('c')
    assert list(sheet) == ['a', 'b', 'c']

    # ... whereas id_field=True puts the field at the start of the list.
    sheet.add_field('d', id_field=True)
    assert list(sheet) == ['d', 'a', 'b', 'c']
def test_sub_sheet_list_like():
    """Check Sheet's list-like interface and where add_field() inserts."""
    sheet = Sheet()
    empty = []
    assert list(sheet) == empty

    # Plain appends keep insertion order. json_input.py relies on .append():
    # https://github.com/OpenDataServices/flatten-tool/blob/master/flattentool/json_input.py#L33
    sheet.append('a')
    sheet.append('b')
    after_appends = ['a', 'b']
    assert list(sheet) == after_appends

    # An ordinary add_field() lands at the end, just like append().
    sheet.add_field('c')
    after_plain_field = ['a', 'b', 'c']
    assert list(sheet) == after_plain_field

    # An id field is listed first, ahead of everything added so far.
    sheet.add_field('d', id_field=True)
    after_id_field = ['d', 'a', 'b', 'c']
    assert list(sheet) == after_id_field
Exemplo n.º 3
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    Results are accumulated on the instance while :meth:`parse` runs:

    * ``main_sheet``: ``Sheet`` receiving the top-level headings (field
      paths, or titles when ``use_titles`` is set).
    * ``sub_sheets``: sub-sheet name -> ``Sheet``, one per property that is
      an array of objects.
    * ``flattened``: field path -> type label (``"object"``, ``"array"``,
      ``"string_array"``, ``"number_array"``, ``"array_array"``,
      ``"string"``, ``"number"``, ``"integer"`` or ``"boolean"``).
    * ``rollup``: full paths of sub-sheet fields that are also rolled up
      into the parent sheet.
    """

    def __init__(self,
                 schema_filename=None,
                 root_schema_dict=None,
                 rollup=False,
                 root_id=None,
                 use_titles=False,
                 disable_local_refs=False,
                 truncation_length=3,
                 exclude_deprecated_fields=False):
        """Load the root schema and initialise empty result containers.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        given.

        :param schema_filename: path of a schema file, or an ``http://`` /
            ``https://`` URL to download the schema from.
        :param root_schema_dict: an already loaded schema, used as is.
        :param rollup: when true, honour ``rollUp`` keys found on array
            properties.
        :param root_id: passed to every sub-sheet ``Sheet`` as ``root_id``.
        :param use_titles: build headings from schema titles rather than
            from field paths.
        :param disable_local_refs: load a local schema file with the
            ``JsonLoaderLocalRefsDisabled`` loader instead of resolving
            references relative to the file's own URI.
        :param truncation_length: forwarded to ``make_sub_sheet_name``.
        :param exclude_deprecated_fields: skip properties whose schema has
            a truthy ``deprecated`` value.
        :raises ValueError: if neither or both schema sources are supplied.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        # Not populated anywhere in this class; kept so that existing
        # attribute access keeps working.
        self.sub_sheet_mapping = {}
        self.do_rollup = rollup
        self.rollup = set()
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                'One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                'Only one of schema_filename or root_schema_dict should be supplied'
            )
        if schema_filename:
            # Match the URL scheme explicitly so that a local file whose
            # name merely begins with "http" is still read from disk.
            if schema_filename.startswith(('http://', 'https://')):
                # Imported lazily: only needed for remote schemas.
                import requests
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            loader=JsonLoaderLocalRefsDisabled())
                else:
                    # Build a file: URI for the schema so that jsonref has
                    # a base to resolve references against.
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(
                            os.path.realpath(schema_filename)).as_uri()
                    else:
                        # Python 2 fallback.
                        base_uri = urlparse.urljoin(
                            'file:',
                            urllib.pathname2url(
                                os.path.abspath(schema_filename)))
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            base_uri=base_uri)

        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Walk the root schema and fill ``main_sheet``.

        Consuming the :meth:`parse_schema_dict` generator also fills
        ``sub_sheets``, ``flattened``, ``rollup`` and ``title_lookup``.
        With ``use_titles`` set, fields without a title are skipped with a
        warning.
        """
        fields = self.parse_schema_dict('', self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn('Field {} does not have a title, skipping.'.format(
                        field))
                else:
                    self.main_sheet.append(title)
                    self.main_sheet.titles[field] = title
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(self,
                          parent_path,
                          schema_dict,
                          parent_id_fields=None,
                          title_lookup=None,
                          parent_title=''):
        """Yield ``(field, title)`` pairs for the columns of ``schema_dict``.

        This is a generator: the side effects on ``flattened``,
        ``sub_sheets``, ``rollup`` and the title lookup only happen as it
        is consumed.

        :param parent_path: path of ``schema_dict`` from the root, without
            a trailing slash (``''`` for the root itself).
        :param schema_dict: the schema fragment to walk.
        :param parent_id_fields: id column names inherited from the
            enclosing objects.
        :param title_lookup: lookup node in which property titles are
            registered; defaults to ``self.title_lookup``.
        :param parent_title: colon-terminated title prefix of the enclosing
            objects, or ``None`` once an ancestor had no title.
        :returns: generator of ``(field, title)`` where ``field`` is
            relative to ``parent_path`` and ``title`` may be ``None``.
        :raises ValueError: for array properties whose item types are not
            supported.
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        if 'type' in schema_dict and schema_dict['type'] == 'array' \
                and 'items' in schema_dict and 'oneOf' in schema_dict['items']:
            # Array whose items are a oneOf: emit the fields of every
            # object alternative under the same parent path.
            for oneOf in schema_dict['items']['oneOf']:
                if 'type' in oneOf and oneOf['type'] == 'object':
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title):
                        yield (field, child_title)

        elif 'properties' in schema_dict:
            # An object with an "id" property contributes an id column
            # that is repeated on every sub-sheet created beneath it.
            if 'id' in schema_dict['properties']:
                if self.use_titles:
                    id_fields = parent_id_fields + [
                        (parent_title
                         if parent_title is not None else parent_path) +
                        (schema_dict['properties']['id'].get('title') or 'id')
                    ]
                else:
                    id_fields = parent_id_fields + [parent_path + 'id']
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict[
                    'properties'].items():
                if self.exclude_deprecated_fields and property_schema_dict.get(
                        'deprecated'):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                title = property_schema_dict.get('title')
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if 'object' in property_type_set:
                    self.flattened[parent_path + property_name] = "object"
                    # Recurse; child fields are re-yielded prefixed with
                    # this property's name (and title, when both exist).
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=(parent_title + title + ':')
                            if parent_title is not None and title else None):
                        yield (
                            property_name + '/' + field,
                            # TODO ambiguous use of "title"
                            (title + ':' +
                             child_title if title and child_title else None))

                elif 'array' in property_type_set:
                    # '/0/' index segments are dropped from the key used
                    # in self.flattened.
                    flattened_key = parent_path.replace('/0/',
                                                        '/') + property_name
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict['items'])
                    if 'string' in type_set or not type_set:
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif 'number' in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif 'array' in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(
                            property_schema_dict['items']['items'])
                        if 'string' in nested_type_set or 'number' in nested_type_set:
                            yield property_name, title
                        else:
                            # Say what was rejected instead of raising a
                            # bare ValueError.
                            raise ValueError(
                                'Unsupported item types {} in nested array for '
                                'property "{}{}", only "string" and "number" '
                                'are supported.'.format(
                                    repr(nested_type_set), parent_path,
                                    property_name))
                    elif 'object' in type_set:
                        # Array of objects: its fields go to a sub-sheet.
                        if title:
                            title_lookup[title].property_name = property_name

                        sub_sheet_name = make_sub_sheet_name(
                            parent_path,
                            property_name,
                            truncation_length=self.truncation_length)

                        # The same sheet is reused if the name was already
                        # seen.
                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(
                                field)] = field
                        fields = self.parse_schema_dict(
                            parent_path + property_name + '/0',
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=(parent_title + title + ':')
                            if parent_title is not None and title else None)

                        rollup_fields = set()
                        for field, child_title in fields:
                            full_path = parent_path + property_name + '/0/' + field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn(
                                        'Field {}{}/0/{} is missing a title, skipping.'
                                        .format(parent_path, property_name,
                                                field))
                                elif not title:
                                    warn(
                                        'Field {}{} does not have a title, skipping it and all its children.'
                                        .format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = parent_title + title + ':' + child_title
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            # Rolled-up fields are additionally yielded so
                            # the caller adds them to the parent sheet.
                            if self.do_rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                    'rollUp']:
                                rollup_fields.add(field)
                                self.rollup.add(full_path)
                                yield property_name + '/0/' + field, (
                                    title + ':' + child_title
                                    if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.do_rollup and 'rollUp' in property_schema_dict:
                            missedRollUp = set(
                                property_schema_dict['rollUp']) - rollup_fields
                            if missedRollUp:
                                # Sorted so the warning text is stable.
                                warn('{} in rollUp but not in schema'.format(
                                    ', '.join(sorted(missedRollUp))))

                    else:
                        raise ValueError(
                            'Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'
                            .format(type_set))
                elif 'string' in property_type_set or not property_type_set:
                    # A property with no declared type is handled as a
                    # string.
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "string"
                    yield property_name, title
                elif 'number' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "number"
                    yield property_name, title
                elif 'integer' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "integer"
                    yield property_name, title
                elif 'boolean' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "boolean"
                    yield property_name, title
                else:
                    warn(
                        'Unrecognised types {} for property "{}" with context "{}",'
                        'so this property has been ignored.'.format(
                            repr(property_type_set), property_name,
                            parent_path))

        else:
            warn('Skipping field "{}", because it has no properties.'.format(
                parent_path))
Exemplo n.º 4
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    Results are accumulated on the instance while :meth:`parse` runs:

    * ``main_sheet``: ``Sheet`` receiving the top-level headings (field
      paths, or titles when ``use_titles`` is set).
    * ``sub_sheets``: sub-sheet name -> ``Sheet``, one per property that is
      an array of objects.
    * ``sub_sheet_titles``: ``(parent_path, property_name)`` -> title-based
      sub-sheet name, recorded only when titles are in use.
    * ``flattened``: field path -> type label (``"object"``, ``"array"``,
      ``"string_array"``, ``"number_array"``, ``"array_array"``,
      ``"date"``, ``"string"``, ``"number"``, ``"integer"`` or
      ``"boolean"``).
    * ``rollup``: full paths of sub-sheet fields that are also rolled up
      into the parent sheet.
    """

    def __init__(
        self,
        schema_filename=None,
        root_schema_dict=None,
        rollup=False,
        root_id=None,
        use_titles=False,
        disable_local_refs=False,
        truncation_length=3,
        exclude_deprecated_fields=False,
    ):
        """Load the root schema and initialise empty result containers.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        given.

        :param schema_filename: path of a schema file, or an ``http://`` /
            ``https://`` URL to download the schema from.
        :param root_schema_dict: an already loaded schema, used as is.
        :param rollup: when true, honour ``rollUp`` keys found on array
            properties.
        :param root_id: passed to every sub-sheet ``Sheet`` as ``root_id``.
        :param use_titles: build headings and sub-sheet names from schema
            titles rather than from field paths.
        :param disable_local_refs: load a local schema file with the
            ``JsonLoaderLocalRefsDisabled`` loader instead of resolving
            references relative to the file's own URI.
        :param truncation_length: forwarded to ``make_sub_sheet_name``.
        :param exclude_deprecated_fields: skip properties marked with a
            truthy ``deprecated`` value, either directly or on their
            ``__reference__``.
        :raises ValueError: if neither or both schema sources are supplied.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        # Not populated anywhere in this class; kept so that existing
        # attribute access keeps working.
        self.sub_sheet_mapping = {}
        self.do_rollup = rollup
        self.rollup = set()
        self.root_id = root_id
        self.use_titles = use_titles
        self.sub_sheet_titles = {}
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                _("One of schema_filename or root_schema_dict must be supplied"
                  ))
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                _("Only one of schema_filename or root_schema_dict should be supplied"
                  ))
        if schema_filename:
            # Match the URL scheme explicitly so that a local file whose
            # name merely begins with "http" is still read from disk.
            if schema_filename.startswith(("http://", "https://")):
                # Imported lazily: only needed for remote schemas.
                import requests

                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            loader=JsonLoaderLocalRefsDisabled(),
                        )
                else:
                    # Build a file: URI for the schema so that jsonref has
                    # a base to resolve references against.
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(
                            os.path.realpath(schema_filename)).as_uri()
                    else:
                        # Python 2 fallback.
                        base_uri = urlparse.urljoin(
                            "file:",
                            urllib.pathname2url(
                                os.path.abspath(schema_filename)),
                        )
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            base_uri=base_uri,
                        )

        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Walk the root schema and fill ``main_sheet``.

        Consuming the :meth:`parse_schema_dict` generator also fills
        ``sub_sheets``, ``sub_sheet_titles``, ``flattened``, ``rollup`` and
        ``title_lookup``. With ``use_titles`` set, fields without a title
        are skipped with a warning.
        """
        fields = self.parse_schema_dict("", self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn(
                        _("Field {} does not have a title, skipping.").format(
                            field))
                else:
                    self.main_sheet.append(title)
                    self.main_sheet.titles[field] = title
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(
        self,
        parent_path,
        schema_dict,
        parent_id_fields=None,
        title_lookup=None,
        parent_title="",
    ):
        """Yield ``(field, title)`` pairs for the columns of ``schema_dict``.

        This is a generator: the side effects on ``flattened``,
        ``sub_sheets``, ``sub_sheet_titles``, ``rollup`` and the title
        lookup only happen as it is consumed.

        :param parent_path: path of ``schema_dict`` from the root, without
            a trailing slash (``""`` for the root itself).
        :param schema_dict: the schema fragment to walk.
        :param parent_id_fields: id column names inherited from the
            enclosing objects.
        :param title_lookup: lookup node in which property titles are
            registered; defaults to ``self.title_lookup``.
        :param parent_title: colon-terminated title prefix of the enclosing
            objects, or ``None`` once an ancestor had no title.
        :returns: generator of ``(field, title)`` where ``field`` is
            relative to ``parent_path`` and ``title`` may be ``None``.
        :raises ValueError: for array properties whose item types are not
            supported.
        """
        if parent_path:
            parent_path = parent_path + "/"
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        if ("type" in schema_dict and schema_dict["type"] == "array"
                and "items" in schema_dict
                and "oneOf" in schema_dict["items"]):
            # Array whose items are a oneOf: emit the fields of every
            # object alternative under the same parent path.
            for oneOf in schema_dict["items"]["oneOf"]:
                if "type" in oneOf and oneOf["type"] == "object":
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title,
                    ):
                        yield (field, child_title)

        elif "properties" in schema_dict:
            # An object with an "id" property contributes an id column
            # that is repeated on every sub-sheet created beneath it.
            if "id" in schema_dict["properties"]:
                if self.use_titles:
                    id_fields = parent_id_fields + [
                        (parent_title
                         if parent_title is not None else parent_path) +
                        (schema_dict["properties"]["id"].get("title") or "id")
                    ]
                else:
                    id_fields = parent_id_fields + [parent_path + "id"]
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict[
                    "properties"].items():
                if self.exclude_deprecated_fields and property_schema_dict.get(
                        "deprecated"):
                    continue

                # The deprecation flag may also sit on the object exposed
                # as __reference__ rather than on the property itself.
                if (self.exclude_deprecated_fields
                        and hasattr(property_schema_dict, "__reference__") and
                        property_schema_dict.__reference__.get("deprecated")):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                # A title found on __reference__ takes precedence over the
                # property's own title.
                if (hasattr(property_schema_dict, "__reference__")
                        and "title" in property_schema_dict.__reference__):
                    title = property_schema_dict.__reference__["title"]
                else:
                    title = property_schema_dict.get("title")
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if "object" in property_type_set:
                    self.flattened[parent_path + property_name] = "object"
                    # Recurse; child fields are re-yielded prefixed with
                    # this property's name (and title, when both exist).
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=(parent_title + title + ":")
                            if parent_title is not None and title else None,
                    ):
                        yield (
                            property_name + "/" + field,
                            # TODO ambiguous use of "title"
                            (title + ":" +
                             child_title if title and child_title else None),
                        )

                elif "array" in property_type_set:
                    # "/0/" index segments are dropped from the key used
                    # in self.flattened.
                    flattened_key = parent_path.replace("/0/",
                                                        "/") + property_name
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict["items"])
                    if "string" in type_set or not type_set:
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif "number" in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif "array" in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(
                            property_schema_dict["items"]["items"])
                        if "string" in nested_type_set or "number" in nested_type_set:
                            yield property_name, title
                        else:
                            # Say what was rejected instead of raising a
                            # bare ValueError.
                            raise ValueError(
                                _('Unsupported item types {} in nested array for '
                                  'property "{}{}", only "string" and "number" '
                                  "are supported.").format(
                                      repr(nested_type_set), parent_path,
                                      property_name))
                    elif "object" in type_set:
                        # Array of objects: its fields go to a sub-sheet.
                        if title:
                            title_lookup[title].property_name = property_name

                        if self.use_titles and parent_title is not None:
                            # Name the sub-sheet after the titles, falling
                            # back to the property name when the property
                            # itself has no title.
                            sub_sheet_name = make_sub_sheet_name(
                                parent_title,
                                title or property_name,
                                truncation_length=self.truncation_length,
                                path_separator=":",
                            )
                            self.sub_sheet_titles[(
                                parent_path,
                                property_name,
                            )] = sub_sheet_name
                        else:
                            sub_sheet_name = make_sub_sheet_name(
                                parent_path,
                                property_name,
                                truncation_length=self.truncation_length,
                            )

                        # The same sheet is reused if the name was already
                        # seen.
                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(
                                field)] = field
                        fields = self.parse_schema_dict(
                            parent_path + property_name + "/0",
                            property_schema_dict["items"],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=(parent_title + title + ":")
                            if parent_title is not None and title else None,
                        )

                        rollup_fields = set()
                        for field, child_title in fields:
                            full_path = parent_path + property_name + "/0/" + field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn(
                                        _("Field {}{}/0/{} is missing a title, skipping."
                                          ).format(parent_path, property_name,
                                                   field))
                                elif not title:
                                    warn(
                                        _("Field {}{} does not have a title, skipping it and all its children."
                                          ).format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = (parent_title + title + ":" +
                                                  child_title)
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            # Rolled-up fields are additionally yielded so
                            # the caller adds them to the parent sheet.
                            if (self.do_rollup
                                    and "rollUp" in property_schema_dict and
                                    field in property_schema_dict["rollUp"]):
                                rollup_fields.add(field)
                                self.rollup.add(full_path)
                                yield property_name + "/0/" + field, (
                                    title + ":" + child_title
                                    if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.do_rollup and "rollUp" in property_schema_dict:
                            missedRollUp = (
                                set(property_schema_dict["rollUp"]) -
                                rollup_fields)
                            if missedRollUp:
                                # Sorted so the warning text is stable.
                                warn("{} in rollUp but not in schema".format(
                                    ", ".join(sorted(missedRollUp))))

                    else:
                        raise ValueError(
                            _('Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'
                              ).format(type_set))
                elif "string" in property_type_set or not property_type_set:
                    # We only check for date here, because its the only format
                    # for which we need to specially transform the input
                    if property_schema_dict.get("format") == "date":
                        self.flattened[parent_path.replace("/0/", "/") +
                                       property_name] = "date"
                    else:
                        self.flattened[parent_path.replace("/0/", "/") +
                                       property_name] = "string"
                    yield property_name, title
                elif "number" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "number"
                    yield property_name, title
                elif "integer" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "integer"
                    yield property_name, title
                elif "boolean" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "boolean"
                    yield property_name, title
                else:
                    warn(
                        _('Unrecognised types {} for property "{}" with context "{}",'
                          "so this property has been ignored.").format(
                              repr(property_type_set), property_name,
                              parent_path))

        else:
            warn(
                _('Skipping field "{}", because it has no properties.').format(
                    parent_path))
Exemplo n.º 5
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    After :meth:`parse` has run:

    * ``main_sheet`` holds the column headings of the root object;
    * ``sub_sheets`` maps a sub-sheet name to the ``Sheet`` built for each
      array of objects found in the schema;
    * ``sub_sheet_mapping`` maps ``parent_name/property_name`` to the name of
      the sub-sheet used for that array;
    * ``flattened`` maps a slash-separated property path to one of the labels
      ``"object"``, ``"array"``, ``"string_array"``, ``"array_array"``,
      ``"string"``, ``"number"``, ``"integer"`` or ``"boolean"``.
    """

    def __init__(self,
                 schema_filename=None,
                 root_schema_dict=None,
                 main_sheet_name='main',
                 rollup=False,
                 root_id='ocid',
                 use_titles=False):
        """Load the schema and initialise empty result containers.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        given.

        :param schema_filename: local path or ``http://`` / ``https://`` URL
            of a JSON schema; it is loaded through ``jsonref``.
        :param root_schema_dict: an already loaded schema, used as is.
        :param main_sheet_name: name used as ``parent_name`` for root fields.
        :param rollup: when true, fields named in a property's ``rollUp``
            list are also yielded to the parent sheet.
        :param root_id: forwarded to every sub-sheet ``Sheet``.
        :param use_titles: use schema titles instead of property names as
            column headings.
        :raises ValueError: if neither or both schema sources are supplied.
        """
        # Validate the arguments first so that a bad call fails before any
        # state is built or any I/O is attempted.
        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                'One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                'Only one of schema_filename or root_schema_dict should be supplied'
            )

        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.main_sheet_name = main_sheet_name
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles
        self.title_lookup = TitleLookup()
        self.flattened = {}

        if schema_filename:
            if self._is_url(schema_filename):
                import requests
                r = requests.get(schema_filename)
                # Do not try to parse an error page as if it were a schema.
                r.raise_for_status()
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                with codecs.open(schema_filename,
                                 encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file, object_pairs_hook=OrderedDict)
        else:
            self.root_schema_dict = root_schema_dict

    @staticmethod
    def _is_url(schema_filename):
        """Return True if ``schema_filename`` is an http(s) URL.

        Only a full scheme prefix counts, so a local path that merely starts
        with the letters "http" is still read from disk.
        """
        return schema_filename.startswith(('http://', 'https://'))

    def parse(self):
        """Walk the root schema and fill ``main_sheet`` with its headings.

        With ``use_titles`` set, fields without a title are skipped with a
        warning; otherwise the field path is appended.
        """
        fields = self.parse_schema_dict(self.main_sheet_name, '',
                                        self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn('Field {} does not have a title, skipping.'.format(
                        field))
                else:
                    self.main_sheet.append(title)
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(self,
                          parent_name,
                          parent_path,
                          schema_dict,
                          parent_id_fields=None,
                          title_lookup=None):
        """Yield ``(field, title)`` pairs for the properties of a schema.

        This is a generator; sub-sheets, ``sub_sheet_mapping`` and
        ``flattened`` are filled in as a side effect while it is consumed.

        :param parent_name: heading-style path of the parent, used to build
            id field names and ``sub_sheet_mapping`` keys.
        :param parent_path: slash-separated path of the parent, used as the
            key prefix in ``flattened``.
        :param schema_dict: the (sub)schema whose ``properties`` are walked.
        :param parent_id_fields: id field names inherited from ancestors.
        :param title_lookup: lookup to register titles in; defaults to the
            parser's root ``title_lookup``.
        :raises ValueError: for array item types that cannot be flattened.
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        if 'properties' not in schema_dict:
            warn('Skipping field "{}", because it has no properties.'.format(
                parent_name))
            return

        if 'id' in schema_dict['properties']:
            id_fields = parent_id_fields + [parent_name + '/id']
        else:
            id_fields = parent_id_fields

        for property_name, property_schema_dict in schema_dict[
                'properties'].items():
            property_type_set = get_property_type_set(property_schema_dict)

            title = property_schema_dict.get('title')
            if title:
                title_lookup[title] = TitleLookup()
                title_lookup[title].property_name = property_name

            if 'object' in property_type_set:
                self.flattened[parent_path + property_name] = "object"
                for field, child_title in self.parse_schema_dict(
                        parent_name + '/' + property_name,
                        parent_path + property_name,
                        property_schema_dict,
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title)):
                    yield (
                        property_name + '/' + field,
                        # TODO ambiguous use of "title"
                        (title + ':' +
                         child_title if title and child_title else None))

            elif 'array' in property_type_set:
                self.flattened[parent_path + property_name] = "array"
                type_set = get_property_type_set(
                    property_schema_dict['items'])
                if 'string' in type_set:
                    self.flattened[parent_path +
                                   property_name] = "string_array"
                    yield property_name + ':array', title
                elif 'array' in type_set:
                    self.flattened[parent_path +
                                   property_name] = "array_array"
                    if 'string' in get_property_type_set(
                            property_schema_dict['items']['items']):
                        yield property_name + ':array', title
                    else:
                        raise ValueError(
                            'Unsupported nested array for property "{}": '
                            'only arrays of arrays of strings are '
                            'supported.'.format(parent_path + property_name))
                elif 'object' in type_set:
                    if title:
                        title_lookup[
                            title].property_name = property_name + '[]'
                    # A schema loaded through jsonref exposes the original
                    # $ref on resolved items; its last path segment names
                    # the sub-sheet so shared definitions share one sheet.
                    if hasattr(property_schema_dict['items'],
                               '__reference__'):
                        sub_sheet_name = property_schema_dict[
                            'items'].__reference__['$ref'].split('/')[-1]
                    else:
                        sub_sheet_name = property_name

                    self.sub_sheet_mapping[parent_name + '/' +
                                           property_name] = sub_sheet_name

                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(
                            root_id=self.root_id, name=sub_sheet_name)
                    sub_sheet = self.sub_sheets[sub_sheet_name]
                    sub_sheet.title_lookup = title_lookup.get(title)

                    for field in id_fields:
                        sub_sheet.add_field(field + ':' + property_name,
                                            id_field=True)
                    fields = self.parse_schema_dict(
                        parent_name + '/' + property_name + '[]',
                        parent_path + property_name,
                        property_schema_dict['items'],
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title))

                    rolledUp = set()

                    for field, child_title in fields:
                        if self.use_titles:
                            if not child_title:
                                warn(
                                    'Field {} does not have a title, skipping.'
                                    .format(field))
                            else:
                                sub_sheet.add_field(child_title)
                        else:
                            sub_sheet.add_field(field)
                        # Rolled-up fields also appear on the parent sheet.
                        if self.rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                'rollUp']:
                            rolledUp.add(field)
                            yield property_name + '[]/' + field, (
                                title + ':' + child_title
                                if title and child_title else None)

                    # Check that all items in rollUp are in the schema
                    if self.rollup and 'rollUp' in property_schema_dict:
                        missedRollUp = set(
                            property_schema_dict['rollUp']) - rolledUp
                        if missedRollUp:
                            warn('{} in rollUp but not in schema'.format(
                                ', '.join(missedRollUp)))
                else:
                    raise ValueError(
                        'Unknown item types {} for array property "{}"; '
                        'did you forget to explicitly set the "type" key '
                        'on "items"?'.format(
                            repr(type_set), parent_path + property_name))
            elif 'string' in property_type_set:
                self.flattened[parent_path + property_name] = "string"
                yield property_name, title
            elif 'number' in property_type_set:
                self.flattened[parent_path + property_name] = "number"
                yield property_name + ':number', title
            elif 'integer' in property_type_set:
                self.flattened[parent_path + property_name] = "integer"
                yield property_name + ':integer', title
            elif 'boolean' in property_type_set:
                self.flattened[parent_path + property_name] = "boolean"
                yield property_name + ':boolean', title
            else:
                warn(
                    'Unrecognised types {} for property "{}" with context "{}",'
                    'so this property has been ignored.'.format(
                        repr(property_type_set), property_name,
                        parent_name))
Exemplo n.º 6
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    After :meth:`parse` has run:

    * ``main_sheet`` holds the column headings of the root object;
    * ``sub_sheets`` maps a sub-sheet name to the ``Sheet`` built for each
      array of objects found in the schema;
    * ``sub_sheet_mapping`` maps ``parent_name/property_name`` to the name of
      the sub-sheet used for that array.

    Each sheet's ``titles`` mapping records ``title -> field`` for every
    field that has a title.
    """

    def __init__(
        self,
        schema_filename=None,
        root_schema_dict=None,
        main_sheet_name="main",
        rollup=False,
        root_id="ocid",
        use_titles=False,
    ):
        """Load the schema and initialise empty result containers.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        given.

        :param schema_filename: local path or ``http://`` / ``https://`` URL
            of a JSON schema; it is loaded through ``jsonref``.
        :param root_schema_dict: an already loaded schema, used as is.
        :param main_sheet_name: name used as ``parent_name`` for root fields.
        :param rollup: when true, fields named in a property's ``rollUp``
            list are also yielded to the parent sheet.
        :param root_id: forwarded to every sub-sheet ``Sheet``.
        :param use_titles: use schema titles instead of property names as
            column headings.
        :raises ValueError: if neither or both schema sources are supplied.
        """
        # Validate the arguments first so that a bad call fails before any
        # state is built or any I/O is attempted.
        if root_schema_dict is None and schema_filename is None:
            raise ValueError("One of schema_filename or root_schema_dict must be supplied")
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError("Only one of schema_filename or root_schema_dict should be supplied")

        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.main_sheet_name = main_sheet_name
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles

        if schema_filename:
            if self._is_url(schema_filename):
                import requests

                r = requests.get(schema_filename)
                # Do not try to parse an error page as if it were a schema.
                r.raise_for_status()
                self.root_schema_dict = jsonref.loads(r.text, object_pairs_hook=OrderedDict)
            else:
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict)
        else:
            self.root_schema_dict = root_schema_dict

    @staticmethod
    def _is_url(schema_filename):
        """Return True if ``schema_filename`` is an http(s) URL.

        Only a full scheme prefix counts, so a local path that merely starts
        with the letters "http" is still read from disk.
        """
        return schema_filename.startswith(("http://", "https://"))

    def parse(self):
        """Walk the root schema and fill ``main_sheet`` with its headings.

        With ``use_titles`` set, fields without a title are skipped with a
        warning; otherwise the field path is appended. Whenever a field has
        a title, ``main_sheet.titles[title]`` is set to the field path.
        """
        fields = self.parse_schema_dict(self.main_sheet_name, self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn("Field {} does not have a title, skipping.".format(field))
                else:
                    self.main_sheet.append(title)
            else:
                self.main_sheet.append(field)
            if title:
                self.main_sheet.titles[title] = field

    def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
        """Yield ``(field, title)`` pairs for the properties of a schema.

        This is a generator; sub-sheets and ``sub_sheet_mapping`` are filled
        in as a side effect while it is consumed.

        :param parent_name: heading-style path of the parent, used to build
            id field names and ``sub_sheet_mapping`` keys.
        :param schema_dict: the (sub)schema whose ``properties`` are walked.
        :param parent_id_fields: id field names inherited from ancestors.
        :raises ValueError: for array item types that cannot be flattened.
        """
        parent_id_fields = parent_id_fields or []

        if "properties" not in schema_dict:
            warn('Skipping field "{}", because it has no properties.'.format(parent_name))
            return

        if "id" in schema_dict["properties"]:
            id_fields = parent_id_fields + [parent_name + "/id"]
        else:
            id_fields = parent_id_fields

        for property_name, property_schema_dict in schema_dict["properties"].items():
            property_type_set = get_property_type_set(property_schema_dict)

            title = property_schema_dict.get("title")

            if "object" in property_type_set:
                for field, child_title in self.parse_schema_dict(
                    parent_name + "/" + property_name, property_schema_dict, parent_id_fields=id_fields
                ):
                    yield property_name + "/" + field, (
                        title + ":" + child_title if title and child_title else None
                    )  # TODO ambiguous use of "title"

            elif "array" in property_type_set:
                type_set = get_property_type_set(property_schema_dict["items"])
                if "string" in type_set:
                    yield property_name + ":array", title
                elif "array" in type_set:
                    if "string" in get_property_type_set(property_schema_dict["items"]["items"]):
                        yield property_name + ":array", title
                    else:
                        raise ValueError(
                            'Unsupported nested array for property "{}" with context "{}": '
                            "only arrays of arrays of strings are supported.".format(property_name, parent_name)
                        )
                elif "object" in type_set:
                    # A schema loaded through jsonref exposes the original
                    # $ref on resolved items; its last path segment names
                    # the sub-sheet so shared definitions share one sheet.
                    if hasattr(property_schema_dict["items"], "__reference__"):
                        sub_sheet_name = property_schema_dict["items"].__reference__["$ref"].split("/")[-1]
                    else:
                        sub_sheet_name = property_name

                    self.sub_sheet_mapping[parent_name + "/" + property_name] = sub_sheet_name

                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(root_id=self.root_id, name=sub_sheet_name)
                    sub_sheet = self.sub_sheets[sub_sheet_name]

                    for field in id_fields:
                        sub_sheet.add_field(field + ":" + property_name, id_field=True)
                    fields = self.parse_schema_dict(
                        parent_name + "/" + property_name + "[]",
                        property_schema_dict["items"],
                        parent_id_fields=id_fields,
                    )

                    rolledUp = set()

                    for field, child_title in fields:
                        if self.use_titles:
                            if not child_title:
                                warn("Field {} does not have a title, skipping.".format(field))
                            else:
                                sub_sheet.add_field(child_title)
                        else:
                            sub_sheet.add_field(field)
                        if child_title:
                            sub_sheet.titles[child_title] = field
                        # Rolled-up fields also appear on the parent sheet.
                        if (
                            self.rollup
                            and "rollUp" in property_schema_dict
                            and field in property_schema_dict["rollUp"]
                        ):
                            rolledUp.add(field)
                            yield property_name + "[]/" + field, (
                                title + ":" + child_title if title and child_title else None
                            )

                    # Check that all items in rollUp are in the schema
                    if self.rollup and "rollUp" in property_schema_dict:
                        missedRollUp = set(property_schema_dict["rollUp"]) - rolledUp
                        if missedRollUp:
                            warn("{} in rollUp but not in schema".format(", ".join(missedRollUp)))
                else:
                    raise ValueError(
                        'Unknown item types {} for array property "{}" with context "{}"; '
                        'did you forget to explicitly set the "type" key on "items"?'.format(
                            repr(type_set), property_name, parent_name
                        )
                    )
            elif "string" in property_type_set:
                yield property_name, title
            elif "number" in property_type_set:
                yield property_name + ":number", title
            elif "integer" in property_type_set:
                yield property_name + ":integer", title
            elif "boolean" in property_type_set:
                yield property_name + ":boolean", title
            else:
                warn(
                    'Unrecognised types {} for property "{}" with context "{}",'
                    "so this property has been ignored.".format(repr(property_type_set), property_name, parent_name)
                )
Exemplo n.º 7
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    After :meth:`parse` has run:

    * ``main_sheet`` holds the column headings of the root object;
    * ``sub_sheets`` maps a sub-sheet name to the ``Sheet`` built for each
      array of objects found in the schema;
    * ``flattened`` maps a slash-separated property path to one of the labels
      ``"object"``, ``"array"``, ``"string_array"``, ``"number_array"``,
      ``"array_array"``, ``"string"``, ``"number"``, ``"integer"`` or
      ``"boolean"``.
    """

    def __init__(self, schema_filename=None, root_schema_dict=None, rollup=False, root_id=None, use_titles=False,
                 disable_local_refs=False, truncation_length=3, exclude_deprecated_fields=False):
        """Load the schema and initialise empty result containers.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        given.

        :param schema_filename: local path or ``http://`` / ``https://`` URL
            of a JSON schema; it is loaded through ``jsonref``.
        :param root_schema_dict: an already loaded schema, used as is.
        :param rollup: when true, fields named in a property's ``rollUp``
            list are also yielded to the parent sheet.
        :param root_id: forwarded to every sub-sheet ``Sheet``.
        :param use_titles: use schema titles instead of property paths as
            column headings.
        :param disable_local_refs: load a local schema file with
            ``JsonLoaderLocalRefsDisabled`` instead of a ``file:`` base URI.
        :param truncation_length: forwarded to ``make_sub_sheet_name``.
        :param exclude_deprecated_fields: skip properties whose schema has a
            truthy ``deprecated`` key.
        :raises ValueError: if neither or both schema sources are supplied.
        """
        # Validate the arguments first so that a bad call fails before any
        # state is built or any I/O is attempted.
        if root_schema_dict is None and schema_filename is None:
            raise ValueError('One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError('Only one of schema_filename or root_schema_dict should be supplied')

        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if schema_filename:
            if self._is_url(schema_filename):
                import requests
                r = requests.get(schema_filename)
                # Do not try to parse an error page as if it were a schema.
                r.raise_for_status()
                self.root_schema_dict = jsonref.loads(r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict,
                                                             loader=JsonLoaderLocalRefsDisabled())
                else:
                    # Give jsonref a file: base URI for the schema file so
                    # references can be resolved relative to it.
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(os.path.realpath(schema_filename)).as_uri()
                    else:
                        base_uri = urlparse.urljoin('file:', urllib.pathname2url(os.path.abspath(schema_filename)))
                    with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict,
                                                             base_uri=base_uri)
        else:
            self.root_schema_dict = root_schema_dict

    @staticmethod
    def _is_url(schema_filename):
        """Return True if ``schema_filename`` is an http(s) URL.

        Only a full scheme prefix counts, so a local path that merely starts
        with the letters "http" is still read from disk.
        """
        return schema_filename.startswith(('http://', 'https://'))

    def parse(self):
        """Walk the root schema and fill ``main_sheet`` with its headings.

        With ``use_titles`` set, fields without a title are skipped with a
        warning and titled fields are recorded in ``main_sheet.titles`` as
        ``field -> title``; otherwise the field path is appended.
        """
        fields = self.parse_schema_dict('', self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn('Field {} does not have a title, skipping.'.format(field))
                else:
                    self.main_sheet.append(title)
                    self.main_sheet.titles[field] = title
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(self, parent_path, schema_dict, parent_id_fields=None, title_lookup=None, parent_title=''):
        """Yield ``(field, title)`` pairs for the properties of a schema.

        This is a generator; sub-sheets and ``flattened`` are filled in as a
        side effect while it is consumed.

        :param parent_path: slash-separated path of the parent; array items
            are addressed with a ``/0`` segment.
        :param schema_dict: the (sub)schema to walk.
        :param parent_id_fields: id field names inherited from ancestors.
        :param title_lookup: lookup to register titles in; defaults to the
            parser's root ``title_lookup``.
        :param parent_title: colon-terminated title prefix of the parent, or
            ``None`` once any ancestor lacked a title.
        :raises ValueError: for array item types that cannot be flattened.
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        # An array whose items are a oneOf: walk every object alternative
        # and pass its fields through unchanged.
        if 'type' in schema_dict and schema_dict['type'] == 'array' \
                and 'items' in schema_dict and 'oneOf' in schema_dict['items']:
            for oneOf in schema_dict['items']['oneOf']:
                if 'type' in oneOf and oneOf['type'] == 'object':
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title):
                        yield (
                            field,
                            child_title
                        )
            return

        if 'properties' not in schema_dict:
            warn('Skipping field "{}", because it has no properties.'.format(parent_path))
            return

        if 'id' in schema_dict['properties']:
            if self.use_titles:
                id_prefix = parent_title if parent_title is not None else parent_path
                id_title = schema_dict['properties']['id'].get('title') or 'id'
                id_fields = parent_id_fields + [id_prefix + id_title]
            else:
                id_fields = parent_id_fields + [parent_path+'id']
        else:
            id_fields = parent_id_fields

        for property_name, property_schema_dict in schema_dict['properties'].items():
            if self.exclude_deprecated_fields and property_schema_dict.get('deprecated'):
                continue

            property_type_set = get_property_type_set(property_schema_dict)

            title = property_schema_dict.get('title')
            if title:
                title_lookup[title] = TitleLookup()
                title_lookup[title].property_name = property_name

            if 'object' in property_type_set:
                self.flattened[parent_path+property_name] = "object"
                for field, child_title in self.parse_schema_dict(
                        parent_path+property_name,
                        property_schema_dict,
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title),
                        parent_title=parent_title+title+':' if parent_title is not None and title else None):
                    yield (
                        property_name+'/'+field,
                        # TODO ambiguous use of "title"
                        (title+':'+child_title if title and child_title else None)
                    )

            elif 'array' in property_type_set:
                # Keys in ``flattened`` drop the '/0/' array-index segments.
                flattened_key = parent_path.replace('/0/', '/')+property_name
                self.flattened[flattened_key] = "array"
                type_set = get_property_type_set(property_schema_dict['items'])
                if 'string' in type_set or not type_set:
                    self.flattened[flattened_key] = "string_array"
                    yield property_name, title
                elif 'number' in type_set:
                    self.flattened[flattened_key] = "number_array"
                    yield property_name, title
                elif 'array' in type_set:
                    self.flattened[flattened_key] = "array_array"
                    nested_type_set = get_property_type_set(property_schema_dict['items']['items'])
                    if 'string' in nested_type_set or 'number' in nested_type_set:
                        yield property_name, title
                    else:
                        raise ValueError(
                            'Unsupported nested array item types {} for property "{}": only arrays of '
                            'arrays of strings or numbers are supported.'.format(
                                repr(nested_type_set), parent_path+property_name))
                elif 'object' in type_set:
                    if title:
                        title_lookup[title].property_name = property_name

                    sub_sheet_name = make_sub_sheet_name(parent_path, property_name,
                                                         truncation_length=self.truncation_length)
                    #self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name

                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(root_id=self.root_id, name=sub_sheet_name)
                    sub_sheet = self.sub_sheets[sub_sheet_name]
                    sub_sheet.title_lookup = title_lookup.get(title)

                    for field in id_fields:
                        sub_sheet.add_field(field, id_field=True)
                        sub_sheet.titles[title_lookup.lookup_header(field)] = field
                    fields = self.parse_schema_dict(
                            parent_path+property_name+'/0',
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title+title+':' if parent_title is not None and title else None)
                    rolledUp = set()

                    for field, child_title in fields:
                        full_path = parent_path+property_name+'/0/'+field
                        if self.use_titles:
                            if not child_title or parent_title is None:
                                warn('Field {}{}/0/{} is missing a title, skipping.'.format(parent_path, property_name, field))
                            elif not title:
                                warn('Field {}{} does not have a title, skipping it and all its children.'.format(parent_path, property_name))
                            else:
                                # This code only works for arrays that are at 0 or 1 layer of nesting
                                full_title = parent_title+title+':'+child_title
                                sub_sheet.add_field(full_title)
                                sub_sheet.titles[full_path] = full_title
                        else:
                            sub_sheet.add_field(full_path)
                        # Rolled-up fields also appear on the parent sheet.
                        if self.rollup and 'rollUp' in property_schema_dict and field in property_schema_dict['rollUp']:
                            rolledUp.add(field)
                            yield property_name+'/0/'+field, (title+':'+child_title if title and child_title else None)

                    # Check that all items in rollUp are in the schema
                    if self.rollup and 'rollUp' in property_schema_dict:
                        missedRollUp = set(property_schema_dict['rollUp']) - rolledUp
                        if missedRollUp:
                            warn('{} in rollUp but not in schema'.format(', '.join(missedRollUp)))
                else:
                    raise ValueError('Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'.format(type_set))
            elif 'string' in property_type_set or not property_type_set:
                self.flattened[parent_path.replace('/0/', '/')+property_name] = "string"
                yield property_name, title
            elif 'number' in property_type_set:
                self.flattened[parent_path.replace('/0/', '/')+property_name] = "number"
                yield property_name, title
            elif 'integer' in property_type_set:
                self.flattened[parent_path.replace('/0/', '/')+property_name] = "integer"
                yield property_name, title
            elif 'boolean' in property_type_set:
                self.flattened[parent_path.replace('/0/', '/')+property_name] = "boolean"
                yield property_name, title
            else:
                warn('Unrecognised types {} for property "{}" with context "{}",'
                     'so this property has been ignored.'.format(
                         repr(property_type_set),
                         property_name,
                         parent_path))