示例#1
0
def read_xml(xml_file, graph=None):
    """
    Parse hierarchical XML data structure to a graph

    Uses the ElementTree parser available in this module as `et` to parse
    the XML document and convert the elements into nodes.
    The XML element tag becomes the node key, stripped XML text (if any)
    is stored under the graph `value_tag` and XML attributes are added to
    the node as additional attributes.

    If the graph is empty before import, the XML root element becomes the
    graph root. Otherwise the XML root element is added as a new node and
    the XML hierarchy is built below that node.

    :param xml_file:       XML data to parse
    :type xml_file:        File, string, stream or URL
    :param graph:          Graph object to import dictionary data in
    :type graph:           :graphit:GraphAxis

    :return:               GraphAxis object, None if the XML cannot be parsed
    :rtype:                :graphit:GraphAxis

    :raises:               TypeError if `graph` is not a GraphAxis instance
    """

    # User defined or default GraphAxis object
    if graph is None:
        graph = GraphAxis()
    if not isinstance(graph, GraphAxis):
        raise TypeError('Unsupported graph type {0}'.format(type(graph)))

    # Read the full document first so the handle is always released,
    # also when reading or parsing fails.
    xml_file = open_anything(xml_file)
    try:
        xml_data = xml_file.read()
    finally:
        xml_file.close()

    # Try parsing the string using the ElementTree parser
    try:
        tree = et.fromstring(xml_data)
    except et.ParseError as error:
        logging.error(
            'Unable to parse XML file. cElementTree error: {0}'.format(error))
        return

    def element_attributes(element):
        """Return node attributes for an element: XML attributes plus text"""

        # Copy so the parsed element tree itself is left untouched
        attributes = dict(element.attrib)
        text = element.text.strip() if element.text else ''
        if text:
            attributes[graph.value_tag] = text

        return attributes

    def walk_element_tree(element, parent=None):
        """Recursively add the children of `element` below node `parent`"""

        for child in element:
            nid = graph.add_node(child.tag, **element_attributes(child))
            graph.add_edge(parent, nid)

            walk_element_tree(child, parent=nid)

    is_empty = graph.empty()

    # Add root element
    rid = graph.add_node(tree.tag, **element_attributes(tree))

    if is_empty:
        graph.root = rid

    # Recursive add XML elements as nodes. Children are attached to the XML
    # root node itself; attaching them to `graph.root` would orphan the XML
    # root node whenever the graph already contained nodes.
    walk_element_tree(tree, parent=rid)

    return graph
示例#2
0
def read_pydata(data, graph=None, parser_classes=None, level=0):
    """
    Parse (hierarchical) python data structures to a graph

    Many data formats are first parsed to a python structure before they are
    converted to a graph using the `read_pydata` function.
    The function supports any object that is an instance of, or behaves as, a
    Python dictionary, list, tuple or set and converts these (nested)
    structures to graph nodes and edges for connectivity. Data is stored in
    nodes using the node and edge 'key_tag' and 'value_tag' attributes in the
    Graph class.

    Data type and format information are also stored as part of the nodes to
    enable reconstruction of the Python data structure on export using the
    `write_pydata` function. Changing type and format on a node or edge
    allows for customized data export.

    Parsing of data structures to nodes and edges is handled by parser classes
    that need to define the methods `deserialize` for reading and `serialize`
    for writing. In `write_pydata` these classes are registered with the ORM
    to fully customize the use of the `serialize` method. In the `read_pydata`
    function the ORM cannot be used because the nodes/edges themselves do not
    yet exist. Instead they are provided as a dictionary through the
    `parser_classes` argument. The dictionary defines the string representation
    of the Python data type as key and parser class as value.

    Parser customization is important as Python data structures can be
    represented as a graph structure in different ways. This is certainly true
    for dictionaries where key/value pairs can be part of the node attributes,
    as separate nodes or as a combination of the two.
    `read_pydata` has quick support for two scenarios using the `level`
    argument:

        * level 0: every dictionary key/value pair is represented as a node
          regardless of its position in the nested data structure
        * level 1: all keys at the same level in the hierarchy that have a
          primitive type value are stored as part of the node attributes.

    If the `graph` is empty, the first node added to the graph is assigned
    as root node. If the `graph` is not empty, new nodes and edges will be
    added to it as subgraph. Edge connections between the two will have to be
    made afterwards.

    :param data:            Python (hierarchical) data structure
    :param graph:           GraphAxis object to import dictionary data in
    :type graph:            :graphit:GraphAxis
    :param parser_classes:  parser class definition for different Python data
                            types. Updates default classes for level 0 or 1
    :type parser_classes:   :py:dict
    :param level:           dictionary parsing mode
    :type level:            :py:int

    :return:                GraphAxis object
    :rtype:                 :graphit:GraphAxis

    :raises:                GraphitException if `graph` is not a GraphAxis,
                            AssertionError if `level` is not 0 or 1
    """

    # User defined or default GraphAxis object
    if graph is None:
        graph = GraphAxis()
    elif not isinstance(graph, GraphAxis):
        raise GraphitException('Unsupported graph type {0}'.format(type(graph)))

    # Validate level with an explicit raise: an `assert` statement is removed
    # when Python runs with -O, which would let an invalid level silently
    # fall through to the level 1 parsers. AssertionError is kept as the
    # exception type so existing callers see the same error.
    if level not in (0, 1):
        raise AssertionError(
            'Unsupported level {0}. Required to be 0 or 1'.format(level))

    # Determine parser classes to use based on level. A shallow copy keeps
    # the module level defaults unchanged when custom classes are merged in.
    parser_class_dict = copy.copy(ORMDEFS_LEVEL0 if level == 0 else ORMDEFS_LEVEL1)

    # Update parser_class_dict with custom classes if any
    if isinstance(parser_classes, dict):
        parser_class_dict.update(parser_classes)

    # Define root: for an empty graph, use the current value of the graph's
    # node ID counter as root (presumably the ID handed out to the first
    # node the parser adds; verify against the graph storage implementation)
    if graph.empty():
        graph.root = graph.data.nodeid

    # Start recursive parsing by calling the `deserialize` method on the
    # parser object registered for the type of `data`, or the fallback parser
    parser = parser_class_dict.get(return_instance_type(data), parser_class_dict['fallback'])
    parser().deserialize(data, graph, parser_class_dict)

    return graph
def read_json_schema(schema, graph=None, exclude_args=None, resolve_ref=True):
    """
    Import hierarchical data structures defined in a JSON schema format

    Every entry in a 'properties' block (and in a 'definitions' block when
    `resolve_ref` is True) becomes a node below the node of the enclosing
    schema. Non-dictionary schema keywords are stored as node attributes,
    a 'default' keyword is additionally stored as the node value.
    The `schema` argument itself is not modified.

    :param schema:            JSON Schema data format to import
    :type schema:             dict, file, string, stream or URL
    :param graph:             graph object to import JSON schema data in
    :type graph:              :graphit:GraphAxis
    :param exclude_args:      JSON schema arguments to exclude from import
    :type exclude_args:       :py:list
    :param resolve_ref:       Parse JSON schema 'definitions'
    :type resolve_ref:        :py:bool

    :return:                  Graph object, None if the JSON cannot be decoded
    :rtype:                   :graphit:GraphAxis

    :raises:                  GraphitException if `graph` is not a GraphAxis
    """

    json_schema = schema
    if not isinstance(schema, dict):

        # Try parsing the string using default Python json parser.
        # The stream is always closed, also when decoding fails.
        schema_file = open_anything(schema)
        try:
            json_schema = json.load(schema_file)
        except (IOError, ValueError) as error:
            logger.error('Unable to decode JSON string: {0}'.format(error))
            return
        finally:
            schema_file.close()

    # User defined or default Graph object
    if graph is None:
        graph = GraphAxis()
    elif not isinstance(graph, GraphAxis):
        raise GraphitException('Unsupported graph type {0}'.format(
            type(graph)))

    if graph.empty():
        rid = graph.add_node('root')
        graph.root = rid

    # Build JSON schema parser ORM with format specific conversion classes
    graph.node_tools = JSONSchemaValidatorDraft07
    graph.orm = JSONSchemaORMDraft07

    # What data-blocks to parse, properties by default, definitions if required
    datablock = ['properties']
    if resolve_ref:
        datablock.append('definitions')

    if exclude_args is None:
        exclude_args = []

    def walk_schema(schema_block, parent=None, required=None):
        """
        Store `schema_block` on node `parent` and recurse into its children

        `required` carries the schema's own list of required child names for
        the case where the 'required' key in `schema_block` was replaced by
        the boolean flag set by the enclosing schema.
        """

        # Get all JSON schema definitions for this data instance
        attributes = {k: v for k, v in schema_block.items()
                      if not isinstance(v, dict) and k not in exclude_args}
        node = graph.getnodes(parent)
        node.update(attributes)

        # Get 'required' attribute
        if required is None:
            required = schema_block.get('required', [])
        if not isinstance(required, list):
            required = []

        # Store default data or None
        if attributes.get('default') is not None:
            node.set(graph.data.value_tag, attributes.get('default'))

        # For all child elements in datablock, make new node
        # and parse using recursive calls to walk_schema
        for block, children in schema_block.items():
            if block not in datablock:
                continue

            for child, attr in children.items():
                nid = graph.add_node(child)

                # Annotate a shallow copy so the caller's schema is not
                # modified by the import
                child_schema = dict(attr)

                # Register block_name in child attributes
                child_schema['schema_label'] = block

                # Register 'required' elements
                if child in required:
                    child_schema['required'] = True

                graph.add_edge(parent, nid)

                # Pass the child's own 'required' list along: the boolean
                # flag set above would otherwise hide it and the child's
                # own children would never be marked as required.
                walk_schema(child_schema, parent=nid,
                            required=attr.get('required'))

    walk_schema(json_schema, graph.root)

    # Parse schema meta data
    document_path = ''
    if isinstance(schema, PY_STRING):
        document_path = os.path.abspath(schema)

    root = graph.get_root()
    root.set('document_path', document_path)
    parse_schema_meta_data(root)

    # Resolve JSON Schema $ref
    if resolve_ref:
        resolve_json_ref(graph, exclude_args=exclude_args)

    return graph
示例#4
0
def read_web(web,
             graph=None,
             orm_data_tag='haddock_type',
             auto_parse_format=True):
    """
    Import hierarchical data structures defined in the Spider .web format

    The data block type identifiers used in the .web format are stored in
    the nodes using the `orm_data_tag` attribute. These can be used by the
    Graph ORM mapper for custom data exchange in the graph.

    Lines ending with '(' open a new object block, lines starting with ')'
    close the current one. Inside a block, 'key = value' lines become leaf
    nodes and lines without '=' are collected as array items that are stored
    as value of the enclosing object when it is closed.

    While parsing, the graph ORM is temporarily replaced by a .web specific
    ORM. The original ORM is restored afterwards, also when parsing fails.

    :param web:               Spider .web data format to import
    :type web:                file, string, stream or URL
    :param graph:             graph object to import .web data in
    :type graph:              :graphit:GraphAxis
    :param orm_data_tag:      data key to use for .web data identifier
    :type orm_data_tag:       :py:str
    :param auto_parse_format: automatically detect basic format types using JSON decoding
    :type auto_parse_format:  :py:bool

    :return:                  Graph object
    :rtype:                   :graphit:GraphAxis

    :raises:                  GraphitException if `graph` is not a GraphAxis,
                              AssertionError if the number of opening and
                              closing object tags differs
    """

    # Read all lines up front so the handle is released no matter what
    # happens during graph validation or parsing
    web_file = open_anything(web)
    try:
        if graph is None:
            graph = GraphAxis()
        elif not isinstance(graph, GraphAxis):
            raise GraphitException('Unsupported graph type {0}'.format(
                type(graph)))

        web_lines = web_file.readlines()
    finally:
        web_file.close()

    # Build .web parser ORM with format specific conversion classes
    weborm = GraphORM()
    weborm.node_mapping.add(
        RestraintsInterface,
        lambda x: x.get(graph.data.key_tag) == 'activereslist')
    weborm.node_mapping.add(
        RestraintsInterface,
        lambda x: x.get(graph.data.key_tag) == 'passivereslist')

    # Set current ORM aside and register parser ORM.
    curr_orm = graph.orm
    graph.orm = weborm

    curr_obj_nid = None
    object_open_tags = 0
    object_close_tags = 0
    array_key_counter = 1
    array_store = []
    try:
        for i, line in enumerate(web_lines):
            line = line.strip()
            if not line:
                continue

            # Detect start of new object definition
            if line.endswith('('):

                # Process data: 'key = type (' or, for array items, 'type ('
                meta_data = [n.strip() for n in line.strip('(').split('=', 1)]
                ddict = {orm_data_tag: meta_data[-1], 'is_array': False}
                if len(meta_data) > 1:
                    node_key = meta_data[0]
                else:
                    # Objects without a key are array items and get a
                    # generated, numbered key
                    node_key = 'item{0}'.format(array_key_counter)
                    ddict['is_array'] = True
                    array_key_counter += 1

                # Clear the array store
                array_store = []

                # First object defines graph root
                if graph.empty():
                    curr_obj_nid = graph.add_node(node_key, **ddict)
                    graph.root = curr_obj_nid

                # Add new object as child of current object
                else:
                    child_obj_nid = graph.add_node(node_key, **ddict)
                    graph.add_edge(curr_obj_nid, child_obj_nid)
                    curr_obj_nid = child_obj_nid

                object_open_tags += 1

            # Detect end of object definition
            elif line.startswith(')'):

                # If there is data in the array store, add it to node
                if array_store:
                    array_node = graph.getnodes(curr_obj_nid)
                    array_node.is_array = True
                    array_node.set(graph.data.value_tag, array_store)

                # Reset array key counter
                array_key_counter = 1

                # Move one level up the object tree
                curr_obj_nid = node_parent(graph, curr_obj_nid,
                                           graph.root) or graph.root
                object_close_tags += 1

            # Parse object parameters
            else:

                # Parse key,value pairs and add as leaf node
                params = [n.strip() for n in line.rstrip(',').split('=', 1)]

                if '=' in line and len(params) == 2:
                    leaf_nid = graph.add_node(params[0])
                    graph.add_edge(curr_obj_nid, leaf_nid)

                    value = params[1]
                    if auto_parse_format:
                        value = json_decode_params(params[1])

                    leaf_node = graph.getnodes(leaf_nid)
                    leaf_node.set(graph.data.value_tag, value)

                # Parse single values as array data
                elif len(params) == 1:

                    value = params[0]
                    if auto_parse_format:
                        value = json_decode_params(params[0])

                    # Collect array items, they are stored on the enclosing
                    # object when its closing tag is reached
                    array_store.append(value)

                # Defensive: a split with maxsplit=1 yields one or two
                # items, so this branch is not expected to be reached
                else:
                    logger.warning(
                        'Unknown .web data formatting on line: {0}, {1}'.
                        format(i, line))

        # Object blocks opening '(' and closing ')' tag count should be
        # balanced
        if object_open_tags != object_close_tags:
            raise AssertionError(
                'Unbalanced object block, something is wrong with the file format')

    finally:
        # Restore original ORM, also when parsing failed, so the caller's
        # graph is never left with the temporary parser ORM
        graph.orm = curr_orm

    # The key of the graph root node is always set to 'project' (for a root
    # object without explicit key this replaces the generated 'item1')
    root = graph.getnodes(graph.root)
    root.key = 'project'

    return graph