def read_xml(xml_file, graph=None):
    """
    Parse hierarchical XML data structure to a graph

    Uses the Python built-in etree ElementTree parser to parse the XML
    document and convert the elements into nodes.
    The XML element tag becomes the node key, XML text becomes the node
    value and XML attributes are added to the node as additional attributes.

    :param xml_file: XML data to parse
    :type xml_file:  File, string, stream or URL
    :param graph:    Graph object to import dictionary data in
    :type graph:     :graphit:Graph

    :return:         GraphAxis object, or None when the XML cannot be parsed
    :rtype:          :graphit:GraphAxis
    :raises TypeError: when `graph` is not a GraphAxis instance
    """

    # User defined or default GraphAxis object
    if graph is None:
        graph = GraphAxis()
    if not isinstance(graph, GraphAxis):
        raise TypeError('Unsupported graph type {0}'.format(type(graph)))

    # Try parsing the string using default Python ElementTree parser
    xml_file = open_anything(xml_file)
    try:
        tree = et.fromstring(xml_file.read())
    except et.ParseError as error:
        # Use the module-level logger for consistency with the other
        # importers in this file (was: root 'logging' module).
        logger.error('Unable to parse XML file. cElementTree error: {0}'.format(error))
        return
    finally:
        # Release the stream opened by open_anything, matching read_web
        # which closes its input stream when done.
        xml_file.close()

    def walk_element_tree(element, parent=None):
        # Recursively convert child XML elements into nodes linked to parent
        for child in element:
            child_data = child.attrib
            if child.text and len(child.text.strip()):
                child_data[graph.value_tag] = child.text.strip()

            nid = graph.add_node(child.tag, **child_data)
            graph.add_edge(parent, nid)
            walk_element_tree(child, parent=nid)

    # Remember whether the graph was empty before adding the root element:
    # only then does the XML root become the graph root.
    is_empty = graph.empty()

    # Add root element
    element_data = tree.attrib
    if tree.text and len(tree.text.strip()):
        element_data[graph.value_tag] = tree.text.strip()
    rid = graph.add_node(tree.tag, **element_data)
    if is_empty:
        graph.root = rid

    # Recursively add XML elements as nodes
    walk_element_tree(tree, parent=graph.root)

    return graph
def read_pydata(data, graph=None, parser_classes=None, level=0):
    """
    Parse (hierarchical) python data structures to a graph

    Many data formats are first parsed to a python structure before they are
    converted to a graph using the `read_pydata` function.
    The function supports any object that is an instance of, or behaves as, a
    Python dictionary, list, tuple or set and converts these (nested)
    structures to graph nodes and edges for connectivity. Data is stored in
    nodes using the node and edge 'key_tag' and 'value_tag' attributes in the
    Graph class.

    Data type and format information are also stored as part of the nodes to
    enable reconstruction of the Python data structure on export using the
    `write_pydata` function. Changing type and format on a node or edge
    allows for customized data export.

    Parsing of data structures to nodes and edges is handled by parser classes
    that need to define the methods `deserialize` for reading and `serialize`
    for writing. In `write_pydata` these classes are registered with the ORM
    to fully customize the use of the `serialize` method. In the `read_pydata`
    function the ORM cannot be used because the nodes/edges themselves do not
    yet exist. Instead they are provided as a dictionary through the
    `parser_classes` argument. The dictionary defines the string
    representation of the Python data type as key and parser class as value.

    Parser customization is important as Python data structures can be
    represented as a graph structure in different ways. This is certainly
    true for dictionaries where key/value pairs can be part of the node
    attributes, as separate nodes or as a combination of the two.
    `read_pydata` has quick support for two scenario's using the `level`
    argument:

    * level 0: every dictionary key/value pair is represented as a node
      regardless of its position in the nested data structure
    * level 1: all keys at the same level in the hierarchy that have a
      primitive type value are stored as part of the node attributes.

    If the `graph` is empty, the first node added to the graph is assigned
    as root node. If the `graph` is not empty, new nodes and edges will be
    added to it as subgraph. Edge connections between the two will have to
    be made afterwards.

    :param data:           Python (hierarchical) data structure
    :param graph:          GraphAxis object to import dictionary data in
    :type graph:           :graphit:GraphAxis
    :param parser_classes: parser class definition for different Python data
                           types. Updates default classes for level 0 or 1
    :type parser_classes:  :py:dict
    :param level:          dictionary parsing mode
    :type level:           :py:int

    :return:               GraphAxis object
    :rtype:                :graphit:GraphAxis
    :raises GraphitException: when `graph` is of an unsupported type or
                              `level` is not 0 or 1
    """

    # User defined or default GraphAxis object
    if graph is None:
        graph = GraphAxis()
    elif not isinstance(graph, GraphAxis):
        raise GraphitException('Unsupported graph type {0}'.format(type(graph)))

    # Determine parser classes to use based on level.
    # Explicit raise instead of `assert`: assertions are stripped when Python
    # runs with -O, and the original assert never actually raised the
    # GraphitException (it was only used as the assertion message).
    if level not in (0, 1):
        raise GraphitException('Unsupported level {0}. Required to be 0 or 1'.format(level))
    if level == 0:
        parser_class_dict = copy.copy(ORMDEFS_LEVEL0)
    else:
        parser_class_dict = copy.copy(ORMDEFS_LEVEL1)

    # Update parser_class_dict with custom classes if any
    if isinstance(parser_classes, dict):
        parser_class_dict.update(parser_classes)

    # Define root
    if graph.empty():
        graph.root = graph.data.nodeid

    # Start recursive parsing by calling the `deserialize` method on the
    # parser object matching the top-level data type ('fallback' otherwise)
    parser = parser_class_dict.get(return_instance_type(data), parser_class_dict['fallback'])
    p = parser()
    p.deserialize(data, graph, parser_class_dict)

    return graph
def read_json_schema(schema, graph=None, exclude_args=None, resolve_ref=True):
    """
    Import hierarchical data structures defined in a JSON schema format

    :param schema:       JSON Schema data format to import
    :type schema:        dict, file, string, stream or URL
    :param graph:        graph object to import the JSON schema data in
    :type graph:         :graphit:Graph
    :param exclude_args: JSON schema arguments to exclude from import
    :type exclude_args:  :py:list
    :param resolve_ref:  Parse JSON schema 'definitions'
    :type resolve_ref:   :py:bool

    :return:             Graph object, or None when the JSON cannot be decoded
    :rtype:              :graphit:Graph
    :raises GraphitException: when `graph` is of an unsupported type
    """

    json_schema = schema
    if not isinstance(schema, dict):

        # Try parsing the string using default Python json parser
        json_schema = open_anything(schema)
        try:
            json_schema = json.load(json_schema)
        except (IOError, ValueError) as error:
            logger.error('Unable to decode JSON string: {0}'.format(error))
            return

    # User defined or default Graph object
    if graph is None:
        graph = GraphAxis()
    elif not isinstance(graph, GraphAxis):
        raise GraphitException('Unsupported graph type {0}'.format(type(graph)))

    if graph.empty():
        rid = graph.add_node('root')
        graph.root = rid

    # Build JSON schema parser ORM with format specific conversion classes
    graph.node_tools = JSONSchemaValidatorDraft07
    graph.orm = JSONSchemaORMDraft07

    # What data-blocks to parse, properties by default, definitions if required
    datablock = ['properties']
    if resolve_ref:
        datablock.append('definitions')

    if exclude_args is None:
        exclude_args = []

    def walk_schema(schema_block, parent=None):
        # Get all non-nested (non-dict) JSON schema arguments for this data
        # instance and store them as node attributes
        attributes = {k: v for k, v in schema_block.items()
                      if not isinstance(v, dict) and k not in exclude_args}
        node = graph.getnodes(parent)
        node.update(attributes)

        # Get 'required' attribute; ignore it unless it is a list
        required = schema_block.get('required', [])
        if not isinstance(required, list):
            required = []

        # Store default data or None
        if attributes.get('default') is not None:
            node.set(graph.data.value_tag, attributes.get('default'))

        # For all child elements in the datablock, make a new node and parse
        # recursively
        for block in schema_block.keys():
            if block in datablock:
                for child, attr in schema_block[block].items():

                    nid = graph.add_node(child)

                    # Register block_name in child attributes
                    attr['schema_label'] = block

                    # Register 'required' elements
                    if child in required:
                        attr['required'] = True

                    graph.add_edge(parent, nid)
                    walk_schema(attr, parent=nid)

    walk_schema(json_schema, graph.root)

    # Parse schema meta data; the document path is only known when the
    # schema was provided as a path string
    document_path = ''
    if isinstance(schema, PY_STRING):
        document_path = os.path.abspath(schema)

    root = graph.get_root()
    root.set('document_path', document_path)
    parse_schema_meta_data(root)

    # Resolve JSON Schema $ref
    if resolve_ref:
        resolve_json_ref(graph, exclude_args=exclude_args)

    return graph
def read_web(web, graph=None, orm_data_tag='haddock_type', auto_parse_format=True):
    """
    Import hierarchical data structures defined in the Spider .web format

    The data block type identifiers used in the .web format are stored in
    the nodes using the `orm_data_tag` attribute. These can be used by the
    Graph ORM mapper for custom data exchange in the graph.

    :param web:               Spider .web data format to import
    :type web:                file, string, stream or URL
    :param graph:             graph object to import the .web data in
    :type graph:              :graphit:Graph
    :param orm_data_tag:      data key to use for .web data identifier
    :type orm_data_tag:       :py:str
    :param auto_parse_format: automatically detect basic format types using
                              JSON decoding
    :type auto_parse_format:  :py:bool

    :return:                  Graph object
    :rtype:                   :graphit:Graph
    :raises GraphitException: when `graph` is of an unsupported type
    :raises AssertionError:   when object '(' and ')' tags are unbalanced
    """

    web_file = open_anything(web)
    if graph is None:
        graph = GraphAxis()
    elif not isinstance(graph, GraphAxis):
        raise GraphitException('Unsupported graph type {0}'.format(type(graph)))

    # Build .web parser ORM with format specific conversion classes
    weborm = GraphORM()
    weborm.node_mapping.add(RestraintsInterface, lambda x: x.get(graph.data.key_tag) == 'activereslist')
    weborm.node_mapping.add(RestraintsInterface, lambda x: x.get(graph.data.key_tag) == 'passivereslist')

    # Set current ORM aside and register parser ORM.
    curr_orm = graph.orm
    graph.orm = weborm

    curr_obj_nid = None
    object_open_tags = 0
    object_close_tags = 0
    array_key_counter = 1
    array_store = []
    try:
        # Iterate the stream directly instead of materializing all lines with
        # readlines(). Count from 1 so the warning below reports human,
        # 1-based line numbers (was 0-based, off by one).
        for i, line in enumerate(web_file, start=1):
            line = line.strip()
            if not len(line):
                continue

            # Detect start of new object definition
            if line.endswith('('):

                # Process data
                meta_data = [n.strip() for n in line.strip('(').split('=', 1)]
                ddict = {orm_data_tag: meta_data[-1], 'is_array': False}
                if len(meta_data) > 1:
                    node_key = meta_data[0]
                else:
                    # Unnamed object: synthesize an array item key
                    node_key = 'item{0}'.format(array_key_counter)
                    ddict['is_array'] = True
                    array_key_counter += 1

                # Clear the array store
                array_store = []

                # First object defines graph root
                if graph.empty():
                    curr_obj_nid = graph.add_node(node_key, **ddict)
                    graph.root = curr_obj_nid

                # Add new object as child of current object
                else:
                    child_obj_nid = graph.add_node(node_key, **ddict)
                    graph.add_edge(curr_obj_nid, child_obj_nid)
                    curr_obj_nid = child_obj_nid

                object_open_tags += 1

            # Detect end of object definition
            elif line.startswith(')'):

                # If there is data in the array store, add it to node
                if len(array_store):
                    array_node = graph.getnodes(curr_obj_nid)
                    array_node.is_array = True
                    array_node.set(graph.data.value_tag, array_store)

                # Reset array key counter
                array_key_counter = 1

                # Move one level up the object tree
                curr_obj_nid = node_parent(graph, curr_obj_nid, graph.root) or graph.root
                object_close_tags += 1

            # Parse object parameters
            else:

                # Parse key,value pairs and add as leaf node
                params = [n.strip() for n in line.rstrip(',').split('=', 1)]
                if '=' in line and len(params) == 2:
                    leaf_nid = graph.add_node(params[0])
                    graph.add_edge(curr_obj_nid, leaf_nid)

                    value = params[1]
                    if auto_parse_format:
                        value = json_decode_params(params[1])

                    leaf_node = graph.getnodes(leaf_nid)
                    leaf_node.set(graph.data.value_tag, value)

                # Parse single values as array data
                elif len(params) == 1:
                    value = params[0]
                    if auto_parse_format:
                        value = json_decode_params(params[0])

                    # Store array items as nodes
                    array_store.append(value)

                else:
                    logger.warning('Unknown .web data formatting on line: {0}, {1}'.format(i, line))
    finally:
        # Always release the input stream, also when parsing raises
        web_file.close()

    # Object blocks opening '(' and closing ')' tag count should be balanced
    if object_open_tags != object_close_tags:
        raise AssertionError('Unbalanced object block, something is wrong with the file format')

    # Restore original ORM
    graph.orm = curr_orm

    # Root is of type 'array', rename key from 'item1' to 'project'
    root = graph.getnodes(graph.root)
    root.key = 'project'

    return graph