def getEHRClasses(patientXML, children=True, parents=True, duplicates=False):
    """Flatten the per-depth results of ``Utilities.getXMLElements`` into one list.

    Args:
        patientXML: XML node handed straight to ``Utilities.getXMLElements``.
        children: Forwarded positionally to ``Utilities.getXMLElements``.
        parents: Forwarded positionally to ``Utilities.getXMLElements``.
        duplicates: Forwarded to ``Utilities.getXMLElements`` and also selects
            how the per-depth results are combined (see Returns).

    Returns:
        When ``duplicates`` is true, the values stored at every depth,
        concatenated in the mapping's iteration order and returned exactly as
        ``Utilities.getXMLElements`` produced them. Otherwise, the ``.tag`` of
        each member of the union of all depths; that union is a set, so the
        order of the returned list is unspecified.
    """
    # A fresh dict is supplied on every call as the accumulator argument.
    elementsByDepth = Utilities.getXMLElements(patientXML, {}, children, parents, duplicates)

    if not duplicates:
        # Combine all values in the dictionary of EHR depths, dropping repeats.
        uniqueElements = set().union(*elementsByDepth.values())
        return [element.tag for element in uniqueElements]

    flattened = []
    for valuesAtDepth in elementsByDepth.values():
        flattened.extend(valuesAtDepth)
    return flattened
def getEHRClassChildren(patientXML, ehrClass, children=True, parents=False, allEHRChildren=False, contextualiseChildren=True, removeGerunds=True):
    """Collect the child names found under examples of ``ehrClass``.

    Every element matching ``.//<ehrClass>`` in ``patientXML`` is passed to
    ``Utilities.getXMLElements``; the entries it reports at depth 0 are treated
    as that example's children.

    Args:
        patientXML: XML node to search; must support ``findall``.
        ehrClass: Tag name of the EHR class whose children are wanted.
        children: Forwarded positionally to ``Utilities.getXMLElements``.
        parents: Forwarded positionally to ``Utilities.getXMLElements``.
        allEHRChildren: When false, only the first matching example is
            inspected; when true, every matching example is.
        contextualiseChildren: When true, a child whose lower-cased tag is in
            ``TranslationConstants.FIELDS_THAT_INDICATE_RESOURCE_CAN_HOLD_ANY_DATA``
            is renamed to a compound of the class name and the child name.
            The rename is written to the element itself, so ``patientXML`` is
            modified in place.
        removeGerunds: When true, the result of
            ``TranslationUtilities.removeGerund`` for each child tag is added
            right after the tag itself.

    Returns:
        A dict that is empty when no child was found, and otherwise holds the
        single key ``ehrClass`` mapped to the list of collected names.
    """
    ehrClassChildren = {}

    for example in patientXML.findall(".//" + ehrClass):
        depthsToChildren = Utilities.getXMLElements(example, {}, children, parents, False, True, True)

        # Only depth 0 is of interest; an example without it contributes nothing.
        for element in depthsToChildren.get(0, ()):
            # Contextualise those EHR children that do not give enough context
            # on their own, because they are just generic children.
            needsContext = (
                contextualiseChildren
                and element.tag.lower() in TranslationConstants.FIELDS_THAT_INDICATE_RESOURCE_CAN_HOLD_ANY_DATA
            )
            if needsContext:
                # Present the compound child (parent + child name) according to
                # the separator used by this EHR; with no separator, the class
                # name's first letter is upper-cased instead.
                if TranslationConstants.SEPARATOR == "":
                    prefix = ehrClass[0].upper() + ehrClass[1:]
                else:
                    prefix = ehrClass + TranslationConstants.SEPARATOR
                element.tag = prefix + element.tag

            # The list is created lazily so that a class without children
            # leaves the result dict empty.
            collected = ehrClassChildren.setdefault(ehrClass, [])
            collected.append(element.tag)

            # A leading gerund (such as 'Managing' in 'ManagingOrganisation')
            # can complicate the context of the word, so the gerund-free
            # version is recorded as an additional child.
            if removeGerunds:
                collected.append(TranslationUtilities.removeGerund(element.tag))

        # A document may hold several examples of the same EHR class; unless
        # all of them were requested, stop after the first one.
        # NOTE(review): this check is placed once per example, outside the
        # depth-0 handling, following the comment's wording — confirm.
        if not allEHRChildren:
            break

    return ehrClassChildren
def ehrClassToExamples(patientXML):
    """Give same-named EHR classes with different children distinct tags.

    For each EHR class element reported by ``Utilities.getXMLElements``
    (visited from the deepest depth down to depth 1; depth 0 is not visited),
    every element in ``patientXML`` with that tag is looked up. Examples
    without children are ignored. Among the rest, the first distinct sorted
    list of child tags keeps the original tag; each later distinct list gets
    the tag plus a numeric suffix (the number of distinct lists recorded so
    far), and examples with the same sorted child-tag list share one tag.

    Args:
        patientXML: XML node to process; must support ``findall`` and
            iteration over child elements.

    Returns:
        The same ``patientXML`` object, with element tags modified in place.
    """
    # First get unique examples of EHR classes, grouped by depth.
    # NOTE(review): the positional flags are passed through unchanged; their
    # meaning is defined by Utilities.getXMLElements.
    noDuplicateEHRClassesAtDepths = Utilities.getXMLElements(patientXML, {}, False, True, False, True, True)

    for depth in range(len(noDuplicateEHRClassesAtDepths) - 1, 0, -1):
        for ehrClass in noDuplicateEHRClassesAtDepths[depth]:
            # Find all examples of this class from the original unique list.
            ehrClassExamples = patientXML.findall(".//" + ehrClass.tag)

            # (example, children) pairs in document order, for examples that
            # have children. list(element) is used instead of the
            # getchildren() method, which xml.etree.ElementTree no longer
            # provides and lxml marks as deprecated.
            populatedExamples = []
            for ehrClassExample in ehrClassExamples:
                exampleChildren = list(ehrClassExample)
                if exampleChildren:
                    populatedExamples.append((ehrClassExample, exampleChildren))
            if not populatedExamples:
                continue

            # Work on a separate, sorted copy so the document order above is
            # kept for the renaming pass. Stable sort, most children first.
            taggedExamples = [
                (parent, kids, [kid.tag for kid in kids])
                for parent, kids in populatedExamples
            ]
            taggedExamples.sort(key=lambda entry: len(entry[1]), reverse=True)

            # Plan which children each example is missing relative to a
            # same-named example whose child tags are a superset of its own.
            # NOTE(review): childrenToAppend is populated here but nothing in
            # this function applies it to the tree — confirm whether the
            # expansion should be applied or this pass removed.
            childrenToAppend = {}
            for _, children, childTags in taggedExamples:
                childTagSet = set(childTags)
                for otherParent, _, otherChildTags in taggedExamples:
                    # Skip the example itself and any example whose child tags
                    # are identical, in the same order.
                    if otherChildTags == childTags:
                        continue
                    otherTagSet = set(otherChildTags)
                    if not otherTagSet.issubset(childTagSet):
                        continue
                    # Exact tag membership: a substring test against the
                    # printed list would treat 'Code' as present whenever
                    # 'ClinicalCode' is.
                    missing = [child for child in children if child.tag not in otherTagSet]
                    if missing:
                        childrenToAppend.setdefault(otherParent, []).extend(missing)

            # EHR classes with the same name that have different children are
            # given different numeric suffixes so they are treated as
            # different entities. Maps the sorted child-tag list (as a string,
            # so permutations are ignored) to the tag assigned to it.
            # NOTE(review): repeated child tags are kept in the key, so two
            # examples differing only in repeats are treated as different —
            # confirm this is intended.
            childrenToNewTagName = {}
            for ehrClassExample, exampleChildren in populatedExamples:
                # Tags are read at this point, not earlier, because tags may
                # have been rewritten since the example list was built.
                childSetKey = str(sorted(child.tag for child in exampleChildren))

                if childSetKey in childrenToNewTagName:
                    # Already-seen combination: reuse the tag chosen for it,
                    # as together with its children it is the same entity.
                    ehrClassExample.tag = childrenToNewTagName[childSetKey]
                    continue

                # New combination. The first one keeps the plain tag; later
                # ones are named incrementally.
                if childrenToNewTagName:
                    ehrClassExample.tag = ehrClassExample.tag + str(len(childrenToNewTagName))
                childrenToNewTagName[childSetKey] = ehrClassExample.tag

    return patientXML