def parseXML(self, tNode, index, sentence):
    """
    Create a new token from an xml token element.

    tNode    = xml token element (read through getAttribute and
               getElementsByTagName)
    index    = the index of the element in the sentence (0 indexed)
    sentence = the Sentence object containing this token

    Sets sentence, index, text, lemma, pos, dependents, governors,
    annotations and labels on this token, and adds to semanticTags and
    umlsConcepts.
    NOTE(review): semanticTags and umlsConcepts are assumed to be
    initialised before this method runs (e.g. in the constructor, which is
    not visible here) -- confirm.
    """
    self.sentence = sentence
    self.index = index
    self.text = xmlutil.normalizeText(tNode.getAttribute('text'))

    # The first word in the sentence is lowercased when it is capitalized
    # and is not part of an acronym (single letter, or followed by a
    # lowercase ASCII letter). The truthiness check guards against an empty
    # text attribute, which would otherwise raise IndexError on text[0].
    text = self.text
    if (self.index == 0 and text
            and 'A' <= text[0] <= 'Z'
            and (len(text) == 1 or 'a' <= text[1] <= 'z')):
        self.text = text.lower()

    # Fall back to the surface text when no lemma is supplied.
    self.lemma = xmlutil.normalizeText(tNode.getAttribute('lemma'))
    if not self.lemma:
        self.lemma = self.text

    self.pos = tNode.getAttribute('pos')
    if self.pos is None:
        self.pos = ''

    dNodes = tNode.getElementsByTagName('dep')
    self.dependents = parsetree.DependencyList(dNodes)

    gNodes = tNode.getElementsByTagName('gov')
    self.governors = parsetree.DependencyList(gNodes)
    # Drop governors whose index equals this token's own index (a token
    # cannot govern itself). Iterate over a snapshot: removing from the
    # collection that is being iterated skips the element that follows each
    # removal, so consecutive self-references would be left behind.
    for gov in list(self.governors):
        if gov.index == self.index:
            self.governors.remove(gov)

    aNodes = tNode.getElementsByTagName('annotation')
    self.annotations = AnnotationList(aNodes)

    lNodes = tNode.getElementsByTagName('label')
    self.labels = AnnotationList(lNodes)

    # Each <semantic> element contributes its text content as a tag.
    for node in tNode.getElementsByTagName('semantic'):
        self.semanticTags.add(xmlutil.getText(node))

    # Each <umls> element becomes a UMLSConcept built from that element.
    for node in tNode.getElementsByTagName('umls'):
        self.umlsConcepts.append(umlsconcept.UMLSConcept(node))
def parse(self, parseString):
    """
    Take a parse tree string in penn treebank style, parse it, and add
    the children of the current node.

    The input string is assumed to be of the form
        ' (TYPE X) ...'
    where X is either the token text or a list of subtrees for this node
    (self).

    Returns a two-element list [tokenNodeList, remaining]:
      tokenNodeList = references to the token nodes that are descendants of
                      this node (this node itself when it is a token node)
      remaining     = the part of the parse tree string that still needs to
                      be parsed

    Empty or whitespace-only input returns [[], ''] and leaves this node
    untouched. A phrase with nothing after its type sets self.type only and
    returns no tokens.
    """
    tokenNodeList = []

    parseString = parseString.lstrip()  # remove leading whitespace
    if not parseString:
        # nothing to parse (empty or whitespace-only input)
        return [tokenNodeList, parseString]

    parseString = parseString.lstrip('(')  # remove leading parenthesis
    # remove the phrase type from the front of the string
    self.type, _, parseString = parseString.partition(' ')
    if not parseString:
        # No content follows the phrase type; indexing parseString[0]
        # below would raise IndexError, so stop here.
        return [tokenNodeList, parseString]

    if parseString[0] != '(':
        # current node is a token node; partition also consumes the
        # closing parenthesis of this phrase
        self.text, _, parseString = parseString.partition(')')
        self.text = xmlutil.normalizeText(self.text)
        tokenNodeList = [self]
    else:
        # current node is an internal node with children:
        # process children of this node until we hit end of phrase
        while parseString and parseString[0] != ')':
            # a phrase is next, parse it
            childNode = ParseTreeNode(self)
            childTokens, parseString = childNode.parse(parseString)
            tokenNodeList.extend(childTokens)
            self.childNodes.append(childNode)
            # Skip whitespace so that a closing parenthesis preceded by
            # spaces is recognised as the end of the phrase instead of
            # being parsed as another child.
            parseString = parseString.lstrip()
        # remove right paren that marks end of current phrase
        parseString = parseString[1:]

    return [tokenNodeList, parseString]