示例#1
0
    def parseXML(self, tNode, index, sentence):
        """Populate this token from an xml token element.

        tNode    = xml token element; read through getAttribute() and
                   getElementsByTagName()
        index    = the index of the element in the sentence (0 indexed)
        sentence = the Sentence object containing this token

        Sets sentence, index, text, lemma, pos, dependents, governors,
        annotations and labels on this token, and adds to the existing
        semanticTags and umlsConcepts collections.
        """
        self.sentence = sentence
        self.index = index

        self.text = xmlutil.normalizeText(tNode.getAttribute('text'))
        # Lowercase the first word of the sentence when it is capitalized and
        # is not part of an acronym (single letter, or second character is a
        # lowercase ASCII letter). The length check keeps a token with empty
        # text from raising IndexError.
        if self.index == 0 and len(self.text) > 0 \
                and 'A' <= self.text[0] <= 'Z' \
                and (len(self.text) == 1 or 'a' <= self.text[1] <= 'z'):
            self.text = self.text.lower()

        # Fall back to the surface text when no lemma is supplied.
        self.lemma = xmlutil.normalizeText(tNode.getAttribute('lemma'))
        if len(self.lemma) == 0:
            self.lemma = self.text

        self.pos = tNode.getAttribute('pos')
        if self.pos is None:
            self.pos = ''

        dNodes = tNode.getElementsByTagName('dep')
        self.dependents = parsetree.DependencyList(dNodes)

        gNodes = tNode.getElementsByTagName('gov')
        self.governors = parsetree.DependencyList(gNodes)
        # Drop governors that point back at this token's own index.
        # Iterate over a snapshot: removing from a list while iterating it
        # directly skips the entry that follows each removed one.
        for gov in list(self.governors):
            if gov.index == self.index:
                self.governors.remove(gov)

        aNodes = tNode.getElementsByTagName('annotation')
        self.annotations = AnnotationList(aNodes)

        lNodes = tNode.getElementsByTagName('label')
        self.labels = AnnotationList(lNodes)

        # NOTE(review): semanticTags and umlsConcepts are not created here;
        # presumably they are initialised in the constructor - confirm.
        for node in tNode.getElementsByTagName('semantic'):
            self.semanticTags.add(xmlutil.getText(node))

        for node in tNode.getElementsByTagName('umls'):
            self.umlsConcepts.append(umlsconcept.UMLSConcept(node))
示例#2
0
 def parse(self, parseString):
   """ take a parse tree string in penn treebank style,
       parse it and add the children of the current node.
       input string is assumed to be the following
              ' (TYPE X) ...'
       where X is either the token text (this node is a token node) or a
       list of subtrees for this node (self).

       returns a two element list [tokenNodeList, parseString]:
         tokenNodeList = references to the token nodes that are descendants
                         of this node, in the order they were parsed
         parseString   = the remaining parse tree string that still needs
                         to be parsed

       empty, whitespace-only or truncated input (e.g. '(NP') yields no
       token nodes instead of raising IndexError.
   """
   tokenNodeList = []

   parseString = parseString.lstrip()            # remove leading whitespace
   if not parseString:
     # nothing to parse; this node is left untouched
     return [tokenNodeList, parseString]

   parseString = parseString.lstrip('(')         # remove leading parenthesis
   # remove the phrase type from the front of the string
   self.type, _, parseString = parseString.partition(' ')

   if not parseString:
     # truncated input: nothing follows the phrase type
     return [tokenNodeList, parseString]

   if parseString[0] != '(':
     # current node is a token node; its text runs up to the closing paren
     self.text, _, parseString = parseString.partition(')')
     self.text = xmlutil.normalizeText(self.text)
     tokenNodeList = [self]
   else:
     # current node is an internal node with children
     # process children of this node until we hit end of phrase
     while parseString and parseString[0] != ')':
       # a phrase is next, parse it
       newNode = ParseTreeNode(self)
       childTokens, parseString = newNode.parse(parseString)
       tokenNodeList.extend(childTokens)
       self.childNodes.append(newNode)
       # skip whitespace so that blanks before the closing paren (or at the
       # end of the input) are not mistaken for the start of another phrase
       parseString = parseString.lstrip()
     # remove right paren that marks end of current phrase
     parseString = parseString[1:]

   return [tokenNodeList, parseString]