Пример #1
0
    def _scanText(self, k):
        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary and pattern tables and also
        running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , match types , vocabulary match chars , suffix removed ]
        """

        sb = self.sbu.buffer  # input buffer

        # match status
        nspan = 0  #   total span of match
        mtype = ''  #   no match type yet
        vmchs = []  #   chars of vocabulary entry matched
        suffx = ''  #   any suffix removed in match

        lm = len(sb)  # scan limit

        if self.vtb is not None:  # look in external dictionary first, if it exists
            if k > 1:  # is first component a single char?
                ks = k  # if not, use this for indexing
            else:
                ks = 1  # otherwise, add on any following alphanumeric
                while ks < lm:
                    if not ellyChar.isLetterOrDigit(sb[ks]):
                        break
                    ks += 1
            ss = ''.join(sb[:ks])  # where to start for indexing
            n = vocabularyTable.delimitKey(ss)  # get actual indexing
            rl = self.vtb.lookUp(sb, n)  # get list of the longest matches
            if len(rl) > 0:  # any vocabulary matches?
                r0 = rl[0]  # look at first record
                nspan = r0.nspan  # should be same for all matches
                mtype = 'Vt'
                vmchs = r0.vem.chs
                suffx = r0.suffx

        d = self.rul  # grammar rule definitions

        m = d.ptb.match(sb, self.ptr)  # try entity by pattern match next
        if nspan < m:
            nspan = m  # on longer match, update maximum
            mtype = 'Fa'
        elif m > 0 and nspan == m:
            mtype = 'VtFa'  # pattern match ties an existing vocabulary match

        m = self.iex.run(sb)  # try entity extractors next
        if nspan < m:
            nspan = m  # on longer match, update maximum
            mtype = 'Ee'
        elif m > 0 and nspan == m:
            mtype += 'Ee'  # unchanged match length, add type

        return [nspan, mtype, vmchs, suffx]
Пример #2
0
    def _scanText ( self , k ):

        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary, pattern, and template tables and
        also running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , vocabulary match , suffix removed ]

        exceptions:
            ParseOverflow
        """

        sb = self.sbu.buffer           # input buffer
        tr = self.ptr                  # parse tree for results

                                       # initialize match status
        nspan = 0                      #   total span of match
        vmchs = [ ]                    #   chars of vocabulary entry matched
        suffx = ''                     #   any suffix removed in match

        d = self.rul                   # grammar rule definitions

        m = d.ptb.match(sb,tr)         # try token by pattern match next
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

        m = d.ctb.match(sb,tr)         # try multi-word template  match next
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

        m = self.iex.run(sb)           # try entity extractors next
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

        capd = ellyChar.isUpperCaseLetter(sb[0])  # remember capitalization of input

        if self.vtb is not None:       # look in external dictionary, if it exists
            ls = list(sb[:k])
            ellyChar.toLowerCaseASCII(ls)
            ss = u''.join(ls)                   # where to start for vocabulary indexing
            n = vocabularyTable.delimitKey(ss)  # get actual indexing

            rl = self.vtb.lookUp(sb,n) # get list of the maximum text matches
            if len(rl) > 0:            # any vocabulary matches?
                r0 = rl[0]             # look at first record
                vmln = r0.nspan        # should be same for all matches
                vchs = r0.vem.chs      #
                vsfx = r0.suffx        #

                                       # keep vocabulary match only when longer than
                                       # any other, or as long with no suffix removed
                if ( vmln > nspan or
                     vmln == nspan and vsfx == '' ):

                    nspan = vmln       # keep vocabulary matches
                    vmchs = vchs       #
                    suffx = vsfx       #

                    for r in rl:
                        ve = r.vem     # get vocabulary entry
                        if tr.addLiteralPhraseWithSemantics(
                                ve.cat,ve.syf,ve.smf,ve.bia,ve.gen,len(suffx) > 0):
                            tr.lastph.lens = nspan  # char length of leaf phrase node
                                                    # needed for later selection
                            tr.lastph.krnl.cncp = ve.con
                            if capd:   # capitalized input sets semantic feature 0
                                tr.lastph.krnl.semf.set(0)
                            if suffx != '':
                                       # NOTE(review): indexes suffx[1] -- assumes
                                       # suffix starts with a hyphen; confirm
                                if ellyChar.isApostrophe(suffx[1]):
                                    tr.lastph.krnl.usen = 0

        if nspan > 0:                  # any matches at all?
            tr.requeue()               # if so, keep only longest of them

        return [ nspan , vmchs , suffx ]
Пример #3
0
    def _scanText(self, k):
        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary, pattern, and template tables and
        also running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , vocabulary match , suffix removed ]

        exceptions:
            ParseOverflow
        """

        sb = self.sbu.buffer  # input buffer
        tr = self.ptr  # parse tree for results

        # initialize match status
        nspan = 0  #   total span of match
        vmchs = []  #   chars of vocabulary entry matched
        suffx = ''  #   any suffix removed in match

        d = self.rul  # grammar rule definitions

        m = d.ptb.match(sb, tr)  # try token by pattern match next
        if nspan < m:
            nspan = m  # on longer match, update maximum

        m = d.ctb.match(sb, tr)  # try multi-word template  match next
        if nspan < m:
            nspan = m  # on longer match, update maximum

        m = self.iex.run(sb)  # try entity extractors next
        if nspan < m:
            nspan = m  # on longer match, update maximum

        capd = ellyChar.isUpperCaseLetter(sb[0])  # remember capitalization of input

        if self.vtb is not None:  # look in external dictionary, if it exists
            ls = list(sb[:k])
            ellyChar.toLowerCaseASCII(ls)
            ss = ''.join(ls)  # where to start for vocabulary indexing
            n = vocabularyTable.delimitKey(ss)  # get actual indexing

            rl = self.vtb.lookUp(sb, n)  # get list of the maximum text matches

            if len(rl) > 0:  # any vocabulary matches?
                r0 = rl[0]  # look at first record
                vmln = r0.nspan  # should be same for all matches
                vchs = r0.vem.chs
                vsfx = r0.suffx

                # keep vocabulary match only when longer than any other,
                # or as long with no suffix removed
                if (vmln > nspan or vmln == nspan and vsfx == ''):

                    nspan = vmln  # keep vocabulary matches
                    vmchs = vchs
                    suffx = vsfx

                    for r in rl:
                        ve = r.vem  # get vocabulary entry
                        if tr.addLiteralPhraseWithSemantics(
                                ve.cat, ve.syf, ve.smf, ve.bia, ve.gen,
                                len(suffx) > 0):
                            tr.lastph.lens = nspan  # char length of leaf phrase node
                            # needed for later selection
                            tr.lastph.krnl.cncp = ve.con
                            if capd:  # capitalized input sets semantic feature 0
                                tr.lastph.krnl.semf.set(0)
                            if suffx != '':
                                # NOTE(review): indexes suffx[1] -- assumes
                                # suffix starts with a hyphen; confirm
                                if ellyChar.isApostrophe(suffx[1]):
                                    tr.lastph.krnl.usen = 0

        if nspan > 0:  # any matches at all?
            tr.requeue()  # if so, keep only longest of them

        return [nspan, vmchs, suffx]
Пример #4
0
    def _scanText ( self , k ):

        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary and pattern tables and also
        running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , match types , vocabulary match chars , suffix removed ]
        """

        sb = self.sbu.buffer           # input buffer

                                       # match status
        nspan = 0                      #   total span of match
        mtype = ''                     #   no match type yet
        vmchs = [ ]                    #   chars of vocabulary entry matched
        suffx = ''                     #   any suffix removed in match

        lm = len(sb)                   # scan limit

        if self.vtb is not None:       # look in external dictionary first, if it exists
            if  k > 1:                 # is first component a single char?
                ks = k                 # if not, use this for indexing
            else:
                ks = 1                 # otherwise, add on any following alphanumeric
                while ks < lm:
                    if not ellyChar.isLetterOrDigit(sb[ks]):
                        break
                    ks += 1
            ss = u''.join(sb[:ks])     # where to start for indexing
            n = vocabularyTable.delimitKey(ss)  # get actual indexing
            rl = self.vtb.lookUp(sb,n) # get list of the longest matches
            if len(rl) > 0:            # any vocabulary matches?
                r0 = rl[0]             # look at first record
                nspan = r0.nspan       # should be same for all matches
                mtype = 'Vt'
                vmchs = r0.vem.chs     #
                suffx = r0.suffx       #

        d = self.rul                   # grammar rule definitions

        m = d.ptb.match(sb,self.ptr)   # try entity by pattern match next
        if  nspan < m:
            nspan = m                  # on longer match, update maximum
            mtype = 'Fa'
        elif m > 0 and nspan == m:
            mtype = 'VtFa'             # pattern match ties an existing vocabulary match

        m = self.iex.run(sb)           # try entity extractors next
        if  nspan < m:
            nspan = m                  # on longer match, update maximum
            mtype = 'Ee'
        elif m > 0 and nspan == m:
            mtype += 'Ee'              # unchanged match length, add type

        return [ nspan , mtype , vmchs , suffx ]