def _scanText(self, k):

    """
    try to match in buffer regardless of word boundaries
    using Elly vocabulary and pattern tables and also
    running Elly entity extractors

    arguments:
        self  -
        k     - length of first component in buffer

    returns:
        match parameters [ text span of match , match types ,
                           vocabulary match chars , suffix removed ]
    """

    # print ( '_scanText k=' , k )
    sb = self.sbu.buffer        # input buffer

    # match status
    nspan = 0                   # total span of match
    mtype = ''                  # no match type yet
    vmchs = []                  # chars of vocabulary entry matched
    suffx = ''                  # any suffix removed in match

    lm = len(sb)                # scan limit
    # print ( 'next component=' , sb[:k] , ', context=' , sb[k:lm] )

    if self.vtb != None:        # look in external dictionary first, if it exists
        if k > 1:               # is first component a single char?
            ks = k              # if not, use this for indexing
        else:
            ks = 1              # otherwise, add on any following alphanumeric
            while ks < lm:      #
                if not ellyChar.isLetterOrDigit(sb[ks]):
                    break
                ks += 1
        ss = ''.join(sb[:ks])   # where to start for indexing
        # print ( 'ss=' , ss )
        n = vocabularyTable.delimitKey(ss)  # get actual indexing
        # print ( 'n=' , n )
        rl = self.vtb.lookUp(sb, n)  # get list of the longest matches
        if len(rl) > 0:         #
            # print ( 'len(rl)=' , len(rl) )
            r0 = rl[0]          # look at first record
            nspan = r0.nspan    # should be same for all matches
            mtype = 'Vt'
            vmchs = r0.vem.chs  #
            suffx = r0.suffx    #
            # print ( 'vocabulary m=' , nspan )

    d = self.rul                # grammar rule definitions

    m = d.ptb.match(sb, self.ptr)  # try entity by pattern match next
    # print ( 'pattern m=' , m )
    if nspan < m:
        nspan = m               # on longer match, update maximum
        mtype = 'Fa'
    elif m > 0 and nspan == m:
        mtype = 'VtFa'
    # print ( 'mtype=' , mtype )

    m = self.iex.run(sb)        # try entity extractors next
    # print ( 'extractor m=' , m )
    if nspan < m:
        nspan = m               # on longer match, update maximum
        mtype = 'Ee'
    elif m > 0 and nspan == m:
        mtype += 'Ee'           # unchanged match length, add type

    # print ( 'maximum match=' , nspan )
    # print ( 'mtype=' , mtype )
    # print ( 'input=' , self.sbu.buffer[:nspan] )

    return [nspan, mtype, vmchs, suffx]
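# A minimal sketch of how a caller might consume the match parameters
# returned by the _scanText() variant above. The helper name
# _describeScan and its scanner argument are hypothetical stand-ins,
# not part of the Elly source; only the [ nspan , mtype , vmchs ,
# suffx ] result format and the 'Vt'/'Fa'/'Ee' type codes come from
# the code above.

def _describeScan(scanner, k):
    """
    summarize what kind of match _scanText() found for the next component

    arguments:
        scanner - any object with a _scanText() method as defined above
        k       - length of first component in its buffer

    returns:
        one-line summary string
    """
    nspan, mtype, vmchs, suffx = scanner._scanText(k)
    if nspan == 0:
        return 'no match'
    parts = ['matched ' + str(nspan) + ' chars']
    if 'Vt' in mtype:               # external vocabulary contributed
        parts.append('vocabulary=' + ''.join(vmchs))
    if 'Fa' in mtype:               # pattern table contributed
        parts.append('by pattern')
    if 'Ee' in mtype:               # an entity extractor contributed
        parts.append('by extractor')
    if suffx != '':                 # a suffix was split off
        parts.append('suffix=' + suffx)
    return ', '.join(parts)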
def _scanText ( self , k ):

    """
    try to match in buffer regardless of word boundaries
    using Elly vocabulary, pattern, and template tables and
    also running Elly entity extractors

    arguments:
        self  -
        k     - length of first component in buffer

    returns:
        match parameters [ text span of match , vocabulary match , suffix removed ]

    exceptions:
        ParseOverflow
    """

    # print '_scanText k=' , k
    sb = self.sbu.buffer       # input buffer
    tr = self.ptr              # parse tree for results
    # print '_scanText sb=' , sb

    # initialize match status
    nspan = 0                  # total span of match
    vmchs = [ ]                # chars of vocabulary entry matched
    suffx = ''                 # any suffix removed in match

    d = self.rul               # grammar rule definitions

    m = d.ptb.match(sb,tr)     # try token by pattern match next
    # print 'pattern m=' , m
    if nspan < m:
        nspan = m              # on longer match, update maximum

    m = d.ctb.match(sb,tr)     # try multi-word template match next
    # print 'template m=' , m
    if nspan < m:
        nspan = m              # on longer match, update maximum

    m = self.iex.run(sb)       # try entity extractors next
    # print 'extractor m=' , m
    if nspan < m:
        nspan = m              # on longer match, update maximum

    # lm = len(sb)             # scan limit
    # print 'lm=' , lm , 'm=' , m

    capd = ellyChar.isUpperCaseLetter(sb[0])
    # print 'next component=' , sb[:k] , ', context=' , sb[k:lm]

    if self.vtb != None:       # look in external dictionary, if it exists
        ls = list(sb[:k])
        # print 'ls 0=' , ls
        ellyChar.toLowerCaseASCII(ls)
        ss = u''.join(ls)      # where to start for vocabulary indexing
        # print 'ls 1=' , ls
        n = vocabularyTable.delimitKey(ss)  # get actual indexing
        # print 'delimiting n=' , n , '=' , '<' + ss[:n] + '>'
        # print vocabularyTable.listDBKeys(self.vtb.cdb)
        rl = self.vtb.lookUp(sb,n)  # get list of the maximum text matches
        # print len(rl) , 'matches'
        if len(rl) > 0:        #
            r0 = rl[0]         # look at first record
            # print 'r0=' , r0
            vmln = r0.nspan    # should be same for all matches
            vchs = r0.vem.chs  #
            vsfx = r0.suffx    #
            # print 'nspan=' , vmln , vsfx
            if ( vmln > nspan or vmln == nspan and vsfx == '' ):
                nspan = vmln   # keep vocabulary matches
                vmchs = vchs   #
                suffx = vsfx   #
                for r in rl:
                    ve = r.vem # get vocabulary entry
                    # print 've=' , ve
                    # if ve.gen != None: print 've.gen=' , ve.gen
                    if tr.addLiteralPhraseWithSemantics(
                            ve.cat,ve.syf,ve.smf,ve.bia,ve.gen,len(suffx) > 0):
                        tr.lastph.lens = nspan  # char length of leaf phrase node
                                                # needed for later selection
                        tr.lastph.krnl.cncp = ve.con
                        if capd:
                            tr.lastph.krnl.semf.set(0)
                        # print 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens
                        if suffx != '':
                            if ellyChar.isApostrophe(suffx[1]):
                                tr.lastph.krnl.usen = 0
            # print 'vocabulary m=' , vmln
        # print 'queue after table lookup:' , len(self.ptr.queue)
        # print 'sb=' , sb

    # print 'maximum match=' , nspan
    # print 'input=' , self.sbu.buffer[:nspan]

    if nspan > 0:              # any matches at all?
        tr.requeue()           # if so, keep only longest of them
    # print 'queue after scan:' , len(self.ptr.queue)

    # print 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']'
    return [ nspan , vmchs , suffx ]
def _scanText(self, k):

    """
    try to match in buffer regardless of word boundaries
    using Elly vocabulary, pattern, and template tables and
    also running Elly entity extractors

    arguments:
        self  -
        k     - length of first component in buffer

    returns:
        match parameters [ text span of match , vocabulary match , suffix removed ]

    exceptions:
        ParseOverflow
    """

    # print ( '_scanText k=' , k )
    sb = self.sbu.buffer       # input buffer
    tr = self.ptr              # parse tree for results
    # print ( '_scanText sb=' , sb )

    # initialize match status
    nspan = 0                  # total span of match
    vmchs = []                 # chars of vocabulary entry matched
    suffx = ''                 # any suffix removed in match

    d = self.rul               # grammar rule definitions

    m = d.ptb.match(sb, tr)    # try token by pattern match next
    # print ( 'pattern m=' , m )
    if nspan < m:
        nspan = m              # on longer match, update maximum

    m = d.ctb.match(sb, tr)    # try multi-word template match next
    # print ( 'template m=' , m )
    if nspan < m:
        nspan = m              # on longer match, update maximum

    m = self.iex.run(sb)       # try entity extractors next
    # print ( 'extractor m=' , m )
    if nspan < m:
        nspan = m              # on longer match, update maximum
    # print ( 'nspan=' , nspan, sb[:nspan] )

    lm = len(sb)               # scan limit
    # print ( 'lm=' , lm , 'm=' , m )

    capd = ellyChar.isUpperCaseLetter(sb[0])
    # print ( 'next component=' , sb[:k] , ', context=' , sb[k:lm] )

    if self.vtb != None:       # look in external dictionary, if it exists
        ls = list(sb[:k])
        # print ( 'vtb ls 0=' , ls )
        ellyChar.toLowerCaseASCII(ls)
        ss = ''.join(ls)       # where to start for vocabulary indexing
        # print ( 'vtb ls 1=' , ls )
        n = vocabularyTable.delimitKey(ss)  # get actual indexing
        # print ( 'delimiting n=' , n , ':' , '<' + ss[:n] + '>' )
        # print ( vocabularyTable.listDBKeys(self.vtb.cdb) )
        rl = self.vtb.lookUp(sb, n)  # get list of the maximum text matches
        # print ( 'external matches=' , len(rl) )
        # print ( 'input text=' , sb )
        if len(rl) > 0:        #
            r0 = rl[0]         # look at first record
            # print ( 'r0=' , r0 )
            vmln = r0.nspan    # should be same for all matches
            vchs = r0.vem.chs  #
            vsfx = r0.suffx    #
            # print ( 'nspan=' , vmln , vsfx )
            if (vmln > nspan or vmln == nspan and vsfx == ''):
                nspan = vmln   # keep vocabulary matches
                vmchs = vchs   #
                suffx = vsfx   #
                for r in rl:
                    ve = r.vem # get vocabulary entry
                    # print ( 've=' , ve )
                    # if ve.gen != None: print ( 've.gen=' , ve.gen )
                    if tr.addLiteralPhraseWithSemantics(
                            ve.cat, ve.syf, ve.smf, ve.bia, ve.gen, len(suffx) > 0):
                        tr.lastph.lens = nspan  # char length of leaf phrase node
                                                # needed for later selection
                        tr.lastph.krnl.cncp = ve.con
                        if capd:
                            tr.lastph.krnl.semf.set(0)
                        # print ( 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens )
                        if suffx != '':
                            if ellyChar.isApostrophe(suffx[1]):
                                tr.lastph.krnl.usen = 0
            # print ( 'vocabulary m=' , vmln )
        # print ( 'queue after table lookup:' , len(self.ptr.queue) )
        # print ( 'vtb sb=' , sb )

    # print ( 'maximum match=' , nspan )
    # print ( 'next input=' , sb[:nspan] )

    if nspan > 0:              # any matches at all?
        tr.requeue()           # if so, keep only longest of them
    # print ( 'queue after scan:' , len(self.ptr.queue) )

    # print ( 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']' )
    return [nspan, vmchs, suffx]
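# A minimal standalone restatement of the tie-breaking test used in the
# two parse-tree variants of _scanText() above, assuming only what the
# code shows: a vocabulary match of span vmln with removed suffix vsfx
# supersedes the best pattern/template/extractor match of span nspan
# when it is strictly longer, or equally long but suffix-free. The
# function name _prefersVocabulary is hypothetical, for illustration.

def _prefersVocabulary(vmln, nspan, vsfx):
    """
    decide whether a vocabulary match should supersede the best match so far

    arguments:
        vmln  - char span of vocabulary match
        nspan - char span of best match so far
        vsfx  - suffix removed in vocabulary match, '' if none

    returns:
        True if the vocabulary match wins, False otherwise
    """
    return vmln > nspan or (vmln == nspan and vsfx == '')

# for example, an equal-length vocabulary match wins only when no
# suffix had to be removed to get it
assert _prefersVocabulary(5, 5, '')   == True
assert _prefersVocabulary(5, 5, '-s') == False
assert _prefersVocabulary(4, 5, '')   == False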
def _scanText ( self , k ):

    """
    try to match in buffer regardless of word boundaries
    using Elly vocabulary and pattern tables and also
    running Elly entity extractors

    arguments:
        self  -
        k     - length of first component in buffer

    returns:
        match parameters [ text span of match , match types ,
                           vocabulary match chars , suffix removed ]
    """

    # print '_scanText k=' , k
    sb = self.sbu.buffer       # input buffer

    # match status
    nspan = 0                  # total span of match
    mtype = ''                 # no match type yet
    vmchs = [ ]                # chars of vocabulary entry matched
    suffx = ''                 # any suffix removed in match

    lm = len(sb)               # scan limit
    # print 'next component=' , sb[:k] , ', context=' , sb[k:lm]

    if self.vtb != None:       # look in external dictionary first, if it exists
        if k > 1:              # is first component a single char?
            ks = k             # if not, use this for indexing
        else:
            ks = 1             # otherwise, add on any following alphanumeric
            while ks < lm:     #
                if not ellyChar.isLetterOrDigit(sb[ks]):
                    break
                ks += 1
        ss = u''.join(sb[:ks]) # where to start for indexing
        # print 'ss=' , ss
        n = vocabularyTable.delimitKey(ss)  # get actual indexing
        # print 'n=' , n
        rl = self.vtb.lookUp(sb,n)  # get list of the longest matches
        if len(rl) > 0:        #
            # print 'len(rl)=' , len(rl)
            r0 = rl[0]         # look at first record
            nspan = r0.nspan   # should be same for all matches
            mtype = 'Vt'
            vmchs = r0.vem.chs #
            suffx = r0.suffx   #
            # print 'vocabulary m=' , nspan

    d = self.rul               # grammar rule definitions

    m = d.ptb.match(sb,self.ptr)  # try entity by pattern match next
    # print 'pattern m=' , m
    if nspan < m:
        nspan = m              # on longer match, update maximum
        mtype = 'Fa'
    elif m > 0 and nspan == m:
        mtype = 'VtFa'
    # print 'mtype=' , mtype

    m = self.iex.run(sb)       # try entity extractors next
    # print 'extractor m=' , m
    if nspan < m:
        nspan = m              # on longer match, update maximum
        mtype = 'Ee'
    elif m > 0 and nspan == m:
        mtype += 'Ee'          # unchanged match length, add type

    # print 'maximum match=' , nspan
    # print 'mtype=' , mtype
    # print 'input=' , self.sbu.buffer[:nspan]

    return [ nspan , mtype , vmchs , suffx ]
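# A minimal sketch of the indexing-prefix logic shared by the two
# _scanText() variants that consult the external dictionary first:
# a multi-char first component indexes by itself, while a single-char
# component is extended with any immediately following alphanumerics
# before being passed to vocabularyTable.delimitKey(). This standalone
# version substitutes str.isalnum() for ellyChar.isLetterOrDigit(),
# which is only an approximation of that predicate's character classes,
# and the function name _indexingPrefix is hypothetical.

def _indexingPrefix(sb, k):
    """
    get chars of buffer to submit for vocabulary indexing

    arguments:
        sb - input buffer as a list of chars
        k  - length of first component in buffer

    returns:
        prefix string for vocabulary indexing
    """
    lm = len(sb)               # scan limit
    if k > 1:                  # multi-char component indexes by itself
        ks = k
    else:                      # single char: absorb following alphanumerics
        ks = 1
        while ks < lm and sb[ks].isalnum():
            ks += 1
    return ''.join(sb[:ks])

# for example, a single-char component 'a' followed by 'bc!' extends
# to 'abc', while a two-char component stops at its own boundary
assert _indexingPrefix(list('abc!'), 1) == 'abc'
assert _indexingPrefix(list('abc!'), 2) == 'ab'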