def get ( self , ts , n=N ):

    """
    get normalized substring in lower case for subsequent comparisons

    arguments:
        self -
        ts   - list of chars to get substring from
        n    - limit on count of chars to get

    returns:
        count of chars scanned for substring
    """

    sl = [ ]                    # char sublist to be matched
#   print 'ts=' , ts
    lts = len(ts)
    if lts == 0: return 0       # no chars to scan
    lm = lts if lts < n else n
#   print 'lm=' , lm
    i = 0
    c = ''
    while i < lm:               # scan input text up to char limit
        lc = c
        c = ts[i]               # get next char
        if c == PERIOD:         # special treatment of PERIOD
            if lc == PERIOD:
                break
        elif c == COMMA:        # special treatment of COMMA
#           print 'comma'
            if ( not ellyChar.isDigit(lc) or
                 i + 1 == lm or
                 not ellyChar.isDigit(ts[i + 1]) ):
                break
        else:
            if not ellyChar.isLetterOrDigit(c):   # stop if not letter
                if not c in ALSO: break           # or "'" or "/" or "-"
        sl.append(c.lower())    # otherwise append to sublist
        i += 1
#   print 'i=' , i , '<' + c + '>'
    if i < lm and ellyChar.isLetterOrDigit(ts[i]):   # proper termination?
        return 0                # if not, reject substring
    self.string = u''.join(sl)
    return i                    # scan count
def __init__ ( self , syms , fets=None , semantic=False ):

    """
    initialization

    arguments:
        self     -
        syms     - symbol table
        fets     - string representation of feature set
        semantic - flag for semantic features

    exceptions:
        FormatFailure on error
    """

    if syms == None or fets == None:   # special case generating zero feature set
        self.positive = ellyBits.EllyBits(symbolTable.FMAX)
        self.negative = ellyBits.EllyBits(symbolTable.FMAX)
        self.id = ''
        return

    segm = fets.lower()
#   print "features=",segm,"semantic=",semantic
    if segm == None or len(segm) < 3 or segm[0] != '[' or segm[-1] != ']':
        raise ellyException.FormatFailure
    elif segm[1] == ' ' or ellyChar.isLetterOrDigit(segm[1]) or segm[1] == '*':
        raise ellyException.FormatFailure
    else:
        self.id = segm[1]
#   print "id=",self.id

    fs = syms.getFeatureSet(segm[1:-1] , semantic)
    if fs == None:
        raise ellyException.FormatFailure
    self.positive , self.negative = fs
def bound ( segm ):

    """
    get maximum limit on string for pattern matching
    (override this method if necessary)

    arguments:
        segm - text segment to match against

    returns:
        char count
    """

    lm = len(segm)     # limit can be up to total length of text for matching
    ll = 0
    while ll < lm:     # look for first space in text segment
        if segm[ll] == ' ': break
        ll += 1
#   print 'll=' , ll , ', lm=' , lm
    ll -= 1
    while ll > 0:      # exclude trailing non-alphanumeric from matching
                       # except for '.' and '*' and bracketing
        c = segm[ll]
        if c in Trmls or c == '*' or ellyChar.isLetterOrDigit(c):
            break
        ll -= 1
    return ll + 1
def bound ( segm ):

    """
    get maximum limit on string for template matching
    (override this method if necessary)

    arguments:
        segm - text segment to match against

    returns:
        char count
    """

#   print 'segm=' , segm
    lm = len(segm)     # limit can be up to total length of text for matching
    ll = 0
    while ll < lm:     # look for first break in text segment
        c = segm[ll]
        if c in [ ellyChar.ELLP , ellyChar.NDSH , ellyChar.MDSH ]:
            break
        if c == ',' and ll < lm - 1 and segm[ll+1] == ' ':
            break
        ll += 1
#   print 'll=' , ll , ', lm=' , lm
    ll -= 1
    while ll > 0:      # exclude trailing non-alphanumeric from matching
                       # except for '.' and '*' and bracketing
        c = segm[ll]
        if c in ellyWildcard.Trmls or c == '*' or ellyChar.isLetterOrDigit(c):
            break
        ll -= 1
#   print 'limit=' , ll + 1
    return ll + 1
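
# A hedged usage sketch, added for illustration and not part of the original
# source: it assumes ellyChar.NDSH is the Unicode en dash and that the
# surrounding module context (ellyChar, ellyWildcard) is importable.
print(bound(list('Alice \u2013 Bob')))   # expect 5 - limit covers 'Alice', stops at en dash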
def findSeparator(self, skip=0):

    """
    scan for one of a list of separator chars in buffer

    arguments:
        self -
        skip - how many chars to skip in buffer to start scan

    returns:
        offset in buffer if char found, -1 otherwise
    """

    n = len(self.buffer)
    if skip >= n:                  # is skip too long?
        return -1                  # if so, fail
    if skip == 0 and self.buffer[0] == APO:
        return 1                   # special case!
#   print ( 'skip=' , skip, 'n=' , n )
    for k in range(skip, n):
        ck = self.buffer[k]
        if ck in separators:       # is buffer char a separator?
            self.index = k         # if so, note buffer position
            return k
        if ck == ellyChar.COM:
#           print ( 'comma k=' , k )
            if k + 1 < n:
                ck1 = self.buffer[k + 1]
                if not ellyChar.isLetterOrDigit(ck1):
                    if ck1 in ellyChar.Grk:
                        return k
    return -1                      # fail
def acronym ( buffr ):

    """
    recognize parenthesized introduction of acronym in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Lmax: lb = Lmax
    if lb < Lmin or buffr[0] != '(':
        return 0
    nu = 0                   # uppercase count
    ib = 1
    while ib < lb:
        bc = buffr[ib]
        ib += 1
        if bc == ')':
            break
        if not ellyChar.isLetter(bc):
            return 0
        if ellyChar.isUpperCaseLetter(bc):
            nu += 1
    else:
        return 0             # must have enclosing ')'
    if ib < Lmin or ib - 2*nu > 0:
        return 0
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]):
        return 0
    return ib
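
# A hedged usage sketch, added for illustration and not from the original
# source: it assumes Lmin <= 6 <= Lmax in the enclosing module and plain
# ASCII ellyChar semantics.
print(acronym(list('(NASA) launch')))   # expect 6 - matched through ')'
print(acronym(list('(nasa) launch')))   # expect 0 - lowercase letters rejected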
def bound(segm):

    """
    get maximum limit on string for pattern matching
    (override this method if necessary)

    arguments:
        segm - text segment to match against

    returns:
        char count
    """

#   print 'segm=' , segm
    lm = len(segm)     # limit can be up to total length of text for matching
    ll = 0
    while ll < lm:     # look for first space in text segment
        if segm[ll] == ' ': break
        ll += 1
#   print 'll=' , ll , ', lm=' , lm
    ll -= 1
    while ll > 0:      # exclude trailing non-alphanumeric from matching
                       # except for '.', Unicode prime, or '*' and bracketing
        c = segm[ll]
#       print 'bound c=' , c , '{:x}'.format(ord(c))
        if (c in Trmls or
            c in [u'*', u'\u2032', u'+'] or
            ellyChar.isLetterOrDigit(c)):
            break
        ll -= 1
#   print 'limit=' , ll + 1
    return ll + 1
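
# A hedged usage sketch, added for illustration only: it assumes '.' is in
# Trmls (per the comment above) and ASCII ellyChar semantics.
print(bound(list('apple pie')))      # expect 5 - match limit stops at first space
print(bound(list('U.S. economy')))   # expect 4 - trailing '.' retained as exception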
def getRules(self, a):

    """
    get appropriate macros for text with specified starting char

    arguments:
        self -
        a    - first letter of current buffer contents (NOT space!)

    returns:
        a list of unpacked macro rules to try out
    """

#   print ( 'getRules(a=' , a , ')' )
    if a == '':
        return []
    if ellyChar.isLetterOrDigit(a):
        k = ellyChar.toIndex(a)
        ls = self.index[k]
#       print ( 'index a=' , a , 'k=' , k )
        ws = self.letWx if ellyChar.isLetter(a) else self.digWx
        uniqueAdd(ls, ws)
        uniqueAdd(ls, self.anyWx)
    elif ellyChar.isApostrophe(a):
        ls = self.apoWx
    else:
        ls = self.index[0]
        uniqueAdd(ls, self.anyWx)
#   print ( len(ls) , ' rules to check' )
    return [r.unpack() for r in ls]
def delimitKey ( t ):

    """
    get part of term for vocabulary table indexing that ends
    in alphanumeric or is a single nonalphanumeric
    with special stripping of 'S at the end

    arguments:
        t - text string to scan

    returns:
        count of chars to put into search key
    """

    ln = len(t)            # number of chars in input text
    if ln == 0:
        return 0
    n = t.find(' ')        # find rough range of key for SQLite in text
    if n < 0: n = ln       # if undivided by spaces, take everything
    n -= 1                 # index of last char in range
    while n > 0:           # scan input text backwards
        c = t[n]           # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
#           print 'n=' , n , 'c=' , c
            if n > 1:      # check for 'S as special case!
                if ( c in [ 's' , 'S' ] and
                     ellyChar.isApostrophe(t[n-1]) ):
#                   print 'drop \'S from SQLite key'
                    n -= 1
                else:
                    break
            else:
                break
        n -= 1             # continue scanning backwards
    return n + 1           # to get key length ending in alphanumeric
def load ( self , stb , defn ):

    """
    get templates and user-defined word classes from input

    arguments:
        self -
        stb  - Elly symbol table
        defn - Elly definition reader for classes and templates

    exceptions:
        TableFailure on error
    """

    clss = [ ]                        # element classes

    while True:
        l = defn.readline()           # next definition line
        if len(l) == 0: break         # EOF check
        s = l.split(':=')             # look for user-defined class
        if len(s) == 2:
            nme = s[0].strip()
            if len(nme) != 2 or not ellyChar.isLetterOrDigit(nme[1]):
                self._err('improper class ID')
                continue
            if nme in preClass:
                self._err('cannot change predefined classes')
                continue
            ls = s[1].split(',')              # list of words for class
            ls = list(w.strip() for w in ls)  # just in case of extra spaces
#           print ( 'for class, ls=' , ls )
            if not nme in self.ucls:          # define a new template category?
                self.ucls[nme] = [ ]
            self.ucls[nme].extend(ls)         # add list of words to class
#           print ( 'class=' , self.ucls[nme] )
            self.cfns[nme] = None
        else:
            tm = Template(stb,l)              # create a new template
            if tm.check() > 0:                # any problem here is fatal
#               print ( 'template error' )
                self._errcount += 1
                continue
            for elm in tm.lstg:               # get unique template categories
                if elm[0] == Catg:
                    if not elm in clss:
                        clss.append(elm)
            self.tmpl.append(tm)              # add template to table

    missg = [ ]                       # to collect missing definitions
    for cx in clss:                   # check user-defined categories
        if not cx in self.cfns:
            if cx[1] != '*':
                missg.append(cx)      # note if unsupported by class list
    lm = len(missg)
    if lm > 0:                        # this is a FATAL error
        print ( lm , 'undefined template categories=' , missg , file=sys.stderr )
        self._errcount += lm
    if self._errcount > 0:
        print ( 'table error count=' , self._errcount , file=sys.stderr )
        raise ellyException.TableFailure('templates')
def __init__ ( self , syms , spec ):

    """
    initialization from input string and symbol table

    arguments:
        self -
        syms - current symbol table
        spec - input string

    exceptions:
        FormatFailure on error
    """

    self.catg = -1               # values to set on an error
    self.synf = None             #

#   print >> sys.stderr , 'specification=' , spec
    if spec == None:
        print >> sys.stderr , '** null syntax specification'
        raise ellyException.FormatFailure

    s = spec.lower()             # must be lower case for all lookups
    n = 0
    for c in s:
        if not ellyChar.isLetterOrDigit(c) and c != '.':
            break
        n += 1
    if n == 0:
        print >> sys.stderr , '** no syntactic category'
        raise ellyException.FormatFailure

    typs = s[:n]                 # save category name
#   print >> sys.stderr , 'catg=' , self.catg
    catg = syms.getSyntaxTypeIndexNumber(typs)

    s = s[n:].strip()            # feature part of syntax
    if len(s) == 0:              # check if there are any features
        synf = featureSpecification.FeatureSpecification(syms,None)
        if typs == '...':
            synf.id = '...'
    elif typs == '...':          # ... category may have no features!
        raise ellyException.FormatFailure
    else:                        # decode features
#       print >> sys.stderr , 'syms=' , syms , 's=' , s
        if len(s) > 3 and typs in catid and catid[typs] != s[1]:
            print >> sys.stderr , '** type' , typs.upper() , 'has two feature IDs:' , catid[typs] , s[1]
            raise ellyException.FormatFailure
        catid[typs] = s[1]
        synf = featureSpecification.FeatureSpecification(syms,s)
        # FormatFailure exception may be raised above, but will not be caught here

#   print >> sys.stderr , 'success'
    self.catg = catg
    self.synf = synf
def get(self, ts, n=N):

    """
    get normalized substring in lower case for subsequent comparisons

    arguments:
        self -
        ts   - list of chars to get substring from
        n    - limit on count of chars to get

    returns:
        count of chars scanned for substring
    """

    sl = []                     # char sublist to be matched
#   print ( 'ts=' , ts )
    lts = len(ts)
    if lts == 0:
        return 0                # no chars to scan
    lm = lts if lts < n else n
#   print ( 'lm=' , lm )
    i = 0
    c = ''
    while i < lm:               # scan input text up to char limit
        lc = c
        c = ts[i]               # get next char
        if c == COMMA:          # special treatment of COMMA
#           print ( 'comma' )
            if (not ellyChar.isDigit(lc) or
                i + 3 >= lm or
                not ellyChar.isDigit(ts[i + 1]) or
                not ellyChar.isDigit(ts[i + 2]) or
                not ellyChar.isDigit(ts[i + 3])):
                break
        else:
            if not ellyChar.isLetterOrDigit(c):   # stop if not letter
                if not c in ALSO: break           # or "'" or "/" or "-"
        sl.append(c.lower())    # otherwise append to sublist
        i += 1
#   print ( 'i=' , i , '<' + c + '>' )
    if i < lm and ellyChar.isLetterOrDigit(ts[i]):   # proper termination?
        return 0                # if not, reject substring
    self.string = ''.join(sl)
    return i                    # scan count
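
# A hedged usage sketch, added for illustration: get() is a method that
# saves its result in self.string, so a bare stand-in object is used for
# self here; assumes the module limit N is at least 7 and that ALSO
# contains only "'", "/", and "-".
class _Stub(object): pass
_s = _Stub()
print(get(_s, list('12,500 units')))   # expect 6, with _s.string == '12,500'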
def getFeatureSet ( self , fs , ty=False ):

    """
    get feature index associated with given name in given set

    arguments:
        self -
        fs   - feature set without enclosing brackets
        ty   - False=syntactic, True=semantic

    returns:
        list of EllyBits [ positive , negative ] on success, None on failure
    """

    if len(fs) < 1:
        return None
    bp = ellyBits.EllyBits(FMAX)   # all feature bits zeroed
    bn = ellyBits.EllyBits(FMAX)   #
    fsx = self.sxindx if not ty else self.smindx
#   print '-------- fs=' , fs
    fid = fs[0]                    # feature set ID
    fnm = fs[1:].split(',')        # feature names
    if not fid in fsx:             # known ID?
        fsx[fid] = { }             # if not, make it known
    h = fsx[fid]                   # for hashing of feature names
    if len(fnm) == 0:              # check for empty features
        return [ bp , bn ]
    for nm in fnm:
        nm = nm.strip()
        if len(nm) == 0:
            continue
        if nm[0] == '-':           # negative feature?
            b = bn                 # if so, look at negative bits
            nm = nm[1:]
        elif nm[0] == '+':         # positive feature?
            b = bp                 # if so, look at positive bits
            nm = nm[1:]
        else:
            b = bp                 # positive bits by default
#       print '-------- nm=' , nm
        for c in nm:               # check feature name
            if not ellyChar.isLetterOrDigit(c) and c != '*':
                return None
        if not nm in h:            # new name in feature set?
            k = len(h)             # if so, define it
            l = FMAX               # limit for feature index
            if not ty:             # adjustment for extra predefined
                k -= 3             # syntactic feature names *L and *R
                l -= 1             # and for *UNIQUE
            if k == l:             # overflow check
                print >> sys.stderr, '+* too many feature names'
                return None
            h[nm] = k
        b.set(h[nm])               # set bit for feature
    return [ bp , bn ]
def delimitKey(t):

    """
    get bounds of vocabulary table key for looking up a term
    starting at the front of a given text string
    with special stripping of 'S at the end

    arguments:
        t - text string to scan

    returns:
        count of chars to take for search key
    """

    ln = len(t)            # number of chars in input text
    if ln == 0:
        return 0
    if not ellyChar.isLetterOrDigit(t[0]):
        return 1
#   print ( 'delimitKey t=' , t )
    k = t.find('-')        # find rough range of SQLite key in text
    n = t.find(' ')        # delimited by either a hyphen or a space
    if n < 0: n = ln       # if space, take everything
    if k > 1 and n > k:
        n = k              # hyphen delimits if it comes first
    n -= 1                 # index of last char of candidate key
#   print ( 'k=' , k , 'n=' , n )
    while n > 0:           # scan input text backwards
        c = t[n]           # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
#           print ( 'n=' , n , 'c=' , c )
            if n > 1:      # check for 'S as special case!
                if (c in ['s', 'S'] and
                    ellyChar.isApostrophe(t[n - 1])):
#                   print ( 'drop \'S from SQLite key' )
                    n -= 1
                else:
                    break
            else:
                break
        n -= 1             # continue scanning backwards
#   print ( 'key=' , t[:n+1] )
    return n + 1           # to get key length ending in alphanumeric
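
# A hedged usage sketch, added for illustration, assuming ASCII ellyChar
# semantics with "'" recognized as an apostrophe.
print(delimitKey("dog's tail"))      # expect 3 - 'S stripped from key 'dog'
print(delimitKey('mother-in-law'))   # expect 6 - hyphen delimits key 'mother'
print(delimitKey('$100'))            # expect 1 - single nonalphanumeric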
def stateZip(buffr):

    """
    recognize U.S. state abbreviation and zip code

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    if len(buffr) < 8 or buffr[2] != ' ':
        return 0
    st = ''.join(buffr[:2]).upper()   # expected 2-char state abbreviation
    if not st in ziprs:
        return 0                      # if not known, quit
    zc = ziprs[st]                    # get zip-code start
    b = buffr[3:]                     # expected start of zipcode
    i = 0
    for c in zc:                      # check starting digits of zipcode
        if c != b[i]:
            return 0
        i += 1
    while i < 5:                      # check for digits in rest of zipcode
        if not ellyChar.isDigit(b[i]):
            return 0
        i += 1
    b = b[5:]                         # look for proper termination
    if len(b) == 0:                   # if end of input, success
        return 8                      # success: 5-digit zip
    c = b[0]
    if ellyChar.isLetterOrDigit(c):   # if next char is alphanumeric, failure
        return 0
    elif b[0] == '-':                 # look for possible 9-digit zip
        if len(b) > 5:
            b = b[1:]
            for i in range(4):        # check for 4 more digits
                if not ellyChar.isDigit(b[i]):
                    return 0
            b = b[4:]                 # past end of 4 digits
            if len(b) > 0 and ellyChar.isLetterOrDigit(b[0]):
                return 0              # termination check
            return 8 + 5              # success: 9-digit zip
        else:
            return 8                  # success: 5-digit zip, dash not extended
    else:
        return 8                      # success: 5-digit zip
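
# A hedged usage sketch, added for illustration: assumes ziprs contains 'NY'
# with a zip prefix consistent with '10001'.
print(stateZip(list('NY 10001 area')))     # expect 8  - 5-digit zip
print(stateZip(list('NY 10001-4321 x')))   # expect 13 - 9-digit zip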
def terminate(ss, sp, lss=None):

    """
    check char for termination of match

    arguments:
        ss  - char input stream
        sp  - char position in stream
        lss - stream length, computed from ss when not given

    returns:
        True if terminating char or past end of input, False otherwise
    """

    if lss == None:
        lss = len(ss)
    return True if sp >= lss else not ellyChar.isLetterOrDigit(ss[sp])
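
# A hedged usage sketch, added for illustration, assuming ASCII ellyChar
# semantics.
print(terminate('abc.', 2))   # expect False - 'c' is alphanumeric
print(terminate('abc.', 3))   # expect True  - '.' terminates a match
print(terminate('abc.', 9))   # expect True  - past end of input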
def _terminate ( c ):

    """
    check char for termination of match

    arguments:
        c - char to check

    returns:
        True if termination, False otherwise
    """

    return not ellyChar.isLetterOrDigit(c)
def getConcept ( self , name ):

    """
    get concept, creating if necessary in index under name

    arguments:
        self -
        name - of concept

    returns:
        concept for name other than NOname, otherwise None
    """

    name = name.strip().upper()
    if name == NOname:
        return None
    elif name == TOP:
        return self.index[TOP]
    elif len(name) == 0 or not ellyChar.isLetterOrDigit(name[0]):
        return None
    elif name in self.index:
        return self.index[name]
    for x in name:
        if not ellyChar.isLetterOrDigit(x):
            return None
    c = Concept(name)
    self.index[name] = c
    return c
def __init__ ( self , syms , fets=None , semantic=False ):

    """
    initialization

    arguments:
        self     -
        syms     - symbol table
        fets     - string representation of feature set
        semantic - flag for semantic features

    exceptions:
        FormatFailure on error
    """

    if syms == None:               # special case generating zero feature set
        self.positive = ellyBits.EllyBits(symbolTable.FMAX)
        self.negative = ellyBits.EllyBits(symbolTable.FMAX)
        return

    segm = fets.lower() if fets != None else '[?]'
#   print "features=",segm,"semantic=",semantic
    if segm == None or len(segm) < 3 or segm[0] != '[' or segm[-1] != ']':
        raise ellyException.FormatFailure
    elif segm[1] == ' ' or ellyChar.isLetterOrDigit(segm[1]):
        raise ellyException.FormatFailure
    else:
        self.id = segm[1]
#   print "id=",self.id

    fsindx = syms.sxindx if not semantic else syms.smindx
    if not self.id in fsindx:
#       print 'new feature set'
        d = { }                    # new dictionary of feature names
        if not semantic:
            d['*r'] = 0            # always define '*r' as syntactic feature
            d['*right'] = 0        # equivalent to '*r'
            d['*l'] = 1            # always define '*l'
            d['*left'] = 1         # equivalent to '*l'
        d['*unique'] = LAST        # always define
        fsindx[self.id] = d        # and save

    fs = syms.getFeatureSet(segm[1:-1] , semantic)
    if fs == None:
        raise ellyException.FormatFailure
    self.positive , self.negative = fs
def title ( buffr ):

    """
    recognize double-quoted title in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Tmax: lb = Tmax
    if lb < Tmin:
        return 0
    qm = buffr[0]
    if qm != aDQ and qm != lDQ:
        return 0
    ib = 1
    while ib < lb:               # scan by word up to char limit
        bc = buffr[ib]
        ib += 1
        if bc == rDQ:
            break
        if not ellyChar.isUpperCaseLetter(bc):
            return 0
        while ib < lb:           # scan rest of current word
            bc = buffr[ib]
            ib += 1
            if bc == ' ':
                break
            if qm == aDQ:
                if bc == aDQ:
                    break
            else:
                if bc == rDQ:
                    break
            if bc in [ '!' , '?' ]:
                return 0
        if bc == rDQ or bc == aDQ:
            break
    else:
        return 0                 # must have enclosing rDQ or aDQ
    if ib < Tmin:
        return 0
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]):
        return 0
    return ib
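
# A hedged usage sketch, added for illustration: it assumes aDQ is the
# ASCII double quote and that Tmin and Tmax allow a 13-char match.
print(title(list('"The Old Man" by A. Nonymous')))   # expect 13 - title through closing quote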
def _findAMorPM ( self , ts ):

    """
    look for AM or PM in time expression

    arguments:
        self -
        ts   - char list

    returns:
        length of string match on success, 0 otherwise
    """

    k = 0                          # for match count
    lt = len(ts)                   # maximum match
    if lt < 2:                     # minimum number of chars for any match
        return 0
    elif ts[k] == ' ':             # skip over any leading space
        k += 1
#   print 'find AM or PM in' , ts[k:]
    x = ts[k].lower()
    if x != 'a' and x != 'p':      # first char in AM or PM
        return 0
    k += 1
    if lt == k:                    # end of input check
        return 0
    if ts[k] == '.':               # '.' is optional
        k += 1
    if lt == k:                    # end of input check
        return 0
    y = ts[k].lower()
    if y != 'm':                   # last char in AM or PM
        return 0
    k += 1
    if lt == k or not ellyChar.isLetterOrDigit(ts[k]):   # check for break
        self._xm = x               # save just 'a' or 'p'
        return k                   # return match count for success
    else:
        return 0                   # for match failure
def toIndex ( t ):

    """
    get part of term for vocabulary table indexing that ends
    in alphanumeric or is a single nonalphanumeric

    arguments:
        t - term as string

    returns:
        count of chars to index
    """

    ln = len(t)            # number of chars in term
    if ln == 0:
        return 0
    n = t.find(' ')        # find first part of term
    if n < 0: n = ln       # if indivisible, take everything
    n -= 1                 # find last alphanumeric chars of first part
    while n > 0 and not ellyChar.isLetterOrDigit(t[n]):
        n -= 1
    return n + 1
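
# A hedged usage sketch, added for illustration, assuming ASCII ellyChar
# semantics.
print(toIndex('anti-missile missile'))   # expect 12 - first part indexed
print(toIndex('vis-a-vis,'))             # expect 9  - trailing ',' dropped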
def getRules ( self , a ):

    """
    get appropriate macros for text starting with specified first char

    arguments:
        self -
        a    - first letter of current buffer contents (NOT space!)

    returns:
        a list of macro rules to try out
    """

    if a == '':
        return [ ]
    if ellyChar.isLetterOrDigit(a):
        k = ellyChar.toIndex(a)
        ws = self.letWx if ellyChar.isLetter(a) else self.digWx
        ls = self.index[k] + ws + self.anyWx
    else:
        ls = self.index[0] + self.anyWx
    return ls
def scan(strg):

    """
    check for extent of syntax specification

    arguments:
        strg - string of chars to scan

    returns:
        char count > 0 on finding possible syntax specification, 0 otherwise
    """

    n = 0
    for c in strg:
        if c == '.' or ellyChar.isLetterOrDigit(c):
            n += 1
        else:
            break
    else:
        return n
    c = strg[n]
    if c == ' ':
        return n
    if c != '[':
        return 0
    k = featureSpecification.scan(strg[n:])
    return n + k if k > 0 else 0
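
# A hedged usage sketch, added for illustration: a bare category name needs
# no feature lookup, so featureSpecification.scan is never reached here.
print(scan('np rest'))   # expect 2 - 'np' delimited by space
print(scan('np*'))       # expect 0 - '*' cannot follow a category name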
def terminate(spc, npc):

    """
    check char for termination of match range

    arguments:
        spc - current char in stream
        npc - next char in stream

    returns:
        True if current char terminates, False otherwise
    """

#   print ( "terminate:" , '<' + spc + '>' , '<' + npc + '>' )
    tm = False
    if spc in EMBs:
        if npc in EMBs:
            tm = True
    elif spc in APOs or ellyChar.isLetterOrDigit(spc):
        pass
    else:
        tm = True
#   print ( 'tm=' , tm )
    return tm
def _scanText ( self , k ):

    """
    try to match in buffer regardless of word boundaries
    using Elly vocabulary and pattern tables and also
    running Elly entity extractors

    arguments:
        self -
        k    - length of first component in buffer

    returns:
        vocabulary table record
    """

    sb = self.sbu.buffer           # input buffer
    tr = self.ptr                  # parse tree for results

    rs = drs                       # initialize to empty vocabulary table record
    rs.mtchl = 0                   # maximum match count
    lm = len(sb)                   # scan limit
#*  print 'next component=' , sb[:k] , ', context=' , sb[k:lm]

    vrs = drs                      # initially, set no maximum match
    if self.vtb != None:           # look in external dictionary first, if it exists
        if k > 1:                  # is first component a single char?
            ks = k                 # if not, use this for indexing
        else:
            ks = 1                 # otherwise, add on any following alphanumeric
            while ks < lm:         #
                if not ellyChar.isLetterOrDigit(sb[ks]):
                    break
                ks += 1
        ss = u''.join(sb[:ks])     # where to start for indexing
        n = vocabularyTable.toIndex(ss)   # get actual indexing
        vs = self.vtb.lookUp(sb,n)        # get list of the longest matches
        if len(vs) > 0:            #
            r = vs[0][1]           # if any matches, look at first
            m = r.mtchl            # all other nominal lengths must be the same!
#*          print len(vs) , 'matching vocabulary entries'
            for v in vs:
                ve = v[0]          # get vocabulary entry
                vrs = v[1]         # result record for match
#               print 've=' , ve
#               if ve.gen != None: print ve.gen
                if tr.addLiteralPhraseWithSemantics(
                        ve.cat,ve.syf,ve.smf,ve.bia,ve.gen):
                    tr.lastph.lens = m       # set char length of leaf phrase node
                                             # just added for later selection
                    tr.lastph.cncp = ve.con
                    rs.mtchl = m             # update maximum for new matches
#*          print 'vocabulary m=' , rs.mtchl

    d = self.rul                   # grammar rule definitions

    m = d.ptb.match(sb,tr)         # try entity by pattern match next
#*  print 'pattern m=' , m
    if rs.mtchl < m:
        rs.mtchl = m               # on longer match, update maximum

    m = self.iex.run(sb)           # try entity extractors next
#*  print 'extractor m=' , m
    if rs.mtchl < m:
        rs.mtchl = m               # on longer match, update maximum

#*  print 'maximum match=' , rs.mtchl
#   print 'input=' , self.sbu.buffer

    if rs.mtchl > 0:               # any matches at all?
        nd = tr.requeue()          # if so, keep only longest of them
#       print nd , 'phrases dropped by requeue()'
        if vrs.mtchl == rs.mtchl:  # this a vocabulary match?
            rs = vrs               # if so, use vocabulary match results

    return rs
def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn - pattern to matched
        text - what to match against
        offs - start text offset for matching
        limt - limit of matching

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind  - 0=optional 1=* wildcard
            count - how many backups allowed
            pats  - saved pattern index for backup
            txts  - saved input text index
            bnds  - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self -
                kind - of winding
            """
            self.kind = kind
            self.count = 0
            self.pats = 0
            self.txts = 0
            self.bnds = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind) +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats) +
                     ',tx=' + unicode(self.txts) +
                     ',bd=' + unicode(self.bnds) + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # three private functions using local variables of match()
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        arguments:
            ns - optional initial span of text for binding

        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1      # by default, binding is to 1 char
        if mbi == len(mbd):        # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]              # next available record
        bf[0] = os                 # set binding to range of chars
        bf[1] = os + ns            #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match

        arguments:
            kind - 0=optional 1=* wildcard

        returns:
            unwinding frame
        """
        if unj == len(unw):        # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]              # next available
        uf.kind = kind
        uf.count = 1
        uf.pats = mp
        uf.txts = offs
        uf.bnds = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match

        arguments:
            typw - wildcard

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])    # calculate min char count to match rest of pattern
#       print "exclude=",k,"@",offs

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findBreak(text,offs) - k   # max span reduced by exclusion
        if mx < 0:
            return -1              # cannot match if max span < 0
        tfn = Matching[typw]       # char type matching a wildcard
#       print "text at",offs,"maximum wildcard match=",mx
        nm = 0
        for i in range(mx):
            c = text[offs+i]       # next char in text from offset
            if not tfn(c): break   # stop when it fails to match
            nm += 1
#       print "maximum wildcard span=",nm
        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False                # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0                         # pattern index
    ml = len(patn)                 # pattern match limit

#   print text[offs:limt],":",list(patn)

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            else:
                last = text[offs].lower()
                offs += 1
#           print 'matching last=' , last , 'at' , offs
            if patn[mp] != last:
                break
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat',mp,"<",ml
#       print "txt @",offs

        if mp >= ml:               # past end of pattern?
            matched = True         # if so, match is made
            break

        tc = patn[mp]              # otherwise, get unmatched pattern element
        mp += 1                    #
#       print "tc=",ord(tc)

        if tc == cALL:             # a * wildcard?
#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1    # get new binding record
            bf[0] = offs                # bind from current offset
            offs += nm                  # move offset past end of span
            bf[1] = offs                # bind to new offset
#           print "offs=",offs
            uf = _mark(1); unj += 1     # get new unwinding record
            uf.count = nm               # can back up this many times on mismatch
            continue

        elif tc == cEND:           # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY:           # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG:           # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:           # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:           # space wildcard?
#           print "SPC:"
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:           # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1    # dummy record to block
            mf[0] = -1                 # later binding consolidation
            if last != '': offs -= 1   # try for rematch
            m = mp                     # find corresponding EOS
            while m < ml:              #
                if patn[m] == cEOS:
                    break
                m += 1
            else:                      # no EOS?
                m -= 1                 # if so, pretend there is one anyway
            uf = _mark(0); unj += 1    # for unwinding on any later match failure
            uf.pats = m + 1            # i.e. one char past next EOS
            uf.txts = offs             # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '': offs -= 1   # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
            if last != '':             # still more to match?
                offs -= 1
                nm = _span(tc)         # maximum match possible
#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs       # bind from current offset
                    offs += nm         # move offset past end of span
                    bf[1] = offs       # bind to new offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1  # at least one char must be matched
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True         # successful match
                break

        ## match failure: rewind to last match branch

#       print "fail - unwinding",unj

        while unj > 0:             # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]        # get most recent unwinding record
#           print uf
            if uf.count <= 0:      # if available count is used up,
                unj -= 1           # go to next unwinding record
                continue
            uf.count -= 1          # decrement available count
            uf.txts -= uf.kind     # back up one char for scanning text input
            mp = uf.pats           # unwind pattern pointer
            offs = uf.txts         # unwind text input
            mbi = uf.bnds          # restore binding
            mbd[mbi-1][1] -= uf.kind   # reduce span of binding if for wildcard
            break
        else:
#           print "no unwinding"
            break                  # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched:
        return None                # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    mbdo = mbd
    lb = -1                        # binding reference
    lbd = [ 0 , -1 ]               # sentinel value, not real binding
    mbd = [ lbd ]                  # initialize with new offset after matching
    mbdo.pop(0)                    # ignore empty binding
    while len(mbdo) > 0:           #
        bd = mbdo.pop(0)           # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:            # check for optional match indicator here
            lb = -1                # if so, drop from new consolidated bindings
        elif lb == bd[0]:          # check for binding continuous with previous
            lb = bd[1]             #
            lbd[1] = lb            # if so, combine with previous binding
        else:                      #
            mbd.append(bd)         # otherwise, add new binding
            lbd = bd               #
            lb = bd[1]             #

    mbd[0] = offs                  # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    return mbd                     # consolidated bindings plus new offset
def _scanText(self, k):

    """
    try to match in buffer regardless of word boundaries
    using Elly vocabulary and pattern tables and also
    running Elly entity extractors

    arguments:
        self -
        k    - length of first component in buffer

    returns:
        match parameters [ text span of match , match types ,
                           vocabulary match chars , suffix removed ]
    """

#   print ( '_scanText k=' , k )
    sb = self.sbu.buffer           # input buffer

    # match status
    nspan = 0                      # total span of match
    mtype = ''                     # no match type yet
    vmchs = []                     # chars of vocabulary entry matched
    suffx = ''                     # any suffix removed in match

    lm = len(sb)                   # scan limit
#   print ( 'next component=' , sb[:k] , ', context=' , sb[k:lm] )

    if self.vtb != None:           # look in external dictionary first, if it exists
        if k > 1:                  # is first component a single char?
            ks = k                 # if not, use this for indexing
        else:
            ks = 1                 # otherwise, add on any following alphanumeric
            while ks < lm:         #
                if not ellyChar.isLetterOrDigit(sb[ks]):
                    break
                ks += 1
        ss = ''.join(sb[:ks])      # where to start for indexing
#       print ( 'ss=' , ss )
        n = vocabularyTable.delimitKey(ss)   # get actual indexing
#       print ( 'n=' , n )
        rl = self.vtb.lookUp(sb, n)          # get list of the longest matches
        if len(rl) > 0:            #
#           print ( 'len(rl)=' , len(rl) )
            r0 = rl[0]             # look at first record
            nspan = r0.nspan       # should be same for all matches
            mtype = 'Vt'
            vmchs = r0.vem.chs     #
            suffx = r0.suffx       #
#           print ( 'vocabulary m=' , nspan )

    d = self.rul                   # grammar rule definitions

    m = d.ptb.match(sb, self.ptr)  # try entity by pattern match next
#   print ( 'pattern m=' , m )
    if nspan < m:
        nspan = m                  # on longer match, update maximum
        mtype = 'Fa'
    elif m > 0 and nspan == m:
        mtype = 'VtFa'
#   print ( 'mtype=' , mtype )

    m = self.iex.run(sb)           # try entity extractors next
#   print ( 'extractor m=' , m )
    if nspan < m:
        nspan = m                  # on longer match, update maximum
        mtype = 'Ee'
    elif m > 0 and nspan == m:
        mtype += 'Ee'              # unchanged match length, add type

#   print ( 'maximum match=' , nspan )
#   print ( 'mtype=' , mtype )
#   print ( 'input=' , self.sbu.buffer[:nspan] )

    return [nspan, mtype, vmchs, suffx]
def getNext(self):

    """
    extract next sentence for Elly translation from input stream

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

#   print ( 'getNext' )

    self.resetBracketing()
    inBrkt = False

    nspc = 0                       # set space count

    sent = []                      # list buffer to fill

    x = self.inp.read()
    if x == SP:
        x = self.inp.read()

    if x == END:                   # EOF check
        return None

    c = END                        # reset
    lc = END

#   print ( 'x=' , '<' + x + '>' , ord(x) )
    self.inp.unread(x, SP)         # put first char back to restore input
#   print ( '0 <<' , self.inp.buf )

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0                        # alphanumeric count in sentence

    while True:

        x = self.inp.read()        # next input char

        if x == END:               # handle any EOF
            break

#       print ( 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>' )
#       print ( 'sent=' , sent , 'nspc=' , nspc )

        # check for table delimiters in text

        if len(sent) == 0:
#           print ( 'table' )
#           print ( '1 <<' , self.inp.buf )

            if x == '.' or x == '-':         # look for multiple '.' or '-'
                while True:                  # scan up to end of current buffering
                    y = self.inp.read()      #
                    if y != x and y != SP:   # no more delimiter chars or spaces?
                        self.inp.unread(y)   # if so, done
                        break
                continue                     # ignore everything seen so far

        ####################################################
        # accumulate chars and count alphanumeric and spaces
        ####################################################

        lc = c
        c = x
        nc = self.inp.peek()
        if ellyChar.isWhiteSpace(nc): nc = SP

#       print ( 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>' )

        if lc == SP or lc == END:  # normalize chars for proper bracketing
            if x == SQuo:          #
                x = LSQm           # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:        #
                x = LDQm           # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END:  #
            if x == SQuo:          # a SQuo followed by a space becomes RSQm
                x = RSQm           #
            elif x == DQuo:        # a DQuo followed by a space becomes RDQm
                x = RDQm           #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:          # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm           #
            elif x == DQuo:        # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm           #
        elif ellyChar.isWhiteSpace(c) and inBrkt:
            nspc += 1

        svBrkt = inBrkt
        inBrkt = self.checkBracketing(x)   # do bracketing check with modified chars
        if svBrkt and not inBrkt: nspc = 0

#       print ( 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt )

        sent.append(c)             # put original char into sentence buffer
        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue               # if alphanumeric, just add to sentence

        if c == SP:
            continue               # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()             # remove from sentence chars
            break

        # certain Unicode punctuation will always break

        if c in Hards:
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        cx = self.inp.preview()    # for context of match call

#       print ( '0 <<' , self.inp.buf )
#       print ( 'sent=' , sent[:-1] )
#       print ( 'punc=' , '<' + c + '>' )
#       print ( 'next=' , cx )

        if c in Stops and len(cx) > 0 and cx[0] == SP:
            if self.stpx.match(sent[:-1], c, cx):
#               print ( 'stop exception MATCH' )
                if self.drop:
                    sent.pop()     # remove punctuation char from sentence
                    lc = SP
                continue

#       print ( 'no stop exception MATCH for' , c )

#       print ( '@1 <<' , self.inp.buf )

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c, self.inp)
#       print ( '@2 <<' , self.inp.buf )

        # check for dash

        if c == '-':
            d = self.inp.read()
            if d == '-':
#               print ( 'dash' )
                while True:
                    d = self.inp.read()
                    if d != '-': break
                sent.append(c)
            self.inp.unread(d)
            continue

        # check for sentence break on punctuation

#       print ( '@3 c=' , c , inBrkt )

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

#           print ( 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent) )

            if not inBrkt:
#               print ( sent , 'so far' )
                z = self.inp.read()
                if self.shortBracketing(sent, z):
                    break
                self.inp.unread(z)
#               print ( 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']' )
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1:
                        break
            elif c in QUOs and lc in Stops:
#               print ( 'stop+quote' )
                z = self.inp.read()
                if z in RBs:
                    sent.append(z)
                    y = self.inp.read()
                    if y in Stops:
                        sent.append(y)
                    elif not ellyChar.isWhiteSpace(y):
                        self.inp.unread(y)
                    inBrkt = False
                    break
                elif z in QUOs:
#                   print ( 'stop+quote+quote' )
                    sent.append(z)
                    inBrkt = False
                    break
                self.inp.unread(z)
#           print ( 'continue' )
            continue

        elif not c in Stops:
            continue

        else:
#           print ( 'check stopping!' )
            d = self.inp.read()
#           print ( '@3 <<' , self.inp.buf )
            if d == None: d = '!'
#           print ( 'stop=' , '<' + c + '> <' + d + '>' )

#           print ( 'ellipsis check' )
            if c == '.' and c == d:
                if self.inp.peek() != c:        # look for third '.' in ellipsis
                    self.inp.unread(d)          # if none, keep only first '.'
                else:
                    self.inp.skip()             # found ellipsis
                    sent.append(d)              # complete it in sentence buffer
                    sent.append(d)              #
                    x = self.inp.peek()         # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)         # if part of token, put in space as separator
                continue

            if c == ELLP:
#               print ( 'found Unicode ellipsis, d=' , d )
                if ellyChar.isUpperCaseLetter(d):
                    self.inp.unread(d)          # super special case of bad punctuation
                    self.inp.unread(' ')        # put in implied period and space
                    self.inp.unread('.')        #

            # special check for multiple stops

#           print ( 'next char d=' , d , ord(d) if d != END else 'NONE' )

            if d in Stops:
                while True:
                    d = self.inp.read()
                    if not d in Stops:
                        break
                self.inp.unread(d)
                if not ellyChar.isWhiteSpace(d):
                    d = SP                      # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent, d):
                    break
                if d in self._cl and self._cl[d] == 1:
                    dn = self.inp.peek()
                    if ellyChar.isWhiteSpace(dn):
                        sent.append(d)
                        break
                self.inp.unread(d)
#               print ( 'no space after punc' )
                continue

            # if no match for lookahead, put back

            elif d != END:
#               print ( 'unread d=' , d )
                self.inp.unread(d)

#           print ( 'possible stop' )

            # check special case of number ending in decimal point

            if c == '.':
                ixb = len(sent) - 2
                ixn = ixb + 1
                cxn = ''
#               print ( 'sent=' , sent )
#               print ( 'ixn=' , ixn )
                while ixn > 0:
                    ixn -= 1
                    cxn = sent[ixn]
#                   print ( 'cxn=' , cxn )
                    if not ellyChar.isDigit(cxn):
                        break
#               print ( 'break: ixn=' , ixn , 'ixb=' , ixb )
                if ixn < ixb and cxn in [' ', '-', '+']:
                    prvw = self.inp.preview()
#                   print ( 'prvw=' , prvw )
                    if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                        continue

            # final check: is sentence long enough?

            if inBrkt:
#               print ( 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview() )
#               print ( 'nspc=' , nspc )
                if c in [':', ';'] or nspc < 3:
                    sent.append(d)
#                   print ( 'add' , '<' + d + '> to sentence' )
#                   print ( 'sent=' , sent )
                    self.inp.skip()
                    nspc -= 1
                    continue

#           print ( '@4 <<' , self.inp.buf )
            cx = self.inp.peek()
            if cx == None: cx = '!!'
#           print ( 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent )
#           print ( 'nAN=' , nAN , 'inBrkt=' , inBrkt )
            if nAN > 1:
                break

    if sent == ['\u2026']:         # special case of sentence
        return list("-.-")         # with lone ellipsis
    elif len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
def __init__ ( self , syms , spec ):

    """
    initialization from input string and symbol table

    arguments:
        self -
        syms - current symbol table
        spec - input string

    exceptions:
        FormatFailure on error
    """

    self.catg = -1               # values to set on an error
    self.synf = None             #

#   print >> sys.stderr , 'specification=' , spec
    if spec == None:
        print >> sys.stderr , '** null syntax specification'
        raise ellyException.FormatFailure

    s = spec.lower()             # must be lower case for all lookups
    n = 0
    for c in s:
        if not ellyChar.isLetterOrDigit(c) and c != '.':
            break
        n += 1
    if n == 0:
        print >> sys.stderr , '** no syntactic category'
        raise ellyException.FormatFailure

    typs = s[:n]                 # save category name
#   print >> sys.stderr , 'catg=' , self.catg
    catg = syms.getSyntaxTypeIndexNumber(typs)
    if catg == None:
        raise ellyException.FormatFailure

    s = s[n:].strip()            # feature part of syntax
    if len(s) == 0:              # check if there are any features
        synf = featureSpecification.FeatureSpecification(syms,None)
        if typs == '...':
            synf.id = '...'
    elif typs == '...':          # ... category may have no features!
        raise ellyException.FormatFailure
    else:                        # decode features
#       print >> sys.stderr , 'syms=' , syms , 's=' , s
        if len(s) < 4:
            print >> sys.stderr , '** bad syntactic type or missing features= ' , typs+s
            raise ellyException.FormatFailure
        if typs in catid and catid[typs] != s[1]:
            print >> sys.stderr , '** type' , typs.upper() , 'has two feature IDs:' , catid[typs] , s[1]
            raise ellyException.FormatFailure
        catid[typs] = s[1]
        synf = featureSpecification.FeatureSpecification(syms,s)
        # FormatFailure exception may be raised above, but will not be caught here

#   print >> sys.stderr , 'success'
    self.catg = catg
    self.synf = synf
def match ( self , txt , pnc , nxt ):

    """
    compare a punctuation mark and its context with a pattern

    arguments:
        self -
        txt  - list of text chars up to and including punctuation char
        pnc  - punctuation char
        nxt  - single char after punctuation

    returns:
        True on match, False otherwise
    """

#   print 'matching for txt=' , txt , 'pnc=' , pnc , 'nxt=' , nxt
#   print 'lstg=' , self.lstg

    if not pnc in self.lstg:           # get stored patterns for punctuation
        return False

    lp = self.lstg[pnc]
#   print len(lp) , 'patterns'

    txl = txt[-self.maxl:] if len(txt) > self.maxl else txt

    txs = map(lambda x: x.lower(),txl)   # actual left context for matching

    lt = len(txs)                      # its length

#   print 'txs= ' + unicode(txs) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']'

    for p in lp:                       # try matching each pattern

        if p.left != None:

            n = len(p.left)            # assume each pattern element must match one sequence char
#           print 'n=' , n , 'p=' , unicode(p)
            if n > lt:
                continue               # fail immediately because of impossibility of match
            t = txs if n == lt else txs[-n:]
#           print 'left pat=' , '[' + ellyWildcard.deconvert(p.left) + ']'
#           print 'versus t=' , t
            if not ellyWildcard.match(p.left,t,0):
#               print 'no left match'
                continue
            if n < lt and ellyChar.isLetterOrDigit(t[0]):
                if ellyChar.isLetterOrDigit(txs[-n-1]):
                    continue           # fail because of no break in text

#       nc = '\\n' if nxt == '\n' else nxt
#       print 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']'
#       print 'versus c=' , nc

        if p.right == []:
            return True

        pcx = p.right[0]
        if pcx == nxt:                 # check for specific char after possible stop
#           print 'right=' , nxt
            return True
        if pcx == ellyWildcard.cCAN:   # check for nonalphanumeric
            if nxt == u'' or not ellyChar.isLetterOrDigit(nxt):
#               print 'right nonalphanumeric=' , nxt
                return True
        if pcx == ellyWildcard.cSPC:   # check for white space
#           print 'looking for space'
            if nxt == u'' or nxt == u' ' or nxt == u'\n':
#               print 'right space'
                return True
#       print 'last check'
        if p.right == u'.':            # check for any punctuation
            if not ellyChar.isLetterOrDigit(nxt) and not ellyChar.isWhiteSpace(nxt):
#               print 'right punc=' , nxt
                return True

    return False
def _store ( self , defs , nowarn ):

    """
    put macro substitutions into table with indexing by first char of pattern

    arguments:
        self   -
        defs   - list of macro definition as strings
        nowarn - whether to turn warnings off

    exceptions:
        TableFailure on error
    """

    while True:

        l = defs.readline()                # next macro rule
#       print "rule input=" , l
        if len(l) == 0: break              # EOF check

        dl = definitionLine.DefinitionLine(l,False)

        left = dl.left                     # pattern to be matched
        tail = dl.tail                     # transformation to apply to match

        if left == None or tail == None:
            self._err(l=l)
            continue

        mp = ellyWildcard.convert(left)
        if mp == None:
            self._err('bad wildcards',l)
            continue

        pe = mp[-1]
        if pe != ellyWildcard.cALL and pe != ellyWildcard.cEND:
            mp += ellyWildcard.cEND        # pattern must end in $ if it does not end in *

        if not _checkBindings(mp,tail):
            self._err('bad bindings in substitution',l)
            continue

        if not nowarn and not _checkExpansion(mp,tail):
            self._err('substitution longer than original string',l,0)

        r = [ mp , tail ]
#       print "rule =" , [ left , tail ]

        pat = r[0]                         # get coded pattern
        if pat == None:
            self._err('no pattern',l)
            continue

        c = pat[0]                         # first char of pattern

        # check type to see how to index rule

#       print 'c=' , ord(c)
        p = pat
        while c == ellyWildcard.cSOS:          # optional sequence?
            k = p.find(ellyWildcard.cEOS)      # if so, find the end of sequence
            if k < 0 or k == 1: break          # if no end or empty sequence, stop
            k += 1
            if k == len(pat): break            # should be something after sequence
            m = ellyChar.toIndex(pat[1])       # index by first char of optional sequence
            self.index[m].append(r)            # (must be non-wildcard)
            p = p[k:]                          # move up in pattern
            c = p[0]                           # but check for another optional sequence

        if c == ellyWildcard.cSOS:
            self._err(l=l)
            continue                           # bad sequence, skip this rule

#       print 'c=' , ord(c)
        if ellyChar.isLetterOrDigit(c):        # check effective first char of pattern
            m = ellyChar.toIndex(c)
            self.index[m].append(r)            # add to index under alphanumeric char
        elif ellyChar.isText(c):
            self.index[0].append(r)            # add to index under punctuation
        elif not c in ellyWildcard.Matching:
            if c == ellyWildcard.cEND:
                print >> sys.stderr , '** macro warning: pattern can have empty match'
                print >> sys.stderr , '*  at [' , l , ']'
            else:
                dc = '=' + str(ord(c) - ellyWildcard.X)
                self._err('bad wildcard code' , dc)
                continue
        elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
            self.anyWx.append(r)               # under general wildcards
        elif c == ellyWildcard.cCAN:
            self.index[0].append(r)            # under punctuation
        elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
            self.digWx.append(r)               # under digit wildcards
        elif c == ellyWildcard.cSAN:
            self.digWx.append(r)               # under both digit and
            self.letWx.append(r)               # letter wildcards
        elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
            self._err('bad wildcard in context',l)
            continue                           # wildcards unacceptable here
        else:
            self.letWx.append(r)               # everything else under letter wildcard

        self.count += 1                        # count up macro substitution

    if self._errcount > 0:
        print >> sys.stderr , '**' , self._errcount , 'macro errors in all'
        print >> sys.stderr , 'macro table definition FAILed'
        raise ellyException.TableFailure
def _getRaw(self):

    """
    obtain next raw token from buffer

    arguments:
        self

    returns:
        EllyToken on success, None otherwise
    """

#   print ( '_getRaw() from' , len(self.buffer) , 'chars' )
#   print ( 'before skipping spaces, buffer=' , self.buffer )
    self.skipSpaces()
    ln = len(self.buffer)
#   print ( "after skip=",ln )
    if ln == 0:
        return None

    ## get length of next token and if it has
    ## initial - or +, check for word fragment

#   print ( 'buffer start=' , self.buffer[0] )

    k = 0                          # number of chars for next token
    cz = ' ' if ln == 0 else self.buffer[0]
    if cz in [MIN, PLS]:
        k = self.findSeparator(1)
    elif cz == APO:
        if ln > 2 and self.buffer[1].lower() == 's' and self.buffer[2] in separators:
            k = 2
        else:
            k = 1
    elif cz in [COM, DOT, UELP]:   # these can be tokens by themselves
        k = 1
    else:
#       print ( 'full token extraction' )
        k = self.findSeparator()
#       print ( 'k=' , k , 'ln=' , ln )
        if k < 0:                  # break multi-char token at next separator
            k = ln                 # if no separator, go up to end of buffer
        elif k == 0:
            k = 1                  # immediate break in scanning
        else:
            while k < ln:          # look at any separator and following context
                x = self.buffer[k]
                if x != MIN and x != COM:
                    break          # no further check if separator not hyphen or comma
                if k + 1 >= ln or not ellyChar.isDigit(self.buffer[k + 1]):
#                   print ( 'x=' , x , 'buf=' , self.buffer[k:] )
                    break          # accept hyphen or comma if NOT followed by digit
                else:              # otherwise, look for another separator
                    k = self.findSeparator(k + 2)
                    if k < 0:      #
                        k = ln

    ## if token not delimited, take rest of buffer as
    ## will fit into token working area

    if k < 0: k = ln

#   print ( "take",k,"chars from",len(self.buffer),self.buffer )

    buf = self.extract(k)          # get k characters

    ## special check for hyphen next in buffer after extraction

    if self.match(MIN):            # hyphen immediately following?
        self.skip()                # if so, take it
        if self.atSpace():         # when followed by space
            buf.append(MIN)        # append hyphen to candidate token
            k += 1
        else:
            if not self.match(MIN):            # when not followed by another hyphen
                self.prepend(ellyChar.SPC)     # put back a space
            else:
                self.skip()                    # double hyphen = dash
                self.prepend(ellyChar.SPC)     # put back space after dash
                self.prepend(MIN)              # put back second hyphen
            self.prepend(MIN)                  # put back first
            self.prepend(ellyChar.SPC)         # put extra space before hyphen or dash

    ## fill preallocated token for current position from working area

#   print ( "raw text buf=" , buf )
    to = ellyToken.EllyToken(''.join(buf))
#   print ( "EllyBuffer token before=" , str(to) )

    ## strip off trailing non-token chars from token and put back in buffer

    km = k - 1
    while km > 0:
        x = buf[km]
        if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
            break
#       print ( 'trailing x=' , x )
        if x == APO or x == APX:
            if km > 0 and buf[km - 1] == 's':
                break
        self.prepend(x)
        km -= 1
    km += 1
    if km < k:
        to.shortenBy(k - km, both=True)

#   print ( "EllyBuffer token=" , strx(to) )
#   print ( "next in buffer=" , self.buffer )
    return to
def match ( patn , text , offs=0 , limt=None , nsps=0 ): """ compare a pattern with wildcards to input text arguments: patn - pattern to matched text - what to match against offs - start text offset for matching limt - limit for any matching nsps - number of spaces to match in pattern returns: bindings if match is successful, None otherwise """ class Unwinding(object): """ to hold unwinding information for macro pattern backup and rematch attributes: kind - 0=optional 1=* wildcard count - how many backups allowed pats - saved pattern index for backup txts - saved input text index bnds - saved binding index nsps - saved count of spaces matched """ def __init__ ( self , kind ): """ initialize to defaults arguments: self - kind - of winding """ self.kind = kind self.count = 0 self.pats = 0 self.txts = 0 self.bnds = 1 self.nsps = 0 def __str__ ( self ): """ show unwinding contents for debugging arguments: self returns: attributes as array """ return ( '[kd=' + str(self.kind) + ',ct=' + str(self.count) + ',pa=' + str(self.pats) + ',tx=' + str(self.txts) + ',bd=' + str(self.bnds) + ',ns=' + str(self.nsps) + ']' ) #### local variables for match( ) #### mbd = [ 0 ] # stack for pattern match bindings (first usable index = 1) mbi = 1 # current binding index unw = [ ] # stack for unwinding on match failure unj = 0 # current unwinding index ## # four private functions using local variables of match() defined just above # def _bind ( ns=None ): """ get next available wildcard binding frame arguments: ns - optional initial span of text for binding returns: binding frame """ # print ( "binding:",offs,ns ) os = offs - 1 if ns == None: ns = 1 # by default, binding is to 1 char if mbi == len(mbd): # check if at end of available frames mbd.append([ 0 , 0 ]) bf = mbd[mbi] # next available record bf[0] = os # set binding to range of chars bf[1] = os + ns # return bf def _modify ( ): """ set special tag for binding arguments: """ mbd[mbi].append(None) def _mark ( kind , nsp ): """ set up for backing up pattern match arguments: kind - 0=optional 1=* wildcard nsp - number of spaces in pattern still unmatched returns: unwinding frame """ if unj == len(unw): # check if at end of available frames unw.append(Unwinding(kind)) uf = unw[unj] # next available uf.kind = kind uf.count = 1 uf.pats = mp uf.txts = offs uf.bnds = mbi uf.nsps = nsp return uf def _span ( typw , nsp=0 ): """ count chars available for wildcard match arguments: typw - wildcard nsp - spaces to be matched in pattern returns: non-negative count if any match possible, otherwise -1 """ # print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) ) # print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp ) # print ( "text to span:",text[offs:] ) # print ( "pat rest=" , patn[mp:] ) k = minMatch(patn[mp:]) # calculate min char count to match rest of pattern # print ( "exclude=",k,"chars from possible span for rest of pattern" ) # calculate maximum chars a wildcard can match mx = ellyChar.findExtendedBreak(text,offs,nsp) # print ( mx,"chars available to scan" ) mx -= k # max span reduced by exclusion if mx < 0: return -1 # cannot match if max span < 0 tfn = Matching[typw] # matchup function for wildcard type # print ( "text at",offs,"maximum wildcard match=",mx ) nm = 0 for i in range(mx): c = text[offs+i] # next char in text from offset # print ( 'span c=' , c ) if not tfn(c): break # stop when it fails to match nm += 1 # print ( "maximum wildcard span=",nm ) return nm # # end of private functions ## ############################# #### main matching 
loop #### ############################# matched = False # successful pattern match yet? if limt == None: limt = len(text) # print ( 'starting match, limt=',limt,text[offs:limt],":",patn ) # print ( 'nsps=' , nsps ) mp = 0 # pattern index ml = len(patn) # pattern match limit last = '' while True: ## literally match as many next chars as possible # print ( '---- loop mp=' , mp , 'ml=' , ml ) while mp < ml: if offs >= limt: # print ( "offs=",offs,"limt=",limt ) last = '' elif limt == 0: break else: last = text[offs] offs += 1 # print ( 'patn=' , patn ) mc = patn[mp] # print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs ) # print ( 'against ' , mc , '(' , '{:04x}'.format(ord(mc)) , ')' ) if mc != last: if mc != last.lower(): if mc == Hyphn and last == ' ' and limt - offs > 2: # print ( 'hyphen special matching, limt=', limt , 'offs=' , offs ) # print ( 'text[offs:]=' , text[offs:] ) if text[offs] != Hyphn or text[offs+1] != ' ': break offs += 2 else: # print ( 'no special matching of hyphen' ) break # print ( 'matched @mp=' , mp ) mp += 1 ## check whether mismatch is due to special pattern char # print ( 'pat @',mp,"<",ml ) # print ( "txt @",offs,'<',limt,'last=',last ) # print ( '@',offs,text[offs:] ) if mp >= ml: # past end of pattern? matched = True # if so, match is made break tc = patn[mp] # otherwise, get unmatched pattern element mp += 1 # # print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) ) if tc == cALL: # a * wildcard? # print ( "ALL last=< " + last + " >" ) if last != '': offs -= 1 nm = _span(cALL,nsps) ## save info for restart of matching on subsequent failure bf = _bind(nm); mbi += 1 # get new binding record bf[0] = offs # bind from current offset offs += nm # move offset past end of span bf[1] = offs # bind to new offset # print ( "offs=",offs,'nm=',nm ) uf = _mark(1,nsps); unj += 1 # get new unwinding record uf.count = nm # can back up this many times on mismatch continue elif tc == cEND: # end specification # print ( "END $:",last ) if last == '': continue elif last == '-': offs -= 1 continue elif last in [ '.' , ',' ]: if offs == limt: offs -= 1 continue txc = text[offs] if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk: offs -= 1 continue elif last in ellyBuffer.separators: offs -= 1 continue elif last in [ '?' , '!' , ellyChar.HYPH ]: offs -= 1 continue elif not ellyChar.isText(last): offs -= 1 continue elif tc == cANY: # alphanumeric wildcard? # print ( "ANY:",last,offs ) if last != '' and ellyChar.isLetterOrDigit(last): _bind(); mbi += 1 continue elif tc == cCAN: # nonalphanumeric wildcard? # print ( 'at cCAN' ) if last != ellyChar.AMP: if last == '' or not ellyChar.isLetterOrDigit(last): _bind(); mbi += 1 continue elif tc == cDIG: # digit wildcard? if last != '' and ellyChar.isDigit(last): _bind(); mbi += 1 continue elif tc == cALF: # letter wildcard? # print ( "ALF:",last,offs ) if last != '' and ellyChar.isLetter(last): _bind(); mbi += 1 continue elif tc == cUPR: # uppercase letter wildcard? # print ( "UPR:",last,'@',offs ) if last != '' and ellyChar.isUpperCaseLetter(last): _bind(); mbi += 1 continue elif tc == cLWR: # lowercase letter wildcard? # print ( "LWR:",last,'@',offs ) if last != '' and ellyChar.isLowerCaseLetter(last): _bind(); mbi += 1 continue elif tc == cSPC: # space wildcard? # print ( "SPC:","["+last+"]" ) if last != '' and ellyChar.isWhiteSpace(last): nsps -= 1 _bind(); _modify(); mbi += 1 continue # print ( 'NO space' ) elif tc == cAPO: # apostrophe wildcard? 
# print ( "APO: last=" , last ) if ellyChar.isApostrophe(last): _bind(); _modify(); mbi += 1 continue elif tc == cSOS: # print ( "SOS" ) # print ( last,'@',offs ) mf = _bind(0); mbi += 1 # dummy record to block mf[0] = -1 # later binding consolidation if last != '': offs -= 1 # try for rematch m = mp # find corresponding EOS while m < ml: # if patn[m] == cEOS: break m += 1 else: # no EOS? m -= 1 # if so, pretend there is one anyway uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure uf.pats = m + 1 # i.e. one char past next EOS uf.txts = offs # start of text before optional match continue elif tc == cEOS: # print ( "EOS" ) if last != '': offs -= 1 # back up for rematch continue elif tc == cSAN or tc == cSDG or tc == cSAL: # print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' ) if last != '': # still more to match? offs -= 1 # print ( 'nsps=' , nsps ) # print ( '@' , offs , text ) nm = _span(tc,nsps) # maximum match possible # print ( 'spanning=' , nm ) if nm == 0: # compensate for findExtendedBreak peculiarity if offs + 1 < limt and mp < ml: # with closing ] or ) to be matched in pattern if patn[mp] in Enc: # from text input nm += 1 # print ( 'spanning=' , nm ) if nm >= 1: bf = _bind(nm); mbi += 1 bf[0] = offs # bind from current offset offs += nm # move offset past end of span bf[1] = offs # bind to new offset uf = _mark(1,nsps); unj += 1 uf.count = nm - 1 # at least one char must be matched # print ( 'offs=' , offs ) last = text[offs] if offs < limt else '' continue # print ( 'fail tc=' , deconvert(tc) ) elif tc == '': if last == '' or not ellyChar.isPureCombining(last): matched = True # successful match break ## match failure: rewind to last match branch ## # print ( "fail - unwinding" , unj ) while unj > 0: # try unwinding, if possible # print ( "unw:",unj ) uf = unw[unj-1] # get most recent unwinding record # print ( uf ) if uf.count <= 0: # if available count is used up, unj -= 1 # go to next unwinding record continue uf.count -= 1 # decrement available count uf.txts -= uf.kind # back up one char for scanning text input mp = uf.pats # unwind pattern pointer offs = uf.txts # unwind text input mbi = uf.bnds # restore binding mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard nsps = uf.nsps # break else: # print ( "no unwinding" ) break # quit if unwinding is exhausted # print ( 'cnt=' , uf.count , 'off=' , offs ) ## ## clean up on match mode or on no match possible ## # print ( "matched=",matched ) if not matched: return None # no bindings # print ( text,offs ) ## consolidate contiguous bindings for subsequent substitutions # print ( "BEFORE consolidating consecutive bindings" ) # print ( "bd:",len(mbd) ) # print ( mbd[0] ) # print ( '----' ) # for b in mbd[1:]: # print ( b ) mbdo = mbd lb = -1 # binding reference lbd = [ 0 , -1 ] # sentinel value, not real binding mbd = [ lbd ] # initialize with new offset after matching mbdo.pop(0) # ignore empty binding while len(mbdo) > 0: # bd = mbdo.pop(0) # get next binding if len(bd) > 2: bd.pop() mbd.append(bd) lbd = bd lb = -1 elif bd[0] < 0: # check for optional match indicator here lb = -1 # if so, drop from new consolidated bindings elif lb == bd[0]: # check for binding continuous with previous lb = bd[1] # lbd[1] = lb # if so, combine with previous binding else: # mbd.append(bd) # otherwise, add new binding lbd = bd # lb = bd[1] # mbd[0] = offs # replace start of bindings with text length matched # print ( "AFTER" ) # print ( "bd:",len(mbd) ) # print ( mbd[0] ) # print ( '----' ) # for b in 
mbd[1:]: # print ( b ) return mbd # consolidated bindings plus new offset
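The binding consolidation at the end of match() is interval merging; here is a sketch of just that step, assuming bindings are [start, end] pairs as in mbd (consolidate is a hypothetical name).

# merge contiguous [start, end] bindings, as done after a successful match

def consolidate(bindings):
    merged = []
    for b in bindings:
        if merged and merged[-1][1] == b[0]:   # continuous with previous?
            merged[-1][1] = b[1]               # extend previous binding
        else:
            merged.append(list(b))
    return merged

print(consolidate([[2, 4], [4, 7], [9, 10]]))  # [[2, 7], [9, 10]]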
def getFeatureSet ( self , fs , ty=False ): """ get feature indices associated with given names in given set arguments: self - fs - feature set without enclosing brackets ty - False=syntactic, True=semantic returns: list of EllyBits [ positive , negative ] on success, None on failure """ if len(fs) < 1: return None # print ( 'fs=' , fs ) bp = ellyBits.EllyBits(FMAX) # all feature bits zeroed bn = ellyBits.EllyBits(FMAX) # fsx = self.smindx if ty else self.sxindx # print ( '-------- fs=' , fs ) fid = fs[0] # feature set ID fnm = fs[1:].split(',') # feature names if not fid in fsx: # known ID? # print ( 'new feature set' ) d = { } # new dictionary of feature names if ty: d['*c'] = 0 # always define '*c' as semantic feature d['*capital'] = 0 # equivalent to '*c' else: d['*r'] = 0 # always define '*r' as syntactic feature d['*right'] = 0 # equivalent to '*r' d['*l'] = 1 # always define '*l' d['*left'] = 1 # equivalent to '*l' d['*x'] = LAST # always define '*x' d['*u'] = LAST # always define '*u' d['*unique'] = LAST # equivalent to '*u' and '*x' fsx[fid] = d # make new feature set known h = fsx[fid] # for hashing of feature names if len(fnm) == 0: # check for empty features return [ bp , bn ] for nm in fnm: nm = nm.strip() if len(nm) == 0: continue if nm[0] == '-': # negative feature? b = bn # if so, look at negative bits nm = nm[1:] elif nm[0] == '+': # positive feature? b = bp # if so, look at positive bits nm = nm[1:] else: b = bp # positive bits by default # print ( '-------- nm=' , nm ) nmc = nm if nm[0] != '*' else nm[1:] for c in nmc: # check feature name chars if not ellyChar.isLetterOrDigit(c): print ( 'bad feature name=' , nm , file=sys.stderr ) return None if not nm in h: # new name in feature set? if nm[0] == '*': # user cannot define reserved name print ( 'unknown reserved feature=' , nm , file=sys.stderr ) return None # print ( 'define new feature' ) k = len(h) # yes, this will be next free index l = FMAX # upper limit on feature index if ty: # semantic feature? k -= 1 # if so, adjust for extra name *C else: k -= 5 # else, adjust for *UNIQUE and extra names *L, *R , *U , *X l -= 1 # adjust upper limit for *UNIQUE if k == l: # overflow check print ( '** ERROR: too many feature names, fid=',fid,'nm=',nm , file=sys.stderr ) print ( '**' , end=' ' , file=sys.stderr ) print ( h.keys() , file=sys.stderr ) return None if k < 0: print ( 'bad index=' , k , 'l=' , l , file=sys.stderr ) return None h[nm] = k # define new feature # print ( 'k=' , k ) # print ( 'set bit' , h[nm] , 'for' , fid + nm ) b.set(h[nm]) # set bit for feature return [ bp , bn ]
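The positive/negative bit pairs returned above can be illustrated with plain Python ints standing in for EllyBits; encode and names below are hypothetical stand-ins.

# sketch: feature names mapped to bit positions, +/- tracked separately

names = {}                       # feature name -> bit index

def encode(feats):
    pos = neg = 0
    for nm in feats.split(','):
        nm = nm.strip()
        if not nm:
            continue
        sign, nm = (nm[0], nm[1:]) if nm[0] in '+-' else ('+', nm)
        bit = names.setdefault(nm, len(names))
        if sign == '+':
            pos |= 1 << bit
        else:
            neg |= 1 << bit
    return pos, neg

p, n = encode('animate,-plural')
print(bin(p), bin(n))            # 0b1 0b10 : bit 0 positive, bit 1 negative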
def match(self, segm, tree): """ compare text segment against all FSA patterns from state 0 arguments: self - segm - segment to match against tree - parse tree in which to put leaf nodes for final matches returns: text length matched by FSA """ # print 'comparing' , segm if len(self.indx) == 0: return 0 # no matches if FSA is empty if len(segm) == 0: return 0 # string is empty lim = bound(segm) # get text limit for matching mtl = 0 # accumulated match length mtls = 0 # saved final match length state = 0 # set to mandatory initial state for FSA stk = [] # for tracking possible multiple matches ls = self.indx[state] # for state 0! ix = 0 # index into current possible transitions sg = segm[:lim] # text subsegment for matching # print 'initial sg=' , sg # print len(ls) , 'transitions from state 0' capd = False if len(sg) == 0 else ellyChar.isUpperCaseLetter(sg[0]) while True: # run FSA to find all possible matches # print 'state=' , state # print 'count=' , mtl , 'matched so far' # print 'links=' , len(ls) , 'ix=' , ix nls = len(ls) # how many links from current state if ix == nls: # if none, then must back up if len(stk) == 0: break r = stk.pop() # restore match status # print 'pop r= [' , r[0] , r[1][0].shortcode() , ']' state = r[0] # FSA state ls = r[1] # remaining links to check sg = r[2] # input string mtl = r[3] # total match length ix = 0 # print 'pop sg=' , sg continue # print 'substring to match, sg=' , sg , 'nls=' , nls m = 0 while ix < nls: lk = ls[ix] # get next link at current state ix += 1 # and increment link index # print '@' , state , 'lk= [' , unicode(lk), ']' , 'ix=' , ix # print 'patn=' , lk.patn po = lk.patn[0] if po == u'\x00': # do state change without matching? m = 0 # no match length elif po != ellyWildcard.cEND: # print 'po=' , po bds = ellyWildcard.match(lk.patn, sg) # print 'bds=' , bds if bds == None: continue m = bds[0] # get match length, ignore wildcard bindings elif (len(sg) > 0 and (ellyChar.isLetterOrDigit(sg[0]) or sg[0] == ellyChar.PRME)): # print 'unmatched solitary $' continue else: # print 'matched solitary $, state=' , state m = 0 # print 'm=' , m if lk.nxts < 0: # final state? if lk.nxts == -2: m = 0 # last part of match not counted # print 'state=' , state , unicode(lk) # print 'flags=' , lk.synf , '/' , lk.semf if tree.addLiteralPhraseWithSemantics( lk.catg, lk.synf, lk.semf, lk.bias, cap=capd): # make phrase ml = mtl + m if mtls < ml: mtls = ml # print 'success!' , 'mtls=' , mtls tree.lastph.lens = mtls # save its length # print 'match state=' , state , 'length=' , mtls # print 'ix=' , ix , 'nls=' , nls if ix < nls: # any links not yet checked? r = [state, ls[ix:], sg, mtl] # print 'saved r= ' , state , # print [ x.shortcode() for x in ls[ix:] ] stk.append(r) # if not, save info for later continuation mtl += m # update match length break # leave loop at this state, go to next state else: # print 'no matches' continue # all patterns exhausted for state ix = 0 sg = sg[m:] # move up in text input state = lk.nxts # next state if state < 0: ls = [] else: ls = self.indx[state] # print 'sg=' , sg # print 'state=' , state # print 'len(ls)=' , len(ls) # print 'mtls=' , mtls return mtls
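The stack of saved match states above is plain backtracking; a toy version with literal-prefix links follows (the fsa dict is a hypothetical stand-in for self.indx, and negative next states mark final links as in the original).

# toy backtracking FSA: states map to (prefix, next_state) links,
# next_state < 0 marks a final state; longest accepted length is returned

fsa = {0: [('ab', 1), ('a', 2)], 1: [('c', -1)], 2: [('bd', -1)]}

def run(fsa, text):
    best = 0
    stack = [(0, text, 0)]              # (state, remaining text, chars matched)
    while stack:
        state, sg, done = stack.pop()
        for prefix, nxt in fsa.get(state, []):
            if not sg.startswith(prefix):
                continue
            m = done + len(prefix)
            if nxt < 0:
                best = max(best, m)     # final state: record match length
            else:
                stack.append((nxt, sg[len(prefix):], m))
    return best

print(run(fsa, 'abdx'))   # 3: 'a' then 'bd' wins over the dead-end 'ab'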
def rewrite ( self , ts ): """ check for date at current text position and rewrite if found arguments: self - ts - text stream as list of chars returns: True on any rewriting, False otherwise """ lts = len(ts) if lts < Lm: return False tz = self._tz # default self._xm = '' # default self._m = u'00' # defaults self._s = u'00' c = ts[0] # first char if not ellyChar.isDigit(c): return False # time can never start with a letter # because of number transforms k = self._matchN(ts) # print 'match numeric=' , k if k == 0: return False # print 'ts[k:]=' , ts[k:] k += self._findAMorPM(ts[k:]) # print 'AM or PM k=' , k # print 'hour=' , self._hr if self._xm == 'p' and self._hr < 12: # convert to 24-hour time self._hr += 12 elif self._xm == 'a' and self._hr == 12: # self._hr = 0 # print 'hour=' , self._hr t = ts[k:] # remainder of text # print 'rest t=' , t dk = 0 # skip count ns = 0 # space count if len(t) > 0: # look for time zone if t[0] == ' ': # skip any initial space dk += 1 ns = 1 # print 't[dk:]=' , t[dk:] , 'dk=' , dk dk += self.get(t[dk:]) # extract next token from input ss = self.string # # print 'zone=' , ss if ss in Zn: # match to known time zone? tz = ss elif ns == 0 and ss == u'z': # military ZULU time tz = u'gmt' # translate else: dk = 0 # no match k += dk # update match count t = t[dk:] # advance scan # print 't=' , t if len(t) > 0 and ellyChar.isLetterOrDigit(t[0]): return False for _ in range(k): # strip matched substring to be rewritten ts.pop(0) r = str(self._hr).zfill(2) + u':' + self._m + u':' + self._s + tz rr = r[::-1] for c in rr: # do rewriting ts.insert(0,c) self._rwl = len(r) return True
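The in-place rewriting at the end, popping the matched prefix and reinserting the normalized string reversed so it reads left to right, works like this sketch (rewrite_front is a hypothetical helper).

# sketch: replace the first k chars of a char-list stream in place

def rewrite_front(ts, k, replacement):
    for _ in range(k):            # strip matched substring
        ts.pop(0)
    for c in reversed(replacement):
        ts.insert(0, c)           # insert reversed so first char ends up first

ts = list('10:30pm sharp')
rewrite_front(ts, 7, '22:30:00')
print(''.join(ts))                # 22:30:00 sharp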
def match(self, txt, pnc, ctx): """ compare a punctuation mark and its context with a pattern arguments: self - txt - list of text chars leading up to punctuation char pnc - punctuation char ctx - next chars after punctuation returns: True on match, False otherwise """ # print ( 'matching for txt=' , txt , 'pnc= [' , pnc , ' ] ctx=' , ctx ) if matchtoo(txt, pnc, ctx): # exception by complex match? return True # print ( 'matchtoo() returned False' ) sep = ctx[0] if len(ctx) > 0 else '' if sep == ellyChar.THS: return True nxt = ctx[1] if len(ctx) > 1 else '' # print ( 'lstg=' , self.lstg.keys() ) if not pnc in self.lstg: # get stored patterns for punctuation return False lp = self.lstg[pnc] # print ( len(lp) , 'patterns' ) ltx = len(txt) # current length of accumulated text so far ntr = 1 while ntr <= ltx: if not ellyChar.isLetterOrDigit(txt[-ntr]): break ntr += 1 nrg = ntr ntr -= 1 # available trailing chars for wildcard * match while nrg <= ltx: c = txt[-nrg] if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c): # print ( 'break at nrg=' , nrg , txt[-nrg] ) break nrg += 1 nrg -= 1 # end of range for all pattern matching # print ( 'ntr=' , ntr , 'nrg=' , nrg ) txt = txt[-nrg:] # reset text to limit for matching ltx = len(txt) # its new length # print ( 'txt= ' + str(txt) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']' ) for p in lp: # try matching each listed exception pattern if p.left != None and len(p.left) > 0: pat = p.left star = pat[-1] == ellyWildcard.cALL n = len(pat) # each pattern element matches one sequence char if star: # except for a final wildcard * # print ( 'pattern ending with *' ) n -= 1 # print ( 'ltx=' , ltx , 'n=' , n ) if ltx < n: continue # cannot match pattern properly pat = pat[:-1] t = txt[:n] else: if ltx < n: continue # cannot match pattern properly t = txt[-n:] if not ellyWildcard.match(pat, t, 0): # print ( 'no possible pattern match' ) continue k = ltx - n # extra chars beyond any match # print ( 'k=' , k , 't=' , t ) # print ( 'txt=' , txt ) # print ( 'pat=' , '[' + ellyWildcard.deconvert(pat) + ']' ) # print ( 'matches' , n , 'chars' ) if not star and k > 0: # print ( 'check text before [' , txt[-n] , ']' ) if ellyChar.isLetterOrDigit(txt[-n]): c = txt[-n - 1] # print ( 'preceding= [', c , ']' ) if ellyChar.isLetterOrDigit(c) or c == '&': continue # because break in text is required # print ( 'pat=' , ellyWildcard.deconvert(p.left) ) # print ( 'n=' , n , 'ltx=' , ltx ) # print ( 'txt=' , txt ) # nc = '\\n' if nxt == '\n' else nxt # print ( 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']' ) # print ( 'versus c=' , nc ) rp = p.right if rp == [] or rp[0] == ellyWildcard.cALL: return True pcx = rp[0] if pcx == nxt: # check for specific char after possible stop # print ( 'right=' , nxt ) return True elif pcx == ellyWildcard.cALF: # check for alphabetic if ellyChar.isLetter(nxt): # print ( 'right is alphabetic=' , nxt ) return True elif pcx == ellyWildcard.cDIG: # check for numeric if ellyChar.isDigit(nxt): # print ( 'right is numeric=' , nxt ) return True elif pcx == ellyWildcard.cUPR: # check for upper case if ellyChar.isUpperCaseLetter(nxt): return True elif pcx == ellyWildcard.cLWR: # check for lower case if ellyChar.isLowerCaseLetter(nxt): return True elif pcx == ellyWildcard.cCAN: # check for non-alphanumeric if not ellyChar.isLetterOrDigit(nxt): # print ( 'right is non-alphanumeric=' , nxt ) return True # print ( "no matches" ) return False
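The two backward scans above size the trailing windows used for matching; isolated as a sketch, with '.' standing in for the embedded-combining test (trailing_windows is a hypothetical name).

# sketch: measure trailing windows from the end of a char list

def trailing_windows(txt):
    ntr = 0                            # trailing alphanumeric run
    while ntr < len(txt) and txt[-(ntr + 1)].isalnum():
        ntr += 1
    nrg = ntr                          # extend through chars usable in patterns
    while nrg < len(txt) and (txt[-(nrg + 1)].isalnum() or txt[-(nrg + 1)] == '.'):
        nrg += 1
    return ntr, nrg

print(trailing_windows(list('se U.S')))   # (1, 3): 'S' is alnum; 'U.S' is matchable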
def getNext ( self ): """ extract next sentence for Elly translation from input stream arguments: self returns: list of chars for next sentence on success, None on empty stream """ # print 'getNext' self.resetBracketing() sent = [ ] # list buffer to fill x = self.inp.read() if x == SP: x = self.inp.read() if x == END: # EOF check return None c = END # reset lc = END # print 'x=' , '<' + x + '>' , ord(x) self.inp.unread(x,SP) # put first char back to restore input # print '0 <<' , self.inp.buf # fill sentence buffer up to next stop punctuation in input nAN = 0 # alphanumeric count in sentence while True: x = self.inp.read() # next input char if x == END: # handle any EOF break # print 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>' # print 'sent=' , sent # check for table delimiters in text if len(sent) == 0: # print 'table' # print '1 <<' , self.inp.buf if x == u'.' or x == u'-': # look for multiple '.' or '-' while True: # scan up to end of current buffering y = self.inp.read() # if y != x and y != SP: # no more delimiter chars or spaces? self.inp.unread(y) # if so, done break # continue # ignore everything seen so far ######################################### # accumulate chars and count alphanumeric ######################################### lc = c c = x nc = self.inp.peek() if ellyChar.isWhiteSpace(nc): nc = SP # print 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>' if lc == SP or lc == END: # normalize chars for proper bracketing if x == SQuo: # x = LSQm # a SQuo preceded by a space becomes LSQm elif x == DQuo: # x = LDQm # a DQuo preceded by a space becomes LDQm if nc == SP or nc == END: # if x == SQuo: # a SQuo followed by a space becomes RSQm x = RSQm # elif x == DQuo: # a DQuo followed by a space becomes RDQm x = RDQm # elif not ellyChar.isLetterOrDigit(nc): if x == SQuo: # a SQuo followed by nonalphanumeric becomes RSQm x = RSQm # elif x == DQuo: # a DQuo followed by nonalphanumeric becomes RDQm x = RDQm # inBrkt = self.checkBracketing(x) # do bracket checking with modified chars # print 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt sent.append(c) # but buffer original chars if ellyChar.isLetterOrDigit(c): nAN += 1 continue # if alphanumeric, just add to sentence if c == SP: continue # if space, just add to sentence # NL will break a sentence if c == NL: sent.pop() # remove from sentence chars break # char was not alphanumeric or space # look for stop punctuation exception z = self.inp.peek() # for context of match call # print '0 <<' , self.inp.buf # print 'sent=' , sent[:-1] # print 'punc=' , '<' + c + '>' # print 'next=' , '<' + z + '>' if c in Stops and self.stpx.match(sent[:-1],c,z): # print 'exception MATCH' if self.drop: sent.pop() # remove punctuation char from sentence lc = SP continue # print 'no stop exception MATCH for' , c # print '@1 <<' , self.inp.buf # handle any nonstandard punctuation exoticPunctuation.normalize(c,self.inp) # print '@2 <<' , self.inp.buf # check for dash if c == u'-': d = self.inp.read() if d == u'-': # print 'dash' while True: d = self.inp.read() if d != u'-': break sent.append(c) self.inp.unread(d) continue # check for sentence break on punctuation # print '@3 c=' , c if c in QUOs or c in RBs: # special check for single or double quotes or # bracketing, which can immediately follow stop # punctuation for current sentence # print 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent) if not inBrkt: # print sent , 'so far' z = self.inp.read() if self.shortBracketing(sent,z): break self.inp.unread(z) # print 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']' if z == END or ellyChar.isWhiteSpace(z) and lc in Stops: if nAN > 1: break continue elif not c in Stops or inBrkt: continue else: # print 'check stopping!' d = self.inp.read() # print '@3 <<' , self.inp.buf if d == None: d = u'!' # print 'stop=' , '<' + c + '> <' + d + '>' # print 'ellipsis check' if c == u'.' and c == d: if self.inp.peek() != c: # look for third '.' in ellipsis self.inp.unread(c) # if none, keep only first '.' else: self.inp.skip() # found ellipsis sent.append(d) # complete it in sentence buffer sent.append(d) # x = self.inp.peek() # look at char after ellipsis if ellyChar.isCombining(x): sent.append(SP) # if part of token, put in space as separator continue # special check for multiple stops # print 'next char d=' , d , ord(d) if d != END else 'NONE' if d in Stops: while True: d = self.inp.read() if not d in Stops: break self.inp.unread(d) if not ellyChar.isWhiteSpace(d): d = SP # make rightside context for stop # special check for blank or null after stops elif d != END and not ellyChar.isWhiteSpace(d): if self.shortBracketing(sent,d): break self.inp.unread(d) # print 'no space after punc' continue # if no match for lookahead, put back elif d != END: # print 'unread d=' , d self.inp.unread(d) # final check: is sentence long enough? # print '@4 <<' , self.inp.buf cx = self.inp.peek() if cx == None: cx = u'!!' # print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent if nAN > 1 and not inBrkt: break if len(sent) > 0 or self.last != END: return sent else: return None
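The quote normalization in getNext turns straight quotes directional from their neighbors; a minimal standalone version follows (directional, LDQ, RDQ are hypothetical names for illustration).

# sketch: straight quotes made directional from adjacent context

LDQ, RDQ = '\u201c', '\u201d'

def directional(prev, c, nxt):
    if c != '"':
        return c
    if prev in (' ', ''):
        return LDQ                 # opening after a space or at start
    if nxt in (' ', '') or not nxt.isalnum():
        return RDQ                 # closing before space, end, or punctuation
    return c

s = 'he said "yes" loudly'
out = ''.join(directional(s[i - 1] if i else '', ch, s[i + 1] if i + 1 < len(s) else '')
              for i, ch in enumerate(s))
print(out)                         # he said “yes” loudly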
def _store(self, defs, nowarn): """ put macro substitutions into table with indexing by first char of pattern arguments: self - defs - list of macro definition as strings nowarn - whether to turn warnings off exceptions: TableFailure on error """ # print ( defs.linecount() , 'lines' ) while True: l = defs.readline() # next macro rule # print ( "rule input=" , l ) if len(l) == 0: break # EOF check dl = definitionLine.DefinitionLine(l, False) left = dl.left # pattern to be matched tail = dl.tail # transformation to apply to match # print ( 'dl.left=' , left ) if left == None or tail == None: self._err(l=l) # report missing part of rule continue if left.find(' ') >= 0: # pattern side of macro rule ms = 'pattern in macro contains spaces' self._err(s=ms, l=l, d=1) # cannot contain any space chars continue lefts = list(left) # print ( 'left=' , lefts ) nspm = ellyWildcard.numSpaces(lefts) pat = ellyWildcard.convert( left) # get pattern with encoded wildcards if pat == None: self._err('bad wildcards', l) continue # print ( 'pat=' , ellyWildcard.deconvert(pat) , 'len=' , len(pat) ) # print ( 'pat=' , list(pat) ) pe = pat[-1] if not pe in [ ellyWildcard.cALL, ellyWildcard.cEND, ellyWildcard.cSPC ]: pat += ellyWildcard.cEND # pattern must end in $ if it does not end in * or _ if not _checkBindings(pat, tail): self._err('bad bindings in substitution', l) continue if not nowarn and not _checkExpansion(pat, tail): self._err('substitution may be longer than original string', l, 0) # print ( "rule =" , [ left , nspm , tail ] ) if pat == None: self._err('no pattern', l) continue r = Rule(pat, nspm, tail) c = pat[0] # first char of pattern # check type to see how to index rule # print ( 'c=' , ellyWildcard.deconvert(c) , ', pat=' , ellyWildcard.deconvert(pat) ) p = pat while c == ellyWildcard.cSOS: # optional sequence? 
if not ellyWildcard.cEOS in p: break k = p.index(ellyWildcard.cEOS) # if so, find the end of sequence if k < 0 or k == 1: break # if no end or empty sequence, stop k += 1 if k == len(pat): break # should be something after sequence m = ellyChar.toIndex(pat[1]) # index by first char of optional sequence self.index[m].append(r) # (must be non-wildcard) p = p[k:] # move up in pattern c = p[0] # but check for another optional sequence if c == ellyWildcard.cSOS: self._err(l=l) continue # bad sequence, skip this rule # print ( 'c=' , ord(c) ) if ellyChar.isLetterOrDigit(c): # check effective first char of pattern m = ellyChar.toIndex(c) self.index[m].append(r) # add to index under alphanumeric char elif ellyChar.isText(c): self.index[0].append(r) # add to index under punctuation elif not c in ellyWildcard.Matching: if c == ellyWildcard.cEND: print('** macro warning: pattern can have empty match', file=sys.stderr) print('* at [', l, ']', file=sys.stderr) else: dc = '=' + str(ord(c) - ellyWildcard.X) self._err('bad wildcard code', dc) continue elif c == ellyWildcard.cANY or c == ellyWildcard.cALL: self.anyWx.append(r) # under general wildcards elif c == ellyWildcard.cCAN: self.index[0].append(r) # under punctuation elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG: self.digWx.append(r) # under digit wildcards elif c == ellyWildcard.cSAN: self.digWx.append(r) # under both digit and self.letWx.append(r) # letter wildcards elif c == ellyWildcard.cAPO: # right single quote or apostrophe self.apoWx.append(r) # elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND: self._err('bad wildcard in context', l) continue # wildcards unacceptable here else: self.letWx.append(r) # everything else under letter wildcard self.count += 1 # count up macro substitution # print ( 'count=' , self.count ) if self._errcount > 0: print(self._errcount, 'macro errors in all', file=sys.stderr) print('macro table definition FAILed', file=sys.stderr) raise ellyException.TableFailure
[ '-' , "'" , ' ' ] , [ ':' , '-' , ')' ] , # emoticon [ 'x' , 'x' , 'x' , 'x' , '.' , ' ' , 'Y' ] , [ ' ' , '.' , ' ' ] , [ ' ' , 'm', '.' , ' ' , 'm' , 'o' , 'r' , 'r' , 'e' , 'l' , SQW , 's' , ' ' , 's' , 'a' , 'l' ] ] nlu = len(sys.argv) - 2 if nlu > 0: # add to test cases? for a in sys.argv[2:]: test.append(list(a.decode('utf8'))) print 'added' , nlu , 'test case' + ('' if nlu == 1 else 's') else: print 'no added test cases' print '--------' print len(test) , 'cases in all' for ts in test: ku = 0 lu = len(ts) for cu in ts: # scan input line ku += 1 if cu in stpx.lstg: # find first candidate stop if ku == lu or not ellyChar.isLetterOrDigit(ts[ku]): break # must not be followed by letter or digit else: continue res = stpx.match( ts[:ku-1] , ts[ku-1] , ts[ku] if ku < lu else '' ) # guard context at end of input print '[ ' + ''.join(ts) + ' ]' , print 'stop EXCEPTION' if res else 'sentence stopped'
def _getRaw ( self ): """ obtain next raw token from buffer arguments: self returns: EllyToken on success, None otherwise """ self.skipSpaces() # print "|",len(self.buffer) ln = len(self.buffer) # print "|",len(self.buffer) if ln == 0: return None # print "proceed" ## get length of next token and if it has ## initial - or +, check for word fragment k = 0 # number of chars for next token if self.match(MIN): # check for hyphen if self.match(DSH): # it is a dash when doubled k = 2 else: k = self.find(separators,1) elif self.match(PLS): # check for elly prefix k = self.find(separators,1) elif self.match(DOT): # check for period if self.match(ELP): # it is ellipsis when tripled k = 3 else: k = 1 elif not ellyChar.isCombining(self.buffer[0]): k = 1 # if next char cannot start a token, take it as a token else: k = self.find(separators) if k < 0: # break a token at next separator k = ln while k < ln: # look at separator if it exists x = self.buffer[k] if x != MIN and x != COM: break # a hyphen or comma is not an absolute break if k + 1 >= ln or not ellyChar.isDigit(self.buffer[k+1]): break # accept hyphen or comma if NOT followed by digit else: # otherwise, look for another separator k = self.find(separators,k+2) if k < 0: k = ln ## if token not delimited, take rest of buffer as ## will fit into token working area if k < 0: k = ln # print "take",k,"chars from",len(self.buffer),self.buffer buf = self.extract(k) # get k characters ## special check for - next in buffer after extraction if self.match(MIN): # hyphen immediately following? self.skip() # if so, take it if self.atSpace(): # when followed by space buf.append(MIN) # append hyphen to candidate token k += 1 else: if not self.match(MIN): # when not followed by another hyphen self.prepend(ellyChar.SPC) # put back a space else: self.skip() # double hyphen = dash self.prepend(ellyChar.SPC) # put back space after dash self.prepend(MIN) # put back second hyphen self.prepend(MIN) # put back first self.prepend(ellyChar.SPC) # put extra space before hyphen or dash ## fill preallocated token for current position from working area # print "raw text for token:" , '[' + u''.join(buf).encode('utf8') + ']' to = ellyToken.EllyToken(u''.join(buf)) ## strip off trailing non-token chars from token and put back in buffer km = k - 1 while km > 0: x = buf[km] if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS: break if x == APO and km > 0 and buf[km - 1] == 's': break self.prepend(x) km -= 1 km += 1 if km < k: to.shortenBy(k - km,both=True) return to
def _aDay(self, ts): """ parse a day number arguments: self - ts - text stream as list of chars returns: total number of chars matched """ # print 'aDay', ts if len(ts) == 0: return 0 k = 0 # running match count x = ts[0] y = '' if not ellyChar.isDigit(x): if not self.rewriteNumber(ts): return 0 else: x = ts[0] # print 'rewritten ts=' , ts ls = len(ts) if ls == 1: if x == '0': return 0 # cannot have 0 as day self._dy.append(x) # accept at end of input as possible date return 1 elif not ellyChar.isDigit(ts[1]): k = 1 elif x > '3': # reject first digit bigger than '3' return 0 else: y = x # save first digit x = ts[1] # this will be second digit if y == '3' and x > '1': # reject day > 31 return 0 k = 2 ls -= k if k == 2: self._dy.append(y) self._dy.append(x) if ls == 0: return k z = ts[k] if ellyChar.isDigit(z): return 0 # reject 3-digit day if z == '.' and ls > 1 and ellyChar.isDigit(ts[k + 1]): return 0 # reject digit after decimal point if ls >= 2: # at least 2 chars to check after day number if z == u'-': # print 'hypen ls=' , ls , 'k=' , k if ellyChar.isDigit(ts[k + 1]): # hyphen, digit match # print 'digit=' , ts[k+1] self._dy.append(z) self._dy.append(ts[k + 1]) if ls == 2: # only 2 chars to check? k += 2 # add hyphen, digit to day elif ls == 3: # only 3 chars to check? # print 'ts[k]=' , ts[k:] if not ellyChar.isLetterOrDigit(ts[k + 2]): # k += 2 # add hyphen, digit to day elif ellyChar.isDigit( ts[k + 2]): # found second digit to add? self._dy.append(ts[k + 2]) # if so, add to day string k += 3 elif not ellyChar.isLetterOrDigit( ts[k + 2]): # more than 3 chars to check? k += 2 # if not, we are done elif ellyChar.isDigit(ts[k + 2]): # check for second digit # print 'k=' , k if ls > 3 and ellyChar.isDigit(ts[k + 3]): return 0 if ts[k + 1] > '3': # check for valid day return 0 if ts[k + 1] == '3' and ts[k + 2] > '1': return 0 self._dy.append(ts[k + 2]) k += 3 else: return 0 # no other hyphen allowed in day else: return 0 # t = ts[k:] # print 'k=' , k , 't=' , t if len(t) == 0 or not ellyChar.isLetterOrDigit(t[0]): return k if ellyChar.isDigit(t[0]) or len(t) < 2: return 0 sx = t[0].lower() + t[1].lower() # print 'y=' , y , 'x=' , x , 'sx=' , sx if x == '1': # print 'end of day=' , y if y == '1': if sx != 'th': return 0 elif sx != 'st': return 0 elif x == '2': if sx != 'nd': return 0 elif x == '3': if sx != 'rd': return 0 else: # print 'default ordinal indicator' if sx != 'th': return 0 # print 'ord k=' , k t = t[2:] k += 2 # print 'k=' , k , 'len=' , len(ts) if len(ts) == k: # check next char in stream return k # if none, match succeeds elif ellyChar.isLetterOrDigit(ts[k]): # print 'ts[k]=' , ts[k] , k return 0 # otherwise, match fails if next char is alphanumeric else: # print 'return k=' , k return k # otherwise succeed
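The ordinal-suffix checks at the end of _aDay amount to the usual st/nd/rd/th rule with the 11th-13th exception; as a standalone sketch (ordinal_suffix is a hypothetical helper, not the class method).

# sketch: expected ordinal suffix for a day number, with the teens exception

def ordinal_suffix(day):
    if 11 <= day % 100 <= 13:
        return 'th'                       # 11th, 12th, 13th
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')

for d in (1, 2, 3, 11, 21, 23, 30):
    print(d, ordinal_suffix(d))           # 1 st, 2 nd, 3 rd, 11 th, 21 st, 23 rd, 30 th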
def _matchN(self, ts): """ apply logic for numeric only date recognition arguments: self - ts - text stream as list of chars returns: total number of chars matched """ # print 'NUMERIC' lts = len(ts) if lts < Lm: return 0 # shortest date is 0/0 if not ellyChar.isDigit(ts[0]): return 0 n = Ln if n > lts: n = lts ss = [] # substring to compare ns = 0 # slash count # print 'lts=' , lts , 'n=' , n k = 0 while k < n: c = ts[k] if c == '/': ns += 1 elif c == '-': ns += 1 c = '/' elif not ellyChar.isDigit(c): break ss.append(c) k += 1 # print 'k=', k , 'Lm=' , Lm , 'ns=' , ns if k < Lm: return 0 if ns != 1 and ns != 2: return 0 # print 'ss=' , ss if k < lts and ellyChar.isLetterOrDigit(ts[k]): return 0 dt = ''.join(ss).split('/') dt0 = dt.pop(0) # get first two date components dt1 = dt.pop(0) # # print 'split=' , dt0 , dt1 if len(dt0) == 4 or dt0[0] == '0': if ns == 1: return 0 # dt.append(dt0) # put first component at end if it looks like year dt0 = dt1 # move month up dt1 = dt.pop(0) # move day up (pop(0) so that the year stays at the end) m = int(dt0) if m < 1 or m > 12: return 0 # check validity of month if dt1 == '': return 0 try: d = int(dt1) except ValueError: return 0 if d < 1 or d > 31: return 0 # check validity of day if ns == 2: y = dt.pop(0) # if there is a year, process it also ly = len(y) if ly == 4: # 4-digit year? s = y[0] if s != '1' and s != '2': return 0 yls = list(y) elif ly == 2: ix = 0 if y > self.ycur else 1 yls = list(self.cent[ix] + y) else: return 0 # fail on any other number of year digits self._yr = yls # handle year self._mo = list(dt0.zfill(2)) # handle month self._dy = list(dt1.zfill(2)) # handle day return k
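The component shuffling in _matchN, rotating a leading year to the end and resolving two-digit years against the current year, looks like this in isolation (normalize and the ycur and cent defaults are assumptions standing in for self.ycur and self.cent; the validity checks of the original are omitted).

# sketch: reorder numeric date components as in _matchN (validation omitted)

def normalize(parts, ycur='25', cent=('19', '20')):
    if len(parts[0]) == 4 or parts[0][0] == '0':   # year-first input?
        parts = parts[1:] + parts[:1]              # rotate year to the end
    mo, dy = parts[0].zfill(2), parts[1].zfill(2)
    yr = None
    if len(parts) == 3:
        y = parts[2]
        yr = y if len(y) == 4 else (cent[0] if y > ycur else cent[1]) + y
    return mo, dy, yr

print(normalize(['2002', '7', '4']))   # ('07', '04', '2002')
print(normalize(['7', '4', '98']))     # ('07', '04', '1998')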
def _extractToken(self, mnl): """ extract next token from input buffer and look up in grammar table arguments: self - mnl - minimum length for any previous match returns: ellyToken on success, otherwise None exceptions: ParseOverflow """ d = self.rul # grammar rule definitions tree = self.ptr # parse tree buff = self.sbu # input source # print ( 'start extraction' ) try: w = buff.getNext() # extract next token # print ( 'got token=' , w ) ws = ''.join(w.root) except ellyException.StemmingError as e: # print ( 'FATAL error' , e , file=sys.stderr ) sys.exit(1) # print ( 'extracted' , '['+ ws + ']' ) wcapzn = w.isCapitalized() wsplit = w.isSplit() wl = len(ws) if wl > mnl: found = self._simpleTableLookUp(ws, tree, wsplit, wcapzn) > 0 # print ( 'found in external table=' , found ) if wl >= mnl: if ws in self.rul.gtb.dctn: # look up internally # print ( '"' + ws + '" in dictionary' ) if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn): found = True # print ( 'found in internal dictionary=' , found ) if found: # if any success, we are done return w if mnl > 0: return None # defer to previous lookup # print ( 'affix logic:' ) # print ( d.man.pref ) # print ( d.man.suff ) dvdd = False if d.man.analyze(w): # any analysis possible? root = ''.join(w.root) # if so, get parts of analysis tan = w.pres + [root] + w.sufs if len(w.sufs) > 0: sx = w.sufs[-1] dvdd = not ellyChar.isApostrophe(sx[1]) # print ( 'token analysis=' , tan ) while len(tan) > 0: # and put back into input x = tan.pop() buff.prepend(x) buff.prepend(' ') w = buff.getNext() # get token again with stemming and macros # print ( 'analyzed w=' , w ) ws = ''.join(w.root) if ws[-1] == '+': # print ( 'len(queue)=' , len(tree.queue) ) m = d.ptb.match(w.root, tree) # print ( 'root=' , w.root ) # print ( 'match=' , m ) # print ( 'len(queue)=' , len(tree.queue) ) # print ( 'char span=' , tree.lastph.lens ) if m > 0: tree.lastph.bias = 2 found = True # print ( 'after found=' , found ) if len(ws) < mnl: return None # external lookup? if self._simpleTableLookUp(ws, tree, False, wcapzn): # external lookup found = True if ws in self.rul.gtb.dctn: # internal lookup? if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn): found = True if found: # if any success, we are done # print ( 'token recognized' ) w.dvdd = dvdd return w # print ( 'still unrecognized token w=' , str(w) ) lws = len(ws) if lws > 1: # special handling of + or - if ws[0] == '+' and ws[-1] != '+': # print ( 'root=' , ws ) # marks root with prefixes removed if self._simpleTableLookUp(ws[1:], tree) > 0: return w if ws[0] == '-': w.shortenBy(lws - 1) # -X not recognized as suffix # print ( 'w=' , w ) # try processing - separately cn = buff.peek() if ellyChar.isLetterOrDigit(cn): buff.prepend(' ') buff.prepend(ws[1:]) # put back X for further analysis if self.pnc.match(w.root): # check if next token is punctuation # print ( 'catg=' , self.pnc.catg , self.pnc.synf.hexadecimal() ) if tree.addLiteralPhrase(self.pnc.catg, self.pnc.synf): tree.lastph.lens = w.getLength() tree.lastph.krnl.semf.combine(self.pnc.semf) # print ( 'semf=' , self.pnc.semf ) # print ( 'lastph=' , tree.lastph ) # print ( 'punc w=' , str(w) ) else: # print ( 'must create UNKN leaf node' ) tree.createUnknownPhrase(w) # unknown type as last resort tree.lastph.lens = len(ws) return w
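The lookup order in _extractToken, external table first, then the internal dictionary, then affix analysis, with unknown as the last resort, is a first-success cascade; schematically (all names below are hypothetical):

# sketch: first-success lookup cascade over recognizers

def recognize(token, recognizers):
    for name, rec in recognizers:
        result = rec(token)
        if result is not None:
            return name, result
    return 'UNKN', token          # unknown type as last resort

recognizers = [
    ('external', {'usa': 'country'}.get),
    ('internal', {'dog': 'noun'}.get),
    ('suffix',   lambda w: 'verb+ing' if w.endswith('ing') else None),
]
print(recognize('running', recognizers))   # ('suffix', 'verb+ing')
print(recognize('qwxz', recognizers))      # ('UNKN', 'qwxz')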
list('XXX: Boo'), list('S.A.F. \u201cA'), list('2002. \u201cA'), list('2:45 a.m. Friday') ] nlu = len(sys.argv) - 2 if nlu > 0: # add to test cases? for a in sys.argv[2:]: # get commandline args to test test.append(list(a)) print('added', nlu, 'test case' + ('' if nlu == 1 else 's')) else: print('no added test cases') print('--------') print(len(test), 'cases in all') for ts in test: ku = 0 lu = len(ts) for cu in ts: # scan input line ku += 1 if cu in stpx.lstg: # find first candidate stop if ku == lu or not ellyChar.isLetterOrDigit(ts[ku]): break # must not be followed by letter or digit else: print(ts, 'SKIPPED') continue res = stpx.match(ts[:ku - 1], ts[ku - 1], ts[ku:]) print('[ ' + ''.join(ts) + ' ] @', ku - 1, end=' ') print('stop EXCEPTION' if res else 'sentence will stop')