Пример #1
0
    def get ( self , ts , n=N ):

        """
        get normalized substring in lower case for subsequent comparisons

        Scans up to n chars of ts, collecting letters, digits, and the
        chars in ALSO (lower-cased) into self.string.  A PERIOD is passed
        over unless doubled; a COMMA is passed over only when flanked by
        digits.  Neither is added to the saved substring.

        arguments:
            self -
            ts   - list of chars to get substring from
            n    - limit on count of chars to get

        returns:
            count of chars scanned for substring, 0 on rejection
        """

        sl = [ ]                          # char sublist to be matched
#       print 'ts=' , ts
        lts = len(ts)
        if lts == 0:
            return 0                      # no chars to scan
        lm = lts if lts < n else n        # effective scan limit
#       print 'lm=' , lm
        i = 0
        c = ''                            # current char; empty before first iteration
        while i < lm:                     # scan input text up to char limit
            lc = c                        # save previous char
            c = ts[i]                     # get next char
            if c == PERIOD:               # special treatment of PERIOD
                if lc == PERIOD: break    # stop at two PERIODs in a row
            elif c == COMMA:              # special treatment of COMMA
#               print 'comma'
                if ( not ellyChar.isDigit(lc) or
                     i + 1 == lm or
                     not ellyChar.isDigit(ts[i + 1])
                   ):                     # COMMA allowed only between digits
                    break
            else:
                if not ellyChar.isLetterOrDigit(c):  # stop if not letter
                    if not c in ALSO: break          #   or "'" or "/" or "-"
                sl.append(c.lower())                 # otherwise append to sublist
            i += 1

#       print 'i=' , i , '<' + c + '>'

        if i < lm and ellyChar.isLetterOrDigit(ts[i]):     # proper termination?
            return 0                                       # if not, reject substring

        self.string = u''.join(sl)        # save normalized substring
        return i                          # scan count
Пример #2
0
    def __init__ ( self , syms , fets=None , semantic=False ):

        """
        initialization

        arguments:
            self     -
            syms     - symbol table
            fets     - string representation of feature set
            semantic - flag for semantic features

        exceptions:
            FormatFailure on error
        """

        if syms == None or fets == None:  # special case generating zero feature set
            self.positive = ellyBits.EllyBits(symbolTable.FMAX)
            self.negative = ellyBits.EllyBits(symbolTable.FMAX)
            self.id = ''
            return

        segm = fets.lower()
#       print "features=",segm,"semantic=",semantic
        if segm == None or len(segm) < 3 or segm[0] != '[' or segm[-1] != ']':
            raise ellyException.FormatFailure
        elif segm[1] == ' ' or ellyChar.isLetterOrDigit(segm[1]) or segm[1] == '*':
            raise ellyException.FormatFailure
        else:
            self.id = segm[1]
#           print "id=",self.id
            fs = syms.getFeatureSet(segm[1:-1] , semantic)
            if fs == None:
                raise ellyException.FormatFailure
            self.positive , self.negative = fs
Пример #3
0
def bound ( segm ):

    """
    get maximum limit on string for pattern matching
    (override this method if necessary)

    arguments:
        segm  - text segment to match against

    returns:
        char count
    """

    end = len(segm)               # at most the whole segment
    k = 0
    while k < end and segm[k] != ' ':
        k += 1                    # advance to first space, if any
    k -= 1                        # index of last char before any space
    while k > 0:                  # drop trailing non-alphanumerics,
                                  # keeping '.' and '*' and bracketing
        ch = segm[k]
        if ch == '*' or ch in Trmls or ellyChar.isLetterOrDigit(ch):
            break
        k -= 1
    return k + 1                  # count of chars eligible for matching
Пример #4
0
def bound ( segm ):

    """
    get maximum limit on string for template matching
    (override this method if necessary)

    arguments:
        segm  - text segment to match against

    returns:
        char count
    """

#   print 'segm=' , segm
    lm = len(segm)   # limit can be up to total length of text for matching
    ll = 0
    while ll < lm:   # look for first break in text segment
        c = segm[ll]
        if c in [ ellyChar.ELLP , ellyChar.NDSH , ellyChar.MDSH ]:
            break    # ellipsis or dash always ends template matching
        if c == ',' and ll < lm - 1 and segm[ll+1] == ' ':
            break    # so does a comma followed by a space
        ll += 1
#   print 'll=' , ll , ', lm=' , lm
    ll -= 1          # index of last char before the break
    while ll > 0:    # exclude trailing non-alphanumeric from matching
                     # except for '.' and '*' and bracketing
        c = segm[ll]
        if c in ellyWildcard.Trmls or c == '*' or ellyChar.isLetterOrDigit(c): break
        ll -= 1
    return ll + 1    # count of chars eligible for matching
Пример #5
0
    def findSeparator(self, skip=0):
        """
        scan for one of a list of separator chars in buffer

        arguments:
            self  -
            skip  - how many chars to skip in buffer to start scan

        returns:
            offset in buffer if char found, -1 otherwise
        """

        n = len(self.buffer)
        if skip >= n:  # is skip too long?
            return -1  # if so, fail

        if skip == 0 and self.buffer[0] == APO:
            return 1  # special case: leading apostrophe separates by itself

#       print ( 'skip=' , skip, 'n=' , n )
        for k in range(skip, n):
            ck = self.buffer[k]
            if ck in separators:  # is buffer char a separator?
                self.index = k  # if so, note buffer position
                return k
            if ck == ellyChar.COM:  # comma needs a lookahead check
                #               print ( 'comma k=' , k )
                if k + 1 < n:
                    ck1 = self.buffer[k + 1]
                    if not ellyChar.isLetterOrDigit(ck1):
                        if ck1 in ellyChar.Grk:
                            # comma before a Greek letter counts as separator
                            # NOTE(review): self.index is NOT updated here,
                            # unlike the separator case above - confirm intended
                            return k

        return -1  # fail
Пример #6
0
def acronym ( buffr ):

    """
    recognize parenthesized introduction of acronym in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Lmax: lb = Lmax     # cap scan at maximum acronym length
    if lb < Lmin or buffr[0] != '(': return 0

    nu = 0          # uppercase count
    ib = 1          # scan position, starting just after '('
    while ib < lb:
        bc = buffr[ib]
        ib += 1
        if bc == ')':           # closing parenthesis ends the acronym
            break
        if not ellyChar.isLetter(bc): return 0
        if ellyChar.isUpperCaseLetter(bc): nu += 1
    else:
        # while-else: loop ran to the limit without a break
        return 0    # must have enclosing ')'

    # ib now counts '(' + letters + ')'; reject if too short or if
    # uppercase letters do not dominate (requires ib <= 2*nu)
    if ib < Lmin or ib - 2*nu > 0: return 0
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]): return 0

    return ib       # chars matched, parentheses included
Пример #7
0
    def __init__ ( self , syms , fets=None , semantic=False ):

        """
        initialization

        arguments:
            self     -
            syms     - symbol table
            fets     - string representation of feature set
            semantic - flag for semantic features

        exceptions:
            FormatFailure on error
        """

        if syms == None or fets == None:  # special case generating zero feature set
            self.positive = ellyBits.EllyBits(symbolTable.FMAX)
            self.negative = ellyBits.EllyBits(symbolTable.FMAX)
            self.id = ''
            return

        segm = fets.lower()
#       print "features=",segm,"semantic=",semantic
        if segm == None or len(segm) < 3 or segm[0] != '[' or segm[-1] != ']':
            raise ellyException.FormatFailure
        elif segm[1] == ' ' or ellyChar.isLetterOrDigit(segm[1]) or segm[1] == '*':
            raise ellyException.FormatFailure
        else:
            self.id = segm[1]
#           print "id=",self.id
            fs = syms.getFeatureSet(segm[1:-1] , semantic)
#           print 'fs=' , str(fs[0]) , ',' , str(fs[1])
            if fs == None:
                raise ellyException.FormatFailure
            self.positive , self.negative = fs
Пример #8
0
def bound(segm):
    """
    get maximum limit on string for pattern matching
    (override this method if necessary)

    arguments:
        segm  - text segment to match against

    returns:
        char count
    """

    end = len(segm)  # at most the whole segment
    j = 0
    while j < end and segm[j] != ' ':
        j += 1  # advance to first space, if any
    j -= 1  # last char before any space
    # back off trailing chars that cannot end a match, keeping
    # '.', Unicode prime, '*', '+', and bracketing
    while j > 0:
        ch = segm[j]
        if ellyChar.isLetterOrDigit(ch):
            break
        if ch in Trmls or ch in (u'*', u'\u2032', u'+'):
            break
        j -= 1
    return j + 1
Пример #9
0
    def getRules(self, a):
        """
        get appropriate macros for text with specified starting char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of unpacked macro rules to try out
        """

        #       print ( 'getRules(a=' , a , ')' )
        if a == '': return []  # no char means no rules
        if ellyChar.isLetterOrDigit(a):
            k = ellyChar.toIndex(a)
            ls = self.index[k]  # rules indexed under this char
            #           print ( 'index a=' , a , 'k=' , k )
            # merge in letter or digit wildcard rules, then universal ones
            # NOTE(review): uniqueAdd mutates ls, and hence self.index[k]
            ws = self.letWx if ellyChar.isLetter(a) else self.digWx
            uniqueAdd(ls, ws)
            uniqueAdd(ls, self.anyWx)
        elif ellyChar.isApostrophe(a):
            ls = self.apoWx  # apostrophe rules only
        else:
            ls = self.index[0]  # rules for other non-alphanumerics
            uniqueAdd(ls, self.anyWx)
#       print ( len(ls) , ' rules to check' )
        return [r.unpack() for r in ls]  # unpacked copies of the rules
Пример #10
0
def acronym ( buffr ):

    """
    recognize parenthesized introduction of acronym in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    limit = len(buffr)
    if limit > Lmax: limit = Lmax       # cap scan at maximum length
    if limit < Lmin or buffr[0] != '(': return 0

    uppr = 0                            # count of uppercase letters seen
    pos = 1                             # scan starts just after '('
    closed = False                      # closing parenthesis found?
    while pos < limit:
        ch = buffr[pos]
        pos += 1
        if ch == ')':
            closed = True               # acronym properly enclosed
            break
        if not ellyChar.isLetter(ch):
            return 0
        if ellyChar.isUpperCaseLetter(ch):
            uppr += 1
    if not closed:
        return 0                        # must have enclosing ')'

    # reject if too short or uppercase letters do not dominate
    if pos < Lmin or pos - 2*uppr > 0: return 0
    # require proper termination after the ')'
    if len(buffr) > pos and ellyChar.isLetterOrDigit(buffr[pos]): return 0

    return pos                          # chars matched, parentheses included
Пример #11
0
def bound ( segm ):

    """
    get maximum limit on string for template matching
    (override this method if necessary)

    arguments:
        segm  - text segment to match against

    returns:
        char count
    """

    lng = len(segm)           # at most the whole segment
    k = 0
    while k < lng:            # find first break in text segment
        ch = segm[k]
        if ch in ( ellyChar.ELLP , ellyChar.NDSH , ellyChar.MDSH ):
            break             # ellipsis or dash always breaks
        if ch == ',' and k + 1 < lng and segm[k+1] == ' ':
            break             # comma followed by space also breaks
        k += 1
    k -= 1                    # last char before the break
    while k > 0:              # strip trailing non-alphanumerics,
                              # keeping '.' and '*' and bracketing
        ch = segm[k]
        if ch == '*' or ch in ellyWildcard.Trmls or ellyChar.isLetterOrDigit(ch):
            break
        k -= 1
    return k + 1              # matchable char count
Пример #12
0
    def getRules ( self , a ):

        """
        get appropriate macros for text with specified starting char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of unpacked macro rules to try out
        """

        if a == '':                        # no char means no rules
            return [ ]
        if ellyChar.isLetterOrDigit(a):
            rls = self.index[ellyChar.toIndex(a)]
            # merge in letter or digit wildcard rules plus universal ones
            if ellyChar.isLetter(a):
                uniqueAdd(rls,self.letWx)
            else:
                uniqueAdd(rls,self.digWx)
            uniqueAdd(rls,self.anyWx)
        elif ellyChar.isApostrophe(a):
            rls = self.apoWx               # apostrophe rules only
        else:
            rls = self.index[0]            # rules for other non-alphanumerics
            uniqueAdd(rls,self.anyWx)
        return [ r.unpack() for r in rls ]
Пример #13
0
def delimitKey ( t ):

    """
    get part of term for vocabulary table indexing that
    ends in alphanumeric or is a single nonalphanumeric
    with special stripping of 'S at the end

    arguments:
        t  - text string to scan

    returns:
        count of chars to put into search key
    """

    if len(t) == 0:
        return 0                       # nothing to scan
    cut = t.find(' ')                  # rough range of key for SQLite in text
    if cut < 0:
        cut = len(t)                   # undivided by spaces: take everything
    k = cut - 1                        # index of last char in range
    while k > 0:                       # scan backwards for alphanumeric ending
        ch = t[k]
        if ellyChar.isLetterOrDigit(ch):
            if k <= 1:
                break
            if ch in ( 's' , 'S' ) and ellyChar.isApostrophe(t[k-1]):
                k -= 1                 # skip a trailing 'S before its apostrophe
            else:
                break
        k -= 1                         # continue scanning backwards
    return k + 1                       # key length ending in alphanumeric
Пример #14
0
    def load ( self , stb , defn ):

        """
        get templates and user-defined word classes from input

        arguments:
            self  -
            stb   - Elly symbol table
            defn  - Elly definition reader for classes and templates

        exceptions:
            TableFailure on error
        """

        clss = [ ]                            # element classes
        while True:
            l = defn.readline()               # next definition line
            if len(l) == 0: break             # EOF check
            s = l.split(':=')                 # look for user-defined class
            if len(s) == 2:
                nme = s[0].strip()
                if len(nme) != 2 and not ellyChar.isLetterOrDigit(nme[1]):
                    self._err('improper class ID')
                    continue
                if nme in preClass:
                    self._err('cannot change predefined classes')
                    continue
                ls = s[1].split(',')             # list of words for class
                ls = list(w.strip() for w in ls) # just in case of extra spaces
#               print 'for class, ls=' , ls
                if not nme in self.ucls:      # define a new template category?
                    self.ucls[nme] = [ ]
                self.ucls[nme].extend(ls)     # add list of words to class
#               print 'class=' , self.ucls[nme]
                self.cfns[nme] = None
            else:
                tm = Template(stb,l)          # create a new template
                if tm.check() > 0:            # any problem here is fatal
#                   print 'template error'
                    self._errcount += 1
                    continue
                for elm in tm.lstg:           # get unique template categories
                    if elm[0] == Catg:
                        if not elm in clss:
                            clss.append(elm)
                self.tmpl.append(tm)          # add template to table

        missg = [ ]                           # to collect missing definitions
        for cx in clss:                       # check user-defined categories
            if not cx in self.cfns:
                if cx[1] != '*':
                    missg.append(cx)          # note if unsupported by class list
        lm = len(missg)
        if lm > 0:                            # this is a FATAL error
            print >> sys.stderr , lm , 'undefined template categories=' , missg
            self._errcount += lm
        if self._errcount > 0:
            print >> sys.stderr , 'table error count=' , self._errcount
            raise ellyException.TableFailure('templates')
Пример #15
0
    def load ( self , stb , defn ):

        """
        get templates and user-defined word classes from input

        arguments:
            self  -
            stb   - Elly symbol table
            defn  - Elly definition reader for classes and templates

        exceptions:
            TableFailure on error
        """

        clss = [ ]                            # element classes
        while True:
            l = defn.readline()               # next definition line
            if len(l) == 0: break             # EOF check
            s = l.split(':=')                 # look for user-defined class
            if len(s) == 2:
                nme = s[0].strip()
                if len(nme) != 2 and not ellyChar.isLetterOrDigit(nme[1]):
                    self._err('improper class ID')
                    continue
                if nme in preClass:
                    self._err('cannot change predefined classes')
                    continue
                ls = s[1].split(',')             # list of words for class
                ls = list(w.strip() for w in ls) # just in case of extra spaces
#               print ( 'for class, ls=' , ls )
                if not nme in self.ucls:      # define a new template category?
                    self.ucls[nme] = [ ]
                self.ucls[nme].extend(ls)     # add list of words to class
#               print ( 'class=' , self.ucls[nme] )
                self.cfns[nme] = None
            else:
                tm = Template(stb,l)          # create a new template
                if tm.check() > 0:            # any problem here is fatal
#                   print ( 'template error' )
                    self._errcount += 1
                    continue
                for elm in tm.lstg:           # get unique template categories
                    if elm[0] == Catg:
                        if not elm in clss:
                            clss.append(elm)
                self.tmpl.append(tm)          # add template to table

        missg = [ ]                           # to collect missing definitions
        for cx in clss:                       # check user-defined categories
            if not cx in self.cfns:
                if cx[1] != '*':
                    missg.append(cx)          # note if unsupported by class list
        lm = len(missg)
        if lm > 0:                            # this is a FATAL error
            print ( lm , 'undefined template categories=' , missg , file=sys.stderr )
            self._errcount += lm
        if self._errcount > 0:
            print ( 'table error count=' , self._errcount , file=sys.stderr )
            raise ellyException.TableFailure('templates')
Пример #16
0
    def __init__ ( self , syms , spec ):

        """
        initialization from input string and symbol table

        arguments:
            self  -
            syms  - current symbol table
            spec  - input string of the form CATEGORY[FEATURES]

        exceptions:
            FormatFailure on error
        """

        self.catg = -1    # values to set on an error
        self.synf = None  #

#       print >> sys.stderr , 'specification=' , spec
        if spec == None:
            print >> sys.stderr , '** null syntax specification'
            raise ellyException.FormatFailure

        s = spec.lower()  # must be lower case for all lookups

        n = 0             # count chars in leading category name
        for c in s:
            if not ellyChar.isLetterOrDigit(c) and c != '.': break
            n += 1

        if n == 0:
            print >> sys.stderr , '** no syntactic category'
            raise ellyException.FormatFailure

        typs = s[:n]        # save category name
#       print >> sys.stderr , 'catg=' , self.catg
        catg = syms.getSyntaxTypeIndexNumber(typs)

        s = s[n:].strip()   # feature part of syntax

        if len(s) == 0:     # check if there are any features
            synf = featureSpecification.FeatureSpecification(syms,None)
            if typs == '...':
                synf.id = '...'     # special marker for ... category
        elif typs == '...': # ... category may have no features!
            raise ellyException.FormatFailure
        else:               # decode features
#           print >> sys.stderr , 'syms=' , syms , 's=' , s
            # each category must keep a single feature ID char (s[1])
            # across all specifications; catid records the first one seen
            if len(s) > 3 and typs in catid and catid[typs] != s[1]:
                print >> sys.stderr , '** type' , typs.upper() , 'has two feature IDs:' , catid[typs] , s[1]
                raise ellyException.FormatFailure
            catid[typs] = s[1]
            synf = featureSpecification.FeatureSpecification(syms,s)

        # FormatFailure exception may be raised above, but will not be caught here

#       print >> sys.stderr , 'success'
        self.catg = catg    # set only on full success
        self.synf = synf
Пример #17
0
    def get(self, ts, n=N):
        """
        get normalized substring in lower case for subsequent comparisons

        Letters, digits, and chars in ALSO are collected (lower-cased)
        into self.string; a COMMA is passed over only when part of a
        digit grouping like 1,000 (digit before, three digits after).

        arguments:
            self -
            ts   - list of chars to get substring from
            n    - limit on count of chars to get

        returns:
            count of chars scanned for substring, 0 on rejection
        """

        sl = []  # char sublist to be matched
        #       print ( 'ts=' , ts )
        lts = len(ts)
        if lts == 0:
            return 0  # no chars to scan
        lm = lts if lts < n else n  # effective scan limit
        #       print ( 'lm=' , lm )
        i = 0
        c = ''  # current char; empty before first iteration
        while i < lm:  # scan input text up to char limit
            lc = c  # save previous char
            c = ts[i]  # get next char
            if c == COMMA:  # special treatment of COMMA
                #               print ( 'comma' )
                # accept COMMA only in a digit grouping: digit before
                # and at least three digits after, within the limit
                if (not ellyChar.isDigit(lc) or i + 3 >= lm
                        or not ellyChar.isDigit(ts[i + 1])
                        or not ellyChar.isDigit(ts[i + 2])
                        or not ellyChar.isDigit(ts[i + 3])):
                    break
            else:
                if not ellyChar.isLetterOrDigit(c):  # stop if not letter
                    if not c in ALSO: break  #   or "'" or "/" or "-"
                sl.append(c.lower())  # otherwise append to sublist
            i += 1

#       print ( 'i=' , i , '<' + c + '>' )

        if i < lm and ellyChar.isLetterOrDigit(ts[i]):  # proper termination?
            return 0  # if not, reject substring

        self.string = ''.join(sl)  # save normalized substring
        return i  # scan count
Пример #18
0
    def getFeatureSet ( self , fs , ty=False ):

        """
        get feature index associated with given name in given set

        arguments:
            self  -
            fs    - feature set without enclosing brackets,
                    an ID char followed by comma-separated names,
                    each optionally signed with '+' or '-'
            ty    - False=syntactic, True=semantic

        returns:
            list of EllyBits [ positive , negative ] on success, None on failure
        """

        if len(fs) < 1: return None

        bp = ellyBits.EllyBits(FMAX) # all feature bits zeroed
        bn = ellyBits.EllyBits(FMAX) #

        fsx = self.sxindx if not ty else self.smindx  # select feature index by type
#       print '--------  fs=' , fs
        fid = fs[0]                # feature set ID
        fnm = fs[1:].split(',')    # feature names
        if not fid in fsx:         # known ID?
            fsx[fid] = { }         # if not, make it known
        h = fsx[fid]               # for hashing of feature names
        if len(fnm) == 0:          # check for empty features
            return [ bp , bn ]
        for nm in fnm:
            nm = nm.strip()
            if len(nm) == 0: continue
            if nm[0] == '-':       # negative feature?
                b = bn             # if so, look at negative bits
                nm = nm[1:]
            elif nm[0] == '+':     # positive feature?
                b = bp             # if so, look at positive bits
                nm = nm[1:]
            else:
                b = bp             # positive bits by default

#           print '--------  nm=' , nm
            for c in nm:           # feature name must be alphanumeric or '*'
                if not ellyChar.isLetterOrDigit(c) and c != '*':
                    return None
            if not nm in h:        # new name in feature set?
                k = len(h)         # if so, assign it the next free bit index
                l = FMAX           # limit for feature index
                if not ty:         # adjustment for extra predefined
                    k -= 3         # syntactic feature names *L and *R
                    l -= 1         # and for *UNIQUE
                if k == l:         # overflow check
                    print >> sys.stderr, '+* too many feature names'
                    return None
                h[nm] = k          # record bit index for name

            b.set(h[nm])           # set bit for feature
        return [ bp , bn ]
Пример #19
0
def delimitKey(t):
    """
    get bounds of vocabulary table key for looking up a term
    starting at the front of a given text string
    with special stripping of 'S at the end

    arguments:
        t  - text string to scan

    returns:
        count of chars to take for search key
    """

    ln = len(t)  # number of chars in input text
    if ln == 0: return 0
    if not ellyChar.isLetterOrDigit(t[0]): return 1  # single nonalphanumeric key

    #   print ( 'delimitKey t=' , t )

    k = t.find('-')  # find rough range of SQLite key in text
    n = t.find(' ')  # delimited by either a hyphen or a space
    if n < 0: n = ln  # if no space, take everything
    if k > 1 and n > k: n = k  # hyphen delimits if it comes first
    n -= 1  # index of last char of candidate key
    #   print ( 'k=' , k , 'n=' , n )

    while n > 0:  # scan input text backwards
        c = t[n]  # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
            #           print ( 'n=' , n , 'c=' , c )
            if n > 1:  # check for 'S as special case!
                if (c in ['s', 'S'] and ellyChar.isApostrophe(t[n - 1])):
                    #                   print ( 'drop \'S from SQLite key' )
                    n -= 1  # with the decrement below, skips both chars of 'S
                else:
                    break
            else:
                break
        n -= 1  # continue scanning backwards


#   print ( 'key=' , t[:n+1] )
    return n + 1  # to get key length ending in alphanumeric
Пример #20
0
def delimitKey ( t ):

    """
    get bounds of vocabulary table key for looking up a term
    starting at the front of a given text string
    with special stripping of 'S at the end

    arguments:
        t  - text string to scan

    returns:
        count of chars to take for search key
    """

    if len(t) == 0:
        return 0                       # empty text: no key
    if not ellyChar.isLetterOrDigit(t[0]):
        return 1                       # single nonalphanumeric key

    hy = t.find('-')                   # rough range of SQLite key is
    sp = t.find(' ')                   # delimited by hyphen or space
    if sp < 0:
        sp = len(t)                    # no space: take everything
    if hy > 1 and sp > hy:
        sp = hy                        # hyphen delimits if it comes first
    k = sp - 1                         # index of last char of candidate key

    while k > 0:                       # scan backwards for alphanumeric ending
        ch = t[k]
        if ellyChar.isLetterOrDigit(ch):
            if k <= 1:
                break
            if ch in ( 's' , 'S' ) and ellyChar.isApostrophe(t[k-1]):
                k -= 1                 # skip a trailing 'S before its apostrophe
            else:
                break
        k -= 1                         # continue scanning backwards
    return k + 1                       # key length ending in alphanumeric
Пример #21
0
def stateZip(buffr):
    """
    recognize U.S. state abbreviation and zip code

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    if len(buffr) < 8 or buffr[2] != ' ': return 0  # need "XX 12345" at least
    st = ''.join(buffr[:2]).upper()  # expected 2-char state abbreviation
    if not st in ziprs: return 0  # if not known, quit
    zc = ziprs[st]  # expected zip-code starting digits for state
    b = buffr[3:]  # expected start of zipcode
    i = 0
    for c in zc:  # check starting digits of zipcode against state prefix
        if c != b[i]: return 0
        i += 1
    while i < 5:  # check for digits in rest of zipcode
        if not ellyChar.isDigit(b[i]): return 0
        i += 1
    b = b[5:]  # look for proper termination
    if len(b) == 0:  # if end of input, success
        return 8  # success: 5-digit zip
    c = b[0]
    if ellyChar.isLetterOrDigit(c):  # if next char is alphanumeric, failure
        return 0
    elif b[0] == '-':  # look for possible 9-digit zip
        if len(b) > 5:
            b = b[1:]
            for i in range(4):
                if not ellyChar.isDigit(b[i]):
                    return 0  # check for 4 more digits
            b = b[4:]  # past end of 4 digits
            if len(b) > 0 and ellyChar.isLetterOrDigit(b[0]):
                return 0  # termination check
            return 8 + 5  # success: 9-digit zip
        # FIX: original fell off the function here, returning None;
        # an incomplete ZIP+4 extension is a failure per the contract
        return 0
    else:
        return 8  # success: 5-digit zip
Пример #22
0
def terminate(ss, sp, lss=None):
    """
    check char for termination of match

    arguments:
        ss  - char input stream
        sp  - char position in stream
        lss - effective stream length; computed from ss when None

    returns:
        True if terminating char or past end of input, False otherwise
    """

    # FIX: compare to None with 'is', per PEP 8
    if lss is None: lss = len(ss)
    # short-circuit avoids indexing past the effective end
    return sp >= lss or not ellyChar.isLetterOrDigit(ss[sp])
Пример #23
0
def stateZip ( buffr ):

    """
    recognize U.S. state abbreviation and zip code

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    if len(buffr) < 8 or buffr[2] != ' ': return 0  # need "XX 12345" at least
    st = ''.join(buffr[:2]).upper()    # expected 2-char state abbreviation
    if not st in ziprs: return 0       # if not known, quit
    zc = ziprs[st]                     # get zip-code start digits for state
    b = buffr[3:]                      # expected start of zipcode
    i = 0
    for c in zc:                       # check starting digits of zipcode
        if c != b[i]: return 0
        i += 1
    while i < 5:                       # check for digits in rest of zipcode
        if not ellyChar.isDigit(b[i]): return 0
        i += 1
    b = b[5:]                          # look for proper termination
    if len(b) == 0:                    # if end of input, success
        return 8                       # success: 5-digit zip
    c = b[0]
    if ellyChar.isLetterOrDigit(c):    # if next char is alphanumeric, failure
        return 0
    elif b[0] == '-':                  # look for possible 9-digit zip
        if len(b) > 5:
            b = b[1:]
            for i in range(4):
                if not ellyChar.isDigit(b[i]): return 0 # check for 4 more digits
            b = b[4:]                                   # past end of 4 digits
            if len(b) > 0 and ellyChar.isLetterOrDigit(b[0]): return 0 # termination check
            return 8 + 5                                # success: 9-digit zip
        return 0                       # FIX: original fell through and returned
                                       # None; incomplete ZIP+4 is a failure
    else:
        return 8                       # success: 5-digit zip
Пример #24
0
def terminate ( ss , sp , lss=None ):

    """
    check char for termination of match

    arguments:
        ss  - char input stream
        sp  - char position in stream
        lss - effective stream length; computed from ss when None

    returns:
        True if terminating char or past end of input, False otherwise
    """

    # FIX: compare to None with 'is', per PEP 8
    if lss is None: lss = len(ss)
    # short-circuit avoids indexing past the effective end
    return sp >= lss or not ellyChar.isLetterOrDigit(ss[sp])
Пример #25
0
def _terminate ( c ):

    """
    check char for termination of match

    arguments:
        c  - char to check

    returns:
        True if termination, False otherwise
    """

    if ellyChar.isLetterOrDigit(c):   # alphanumeric continues a match
        return False
    return True                       # anything else terminates
Пример #26
0
    def getConcept ( self , name ):

        """
        get concept, creating if necessary in index under name

        arguments:
            self  -
            name  - of concept

        returns:
            concept for name other than NOname, otherwise None
        """

        key = name.strip().upper()     # normalize to canonical form
        if key == NOname:
            return None                # reserved non-name gets nothing
        if key == TOP:
            return self.index[TOP]     # root concept is predefined
        if len(key) == 0 or not ellyChar.isLetterOrDigit(key[0]):
            return None                # must start with alphanumeric
        if key in self.index:
            return self.index[key]     # already known
        if any(not ellyChar.isLetterOrDigit(ch) for ch in key):
            return None                # every char must be alphanumeric
        concept = Concept(key)         # create and register new concept
        self.index[key] = concept
        return concept
Пример #27
0
    def getConcept(self, name):
        """
        get concept, creating if necessary in index under name

        arguments:
            self  -
            name  - of concept

        returns:
            concept for name other than NOname, otherwise None
        """

        nm = name.strip().upper()          # canonical lookup key
        if nm == NOname:
            return None                    # reserved non-name
        if nm == TOP:
            return self.index[TOP]         # predefined root concept
        if len(nm) == 0 or not ellyChar.isLetterOrDigit(nm[0]):
            return None                    # must begin alphanumeric
        if nm in self.index:
            return self.index[nm]          # previously registered
        for ch in nm:                      # reject any nonalphanumeric char
            if not ellyChar.isLetterOrDigit(ch):
                return None
        made = Concept(nm)                 # make and register new concept
        self.index[nm] = made
        return made
Пример #28
0
    def __init__ ( self , syms , fets=None , semantic=False ):

        """
        initialization

        arguments:
            self     -
            syms     - symbol table
            fets     - string representation of feature set
            semantic - flag for semantic features

        exceptions:
            FormatFailure on error
        """

        if syms == None:                 # special case generating zero feature set
            self.positive = ellyBits.EllyBits(symbolTable.FMAX)
            self.negative = ellyBits.EllyBits(symbolTable.FMAX)
            return

        segm = fets.lower() if fets != None else '[?]'
#       print "features=",segm,"semantic=",semantic
        if segm == None or len(segm) < 3 or segm[0] != '[' or segm[-1] != ']':
            raise ellyException.FormatFailure
        elif segm[1] == ' ' or ellyChar.isLetterOrDigit(segm[1]):
            raise ellyException.FormatFailure
        else:
            self.id = segm[1]
#           print "id=",self.id
            fsindx = syms.sxindx if not semantic else syms.smindx
            if not self.id in fsindx:
#               print 'new feature set'
                d = { }                  # new dictionary of feature names
                if not semantic:
                    d['*r'] = 0          # always define '*r' as syntactic feature
                    d['*right'] = 0      # equivalent to '*r'
                    d['*l'] = 1          # always define '*l'
                    d['*left']  = 1      # equivalent to '*l'
                    d['*unique'] = LAST  # always define
                fsindx[self.id] = d      #   and save
            fs = syms.getFeatureSet(segm[1:-1] , semantic)
            if fs == None:
                raise ellyException.FormatFailure
            self.positive , self.negative = fs
Пример #29
0
def title ( buffr ):

    """
    recognize double-quoted title in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    limit = len(buffr)
    if limit > Tmax: limit = Tmax      # never scan past maximum title length
    if limit < Tmin: return 0          # too short to be a title
    qm = buffr[0]                      # opening quotation mark
    if qm != aDQ and qm != lDQ: return 0

    closer = aDQ if qm == aDQ else rDQ # closing mark must match opening style
    closed = False                     # saw proper closing quote?
    k = 1
    while k < limit:                   # one capitalized word per iteration
        ch = buffr[k]
        k += 1
        if ch == rDQ:                  # immediate close ends the title
            closed = True
            break
        if not ellyChar.isUpperCaseLetter(ch): return 0

        while k < limit:               # scan out rest of current word
            ch = buffr[k]
            k += 1
            if ch == ' ':              # space ends the word
                break
            if ch == closer:           # closing quote ends the word
                break
            if ch == '!' or ch == '?': # sentence punctuation disallowed
                return 0
        if ch == rDQ or ch == aDQ:     # title complete on closing quote
            closed = True
            break

    if not closed:
        return 0    # must have enclosing rDQ or aDQ

    if k < Tmin: return 0              # matched span too short
    if len(buffr) > k and ellyChar.isLetterOrDigit(buffr[k]): return 0

    return k
Пример #30
0
def title ( buffr ):

    """
    recognize double-quoted title in text

    a title is an opening double quote (ASCII or left Unicode),
    one or more words each starting with an uppercase letter,
    and a matching closing double quote

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Tmax: lb = Tmax            # cap scan at maximum title length
    if lb < Tmin: return 0             # too short to be a title
    qm = buffr[0]                      # expected opening quote mark
    if qm != aDQ and qm != lDQ: return 0

    ib = 1
    while ib < lb:                     # outer loop handles one word per pass
        bc = buffr[ib]
        ib += 1
        if bc == rDQ:                  # immediate closing quote ends scan
            break
        if not ellyChar.isUpperCaseLetter(bc): return 0  # word must start uppercase

        while ib < lb:                 # inner loop scans rest of current word
            bc = buffr[ib]
            ib += 1
            if bc == ' ': break        # space ends the word
            if qm == aDQ:              # closing mark must match opening style
                if bc == aDQ: break
            else:
                if bc == rDQ: break
            if bc in [ '!' , '?' ]:    # sentence punctuation disallowed in title
                return 0
        if bc == rDQ or bc == aDQ: break  # title complete on closing quote
    else:
        return 0    # must have enclosing rDQ or aDQ
                    # (loop-else runs only when no break was taken)

    if ib < Tmin: return 0             # matched span too short
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]): return 0  # need break after quote

    return ib                          # chars matched, including both quotes
Пример #31
0
    def _findAMorPM ( self , ts ):

        """
        look for AM or PM in time expression

        arguments:
            self  -
            ts    - char list

        returns:
            length of string match on success, 0 otherwise
        """

        lt = len(ts)                        # chars available to match
        if lt < 2:                          # too short for any match at all
            return 0
        pos = 1 if ts[0] == ' ' else 0      # allow one leading space
        first = ts[pos].lower()
        if first != 'a' and first != 'p':   # must begin with A or P
            return 0
        pos += 1
        if pos == lt:                       # end of input check
            return 0
        if ts[pos] == '.':                  # '.' after first letter is optional
            pos += 1
        if pos == lt:                       # end of input check
            return 0
        if ts[pos].lower() != 'm':          # must end with M
            return 0
        pos += 1
        if pos < lt and ellyChar.isLetterOrDigit(ts[pos]):
            return 0                        # no break after match means failure
        self._xm = first                    # save just 'a' or 'p'
        return pos                          # match count for success
Пример #32
0
def toIndex ( t ):

    """
    get part of term for vocabulary table indexing that
    ends in alphanumeric or is a single nonalphanumeric

    arguments:
        t  - term as string

    returns:
        count of chars to index
    """

    size = len(t)                 # total chars in term
    if size == 0:
        return 0
    brk = t.find(' ')             # first part of term ends at space
    if brk < 0:
        brk = size                # indivisible term: take everything
    pos = brk - 1                 # back up over trailing nonalphanumerics
    while pos > 0:
        if ellyChar.isLetterOrDigit(t[pos]):
            break
        pos -= 1
    return pos + 1
Пример #33
0
    def getRules ( self , a ):

        """
        get appropriate macros for text starting with specified first char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of macro rules to try out
        """

        if a == '':
            return [ ]                       # nothing to match against
        if not ellyChar.isLetterOrDigit(a):
            return self.index[0] + self.anyWx  # nonalphanumeric bucket
        n = ellyChar.toIndex(a)              # index by char class
        extra = self.letWx if ellyChar.isLetter(a) else self.digWx
        return self.index[n] + extra + self.anyWx
Пример #34
0
def scan(strg):
    """
    check for extent of syntax specification

    arguments:
        strg  - string of chars to scan

    returns:
        char count > 0 on finding possible syntax specification, 0 otherwise
    """

    n = 0
    ls = len(strg)
    while n < ls:                 # count leading syntax-type chars
        ch = strg[n]
        if ch != '.' and not ellyChar.isLetterOrDigit(ch):
            break
        n += 1
    if n == ls:                   # whole string consumed: done
        return n
    nxt = strg[n]                 # first char past syntax type
    if nxt == ' ':
        return n
    if nxt != '[':                # only a feature list may follow
        return 0
    k = featureSpecification.scan(strg[n:])
    if k > 0:
        return n + k              # include feature specification
    return 0
Пример #35
0
def terminate(spc, npc):
    """
    check char for termination of match range
    arguments:
        spc  - current char in stream
        npc  - next char in stream
    returns:
        True if current char terminates, False otherwise
    """

    if spc in EMBs:                    # embedded punctuation terminates
        return npc in EMBs             #   only when followed by more of it
    if spc in APOs or ellyChar.isLetterOrDigit(spc):
        return False                   # apostrophe or alphanumeric continues
    return True                        # anything else terminates
Пример #36
0
def scan ( strg ):

    """
    check for extent of syntax specification

    arguments:
        strg  - string of chars to scan

    returns:
        char count > 0 on finding possible syntax specification, 0 otherwise
    """

    n = 0
    for ch in strg:                    # count leading syntax-type chars
        if ch != '.' and not ellyChar.isLetterOrDigit(ch):
            break
        n += 1
    else:
        return n                       # entire string is syntax type
    follower = strg[n]                 # first char past syntax type
    if follower == ' ':
        return n
    if follower != '[':                # only a feature list may follow
        return 0
    k = featureSpecification.scan(strg[n:])
    return n + k if k > 0 else 0
Пример #37
0
    def _scanText ( self , k ):

        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary and pattern tables and also
        running Elly entity extractors

        matching phrases are added to the parse tree as a side effect

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            vocabulary table record
        """

        sb = self.sbu.buffer           # input buffer
        tr = self.ptr                  # parse tree for results

        # NOTE(review): rs and vrs both start out as aliases of the
        # module-level record drs, so rs.mtchl = 0 below mutates that
        # shared object -- confirm this reuse of drs is intended
        rs = drs                       # initialize to empty vocabulary table record
        rs.mtchl = 0                   # maximum match count
        lm = len(sb)                   # scan limit
#*      print 'next component=' , sb[:k] , ', context=' , sb[k:lm]

        vrs = drs                      # initially, set no maximum match
        if self.vtb != None:           # look in external dictionary first, if it exists
            if k > 1:                  # is first component a single char?
                ks = k                 # if not, use this for indexing
            else:
                ks = 1                 # otherwise, add on any following alphanumeric
                while ks < lm:         #
                    if not ellyChar.isLetterOrDigit(sb[ks]):
                        break
                    ks += 1
            ss = u''.join(sb[:ks])     # where to start for indexing
            n = vocabularyTable.toIndex(ss)  # get actual indexing
            vs = self.vtb.lookUp(sb,n) # get list of the longest matches
            if len(vs) > 0:            #
                r = vs[0][1]           # if any matches, look at first
                m = r.mtchl            # all other nominal lengths must be the same!
#*              print len(vs) , 'matching vocabulary entries'
                for v in vs:
                    ve  = v[0]         # get vocabulary entry
                    vrs = v[1]         # result record for match
#                   print 've=' , ve
#                   if ve.gen != None: print ve.gen
                    if tr.addLiteralPhraseWithSemantics(
                           ve.cat,ve.syf,ve.smf,ve.bia,ve.gen):
                        tr.lastph.lens = m   # set char length of leaf phrase node
                                             # just added for later selection
                        tr.lastph.cncp = ve.con
                rs.mtchl = m           # update maximum for new matches
#*      print 'vocabulary m=' , rs.mtchl

        d = self.rul                   # grammar rule definitions

        m = d.ptb.match(sb,tr)         # try entity by pattern match next
#*      print 'pattern m=' , m
        if rs.mtchl < m:
            rs.mtchl = m               # on longer match, update maximum

        m = self.iex.run(sb)           # try entity extractors next
#*      print 'extractor m=' , m
        if rs.mtchl < m:
            rs.mtchl = m               # on longer match, update maximum

#*      print 'maximum match=' , rs.mtchl
#       print 'input=' , self.sbu.buffer

        if rs.mtchl > 0:               # any matches at all?
            nd = tr.requeue()          # if so, keep only longest of them
#           print nd , 'phrases dropped by requeue()'

            if vrs.mtchl == rs.mtchl:  # this a vocabulary match?
                rs = vrs               # if so, use vocabulary match results

        return rs
Пример #38
0
def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text,
    backtracking on failure through saved unwinding records

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching (defaults to length of text)

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # three private functions using local variables of match()
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])  # calculate min char count to match rest of pattern

#       print "exclude=",k,"@",offs

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findBreak(text,offs) - k # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # char type matching a wildcard

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit

#   print text[offs:limt],":",list(patn)

    while True:

        ## literally match as many next chars as possible
        ## afterward, last holds the most recently read char
        ## still unmatched, or '' when input is exhausted

        while mp < ml:
            if offs >= limt:
                last = ''
            else:
                last = text[offs].lower()
                offs += 1
#           print 'matching last=' , last , 'at' , offs
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat',mp,"<",ml
#       print "txt @",offs

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",ord(tc)

        if tc == cALL:   # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1  # get new binding record
            bf[0] = offs              # bind from current offset
            offs += nm                # move offset past end of span
            bf[1] = offs              # bind to   new     offset
#           print "offs=",offs
            uf = _mark(1); unj += 1   # get new unwinding record
            uf.count = nm             # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
            # NOTE: when none of these cases continue, control falls
            # through to the match-failure unwinding below
#           print "END $:",last
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:"
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS: # start of optional subsequence?
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1   # dummy record to block
            mf[0] = -1                #   later binding consolidation
            if last != '':
                offs -= 1             # try for rematch
            m = mp                    # find corresponding EOS
            while m < ml:             #
                if patn[m] == cEOS: break
                m += 1
            else:                     # no EOS?
                m -= 1                # if so, pretend there is one anyway
            uf = _mark(0); unj += 1   # for unwinding on any later match failure
            uf.pats = m + 1           # i.e. one char past next EOS
            uf.txts = offs            # start of text before optional match
            continue

        elif tc == cEOS: # end of optional subsequence?
#           print "EOS"
            if last != '':
                offs -= 1             # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL: # span wildcard?
            if last != '':            # still more to match?
                offs -= 1
                nm = _span(tc)        # maximum match possible
#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
                    continue

        elif tc == '':   # end of pattern?
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch

#       print "fail - unwinding",unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # tagged by _modify()? keep it separate
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    return mbd             # consolidated bindings plus new offset
Пример #39
0
    def _scanText(self, k):
        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary and pattern tables and also
        running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , match types , vocabulary match chars , suffix removed ]
        """

        #       print ( '_scanText k=' , k )
        sb = self.sbu.buffer  # input buffer

        # match status
        nspan = 0  #   total span of match
        mtype = ''  #   no match type yet (accumulates 'Vt', 'Fa', 'Ee' codes)
        vmchs = []  #   chars of vocabulary entry matched
        suffx = ''  #   any suffix removed in match

        lm = len(sb)  # scan limit
        #       print ( 'next component=' , sb[:k] , ', context=' , sb[k:lm] )

        if self.vtb != None:  # look in external dictionary first, if it exists
            if k > 1:  # is first component a single char?
                ks = k  # if not, use this for indexing
            else:
                ks = 1  # otherwise, add on any following alphanumeric
                while ks < lm:  #
                    if not ellyChar.isLetterOrDigit(sb[ks]):
                        break
                    ks += 1
            ss = ''.join(sb[:ks])  # where to start for indexing
            #           print ( 'ss=' , ss )
            n = vocabularyTable.delimitKey(ss)  # get actual indexing
            #           print ( 'n=' , n )
            rl = self.vtb.lookUp(sb, n)  # get list of the longest matches
            if len(rl) > 0:  #
                #               print ( 'len(rl)=' , len(rl) )
                r0 = rl[0]  # look at first record
                nspan = r0.nspan  # should be same for all matches
                mtype = 'Vt'
                vmchs = r0.vem.chs  #
                suffx = r0.suffx  #

#       print ( 'vocabulary m=' , nspan )

        d = self.rul  # grammar rule definitions

        m = d.ptb.match(sb, self.ptr)  # try entity by pattern match next
        #       print ( 'pattern m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum
            mtype = 'Fa'  # pattern match supersedes vocabulary
        elif m > 0 and nspan == m:
            mtype = 'VtFa'  # tie: record both match types

#       print ( 'mtype=' , mtype )

        m = self.iex.run(sb)  # try entity extractors next
        #       print ( 'extractor m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum
            mtype = 'Ee'  # extractor supersedes earlier matches
        elif m > 0 and nspan == m:
            mtype += 'Ee'  # unchanged match length, add type

#       print ( 'maximum match=' , nspan )
#       print ( 'mtype=' , mtype )
#       print ( 'input=' , self.sbu.buffer[:nspan] )

        return [nspan, mtype, vmchs, suffx]
Пример #40
0
    def getNext(self):
        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

        #       print ( 'getNext' )

        self.resetBracketing()
        inBrkt = False

        nspc = 0  # set space count

        sent = []  # list buffer to fill

        x = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:  # EOF check
            return None

        c = END  # reset
        lc = END

        #       print ( 'x=' , '<' + x + '>' , ord(x) )
        self.inp.unread(x, SP)  # put first char back to restore input
        #       print ( '0  <<' , self.inp.buf )

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0  # alphanumeric count in sentence

        while True:

            x = self.inp.read()  # next input char

            if x == END:  # handle any EOF
                break

#           print ( 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>' )
#           print ( 'sent=' , sent , 'nspc=' , nspc )

# check for table delimiters in text

            if len(sent) == 0:
                #               print ( 'table' )
                #               print ( '1  <<' , self.inp.buf )

                if x == '.' or x == '-':  # look for multiple '.' or '-'
                    while True:  # scan up to end of current buffering
                        y = self.inp.read()  #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break  #
                    continue  # ignore everything seen so far

            ####################################################
            # accumulate chars and count alphanumeric and spaces
            ####################################################

            lc = c
            c = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

            #           print ( 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>' )
            if lc == SP or lc == END:  # normalize chars for proper bracketing
                if x == SQuo:  #
                    x = LSQm  # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:  #
                    x = LDQm  # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END:  #
                if x == SQuo:  # a SQuo followed by a space becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by a space becomes RDQm
                    x = RDQm  #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:  # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm  #
            elif ellyChar.isWhiteSpace(c) and inBrkt:
                nspc += 1

            svBrkt = inBrkt
            inBrkt = self.checkBracketing(
                x)  # do bracketing check with modified chars
            if svBrkt and not inBrkt: nspc = 0

            #           print ( 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt )

            sent.append(c)  # put original char into sentence buffer
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue  # if alphanumeric, just add to sentence

            if c == SP:
                continue  # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()  # remove from sentence chars
                break

            # certain Unicode punctuation will always break

            if c in Hards:
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            cx = self.inp.preview()  # for context of match call

            #           print ( '0  <<' , self.inp.buf )

            #           print ( 'sent=' , sent[:-1] )
            #           print ( 'punc=' , '<' + c + '>' )
            #           print ( 'next=' , cx )
            if c in Stops and len(cx) > 0 and cx[0] == SP:
                if self.stpx.match(sent[:-1], c, cx):
                    #                   print ( 'stop exception MATCH' )
                    if self.drop:
                        sent.pop()  # remove punctuation char from sentence
                        lc = SP
                    continue

#           print ( 'no stop exception MATCH for' , c )

#           print ( '@1  <<' , self.inp.buf )

# handle any nonstandard punctuation

            exoticPunctuation.normalize(c, self.inp)

            #           print ( '@2  <<' , self.inp.buf )

            # check for dash

            if c == '-':
                d = self.inp.read()
                if d == '-':
                    #                   print ( 'dash' )
                    while True:
                        d = self.inp.read()
                        if d != '-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print ( '@3  c=' , c , inBrkt )

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

                #               print ( 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent) )

                if not inBrkt:
                    #                   print ( sent , 'so far' )
                    z = self.inp.read()
                    if self.shortBracketing(sent, z):
                        break
                    self.inp.unread(z)
                    #                   print ( 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']' )
                    if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                        if nAN > 1:
                            break
                elif c in QUOs and lc in Stops:
                    #                   print ( 'stop+quote' )
                    z = self.inp.read()
                    if z in RBs:
                        sent.append(z)
                        y = self.inp.read()
                        if y in Stops:
                            sent.append(y)
                        elif not ellyChar.isWhiteSpace(y):
                            self.inp.unread(y)
                        inBrkt = False
                        break
                    elif z in QUOs:
                        #                       print ( 'stop+quote+quote' )
                        sent.append(z)
                        inBrkt = False
                        break
                    self.inp.unread(z)
#               print ( 'continue' )
                continue

            elif not c in Stops:
                continue

            else:
                #               print ( 'check stopping!' )
                d = self.inp.read()
                #               print ( '@3  <<' , self.inp.buf )

                if d == None: d = '!'
                #               print ( 'stop=' , '<' + c + '> <' + d + '>' )

                #               print ( 'ellipsis check' )
                if c == '.' and c == d:
                    if self.inp.peek() != c:  # look for third '.' in ellipsis
                        self.inp.unread(d)  # if none, keep only first '.'
                    else:
                        self.inp.skip()  # found ellipsis
                        sent.append(d)  # complete it in sentence buffer
                        sent.append(d)  #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(
                                SP
                            )  # if part of token, put in space as separator
                    continue

                if c == ELLP:
                    #                   print ( 'found Unicode ellipsis, d=' , d )
                    if ellyChar.isUpperCaseLetter(d):
                        self.inp.unread(
                            d)  # super special case of bad punctuation
                        self.inp.unread(' ')  # put in implied period and space
                        self.inp.unread('.')  #

                # special check for multiple stops

#               print ( 'next char d=' , d , ord(d) if d != END else 'NONE' )
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP  # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent, d): break
                    if d in self._cl and self._cl[d] == 1:
                        dn = self.inp.peek()
                        if ellyChar.isWhiteSpace(dn):
                            sent.append(d)
                            break
                    self.inp.unread(d)
                    #                   print ( 'no space after punc' )
                    continue

                # if no match for lookahead, put back

                elif d != END:
                    #                   print ( 'unread d=' , d )
                    self.inp.unread(d)

#               print ( 'possible stop' )

# check special case of number ending in decimal point

                if c == '.':
                    ixb = len(sent) - 2
                    ixn = ixb + 1
                    cxn = ''
                    #                   print ( 'sent=' , sent )
                    #                   print ( 'ixn=' ,ixn )
                    while ixn > 0:
                        ixn -= 1
                        cxn = sent[ixn]
                        #                       print ( 'cxn=' , cxn )
                        if not ellyChar.isDigit(cxn): break
#                   print ( 'break: ixn=' , ixn , 'ixb=' , ixb )
                    if ixn < ixb and cxn in [' ', '-', '+']:
                        prvw = self.inp.preview()
                        #                       print ( 'prvw=' , prvw )
                        if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(
                                prvw[1]):
                            continue

                # final check: is sentence long enough?

                if inBrkt:
                    #                   print ( 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview() )
                    #                   print ( 'nspc=' , nspc )
                    if c in [':', ';'] or nspc < 3:
                        sent.append(d)
                        #                       print ( 'add' , '<' + d + '> to sentence' )
                        #                       print ( 'sent=' , sent )
                        self.inp.skip()
                        nspc -= 1
                        continue

#               print ( '@4  <<' , self.inp.buf )
                cx = self.inp.peek()
                if cx == None: cx = '!!'
                #               print ( 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent )
                #               print ( 'nAN=' , nAN , 'inBrkt=' , inBrkt )
                if nAN > 1:
                    break

        if sent == ['\u2026']:  # special case of sentence
            return list("-.-")  # with lone ellipsis
        elif len(sent) > 0 or self.last != END:
            return sent
        else:
            return None
Пример #41
0
    def __init__ ( self , syms , spec ):

        """
        initialization from input string and symbol table

        arguments:
            self  -
            syms  - current symbol table
            spec  - input string

        exceptions:
            FormatFailure on error
        """

        # preset to error values so a failed init leaves a recognizable state
        self.catg = -1
        self.synf = None

        if spec == None:
            print >> sys.stderr , '** null syntax specification'
            raise ellyException.FormatFailure

        low = spec.lower()                  # all table lookups expect lower case

        # measure the leading category name: letters, digits, or '.' only
        ncat = 0
        lgth = len(low)
        while ncat < lgth:
            ch = low[ncat]
            if not ellyChar.isLetterOrDigit(ch) and ch != '.':
                break
            ncat += 1

        if ncat == 0:
            print >> sys.stderr , '** no syntactic category'
            raise ellyException.FormatFailure

        name = low[:ncat]                   # category name as written
        cat = syms.getSyntaxTypeIndexNumber(name)
        if cat == None:
            raise ellyException.FormatFailure

        rest = low[ncat:].strip()           # remainder is the feature part

        if len(rest) == 0:                  # no features given?
            fspc = featureSpecification.FeatureSpecification(syms,None)
            if name == '...':
                fspc.id = '...'
        elif name == '...':                 # ... category may have no features!
            raise ellyException.FormatFailure
        else:                               # decode the feature specification
            if len(rest) < 4:
                print >> sys.stderr , '** bad syntactic type or missing features= ' , name+rest
                raise ellyException.FormatFailure
            # a category may be associated with only one feature-set ID;
            # rest[1] is the ID char inside the bracketed feature list
            if name in catid and catid[name] != rest[1]:
                print >> sys.stderr , '** type' , name.upper() , 'has two feature IDs:' , catid[name] , rest[1]
                raise ellyException.FormatFailure
            catid[name] = rest[1]
            fspc = featureSpecification.FeatureSpecification(syms,rest)

        # FormatFailure exception may be raised above, but will not be caught here

        # success: record category index and feature specification
        self.catg = cat
        self.synf = fspc
Пример #42
0
    def match ( self , txt , pnc , nxt ):

        """
        compare a punctuation mark and its context with a pattern

        arguments:
            self  -
            txt   - list of text chars up to and including punctuation char
            pnc   - punctuation char
            nxt   - single char after punctuation

        returns:
            True on match, False otherwise
        """

#       print 'matching for txt=' , txt , 'pnc=' , pnc , 'nxt=' , nxt

        # no patterns stored for this punctuation char means no match possible
#       print 'lstg=' , self.lstg
        if not pnc in self.lstg:  # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

#       print len(lp) , 'patterns'

        # keep at most self.maxl chars of left context for matching
        txl = txt[-self.maxl:] if len(txt) > self.maxl else txt

        # NOTE(review): under Python 2, map() returns a list, which len() and
        # negative slicing below require; under Python 3 this would break —
        # confirm the target interpreter version
        txs = map(lambda x: x.lower(),txl) # actual left context for matching

        lt = len(txs)             # its length

#       print 'txs= ' + unicode(txs) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']'

        for p in lp:              # try matching each pattern

            # first check the left-context part of the pattern, if any

            if p.left != None:

                n = len(p.left)   # assume each pattern element must match one sequence char
#               print 'n=' , n , 'p=' , unicode(p)
                if n > lt:
                    continue      # fail immediately because of impossibility of match
                t = txs if n == lt else txs[-n:]
#               print 'left pat=' , '[' + ellyWildcard.deconvert(p.left) + ']'
#               print 'versus t=' , t
                if not ellyWildcard.match(p.left,t,0):
#                   print 'no left match'
                    continue
                # a partial-context match must start at a token boundary:
                # reject when alphanumeric text continues just before the match
                if n < lt and ellyChar.isLetterOrDigit(t[0]):
                    if ellyChar.isLetterOrDigit(txs[-n-1]):
                        continue  # fail because of no break in text

#           nc = '\\n' if nxt == '\n' else nxt
#           print 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']'
#           print 'versus c=' , nc

            # then check the right-context part against the single char nxt

            if p.right == []:
                return True
            pcx = p.right[0]
            if pcx == nxt:                     # check for specific char after possible stop
#               print 'right=' , nxt
                return True
            if pcx == ellyWildcard.cCAN:       # check for nonalphanumeric
                if nxt == u'' or not ellyChar.isLetterOrDigit(nxt):
#                   print 'right nonalphanumeric=' , nxt
                    return True
            if pcx == ellyWildcard.cSPC:       # check for white space
#               print 'looking for space'
                if nxt == u'' or nxt == u' ' or nxt == u'\n':
#                   print 'right space'
                    return True
#           print 'last check'
            # NOTE(review): p.right is indexed as a sequence above (p.right[0]),
            # so comparing the whole p.right to u'.' may never be True unless
            # p.right can also be a plain string — confirm whether
            # `pcx == u'.'` was intended here
            if p.right == u'.':                # check for any punctuation
                if not ellyChar.isLetterOrDigit(nxt) and not ellyChar.isWhiteSpace(nxt):
#                   print 'right punc=' , nxt
                    return True

        return False
Пример #43
0
    def _store ( self , defs , nowarn ):

        """
        put macro substitutions into table with indexing by first char of pattern

        arguments:
            self   -
            defs   - list of macro definition as strings
            nowarn - whether to turn warnings off

        exceptions:
            TableFailure on error
        """

        while True:
            l = defs.readline()               # next macro rule
#           print "rule input=" , l
            if len(l) == 0: break             # EOF check
            dl = definitionLine.DefinitionLine(l,False)
            left = dl.left                    # pattern to be matched
            tail = dl.tail                    # transformation to apply to match
            if left == None or tail == None:
                self._err(l=l)                # malformed definition line
                continue
            mp = ellyWildcard.convert(left)   # encode wildcards in pattern
            if mp == None:
                self._err('bad wildcards',l)
                continue
            pe = mp[-1]
            if pe != ellyWildcard.cALL and pe != ellyWildcard.cEND:
                mp += ellyWildcard.cEND       # pattern must end in $ if it does not end in *
            # substitution may reference only bindings the pattern can produce
            if not _checkBindings(mp,tail):
                self._err('bad bindings in substitution',l)
                continue
            # warn (not fail) when substitution could grow the input
            if not nowarn and not _checkExpansion(mp,tail):
                self._err('substitution longer than original string',l,0)
            r = [ mp , tail ]
#           print "rule =" , [ left , tail ]
            pat = r[0]                        # get coded pattern
            if pat == None:
                self._err('no pattern',l)
                continue
            c = pat[0]                        # first char of pattern
                                              # check type to see how to index rule
#           print 'c=' , ord(c)
            p = pat
            # a rule whose pattern starts with optional sequences must be
            # indexed under each possible effective first char
            while c == ellyWildcard.cSOS:     # optional sequence?
                k = p.find(ellyWildcard.cEOS) # if so, find the end of sequence
                if k < 0 or k == 1: break     # if no end or empty sequence, stop
                k += 1
                # NOTE(review): the two references to pat below stay fixed while
                # p advances each iteration — confirm whether len(p) and p[1]
                # were intended for second and later optional sequences
                if k == len(pat): break       # should be something after sequence
                m = ellyChar.toIndex(pat[1])  # index by first char of optional sequence
                self.index[m].append(r)       #   (must be non-wildcard)
                p = p[k:]                     # move up in pattern
                c = p[0]                      #   but check for another optional sequence

            if c == ellyWildcard.cSOS:
                self._err(l=l)
                continue                      # bad sequence, skip this rule

#           print 'c=' , ord(c)
            # dispatch on effective first char: alphanumerics get their own
            # index slot, punctuation shares slot 0, wildcards go to the
            # anyWx / digWx / letWx overflow lists
            if ellyChar.isLetterOrDigit(c):   # check effective first char of pattern
                m = ellyChar.toIndex(c)
                self.index[m].append(r)       # add to index under alphanumeric char
            elif ellyChar.isText(c):
                self.index[0].append(r)       # add to index under punctuation
            elif not c in ellyWildcard.Matching:
                if c == ellyWildcard.cEND:
                    print >> sys.stderr , '** macro warning: pattern can have empty match'
                    print >> sys.stderr , '*  at [' , l , ']'
                else:
                    dc = '=' + str(ord(c) - ellyWildcard.X)
                    self._err('bad wildcard code' , dc)
                continue
            elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
                self.anyWx.append(r)          # under general wildcards
            elif c == ellyWildcard.cCAN:
                self.index[0].append(r)       # under punctuation
            elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
                self.digWx.append(r)          # under digit wildcards
            elif c == ellyWildcard.cSAN:
                self.digWx.append(r)          # under both digit and
                self.letWx.append(r)          #   letter wildcards
            elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
                self._err('bad wildcard in context',l)
                continue                      # wildcards unacceptable here
            else:
                self.letWx.append(r)          # everything else under letter wildcard

            self.count += 1                   # count up macro substitution

        # errors are accumulated by _err(); fail only after reading all rules
        if self._errcount > 0:
            print >> sys.stderr , '**' , self._errcount , 'macro errors in all'
            print >> sys.stderr , 'macro table definition FAILed'
            raise ellyException.TableFailure
Пример #44
0
    def _getRaw(self):
        """
        obtain next raw token from buffer

        arguments:
            self

        returns:
            EllyToken on success, None otherwise
        """

        #       print ( '_getRaw() from' , len(self.buffer) , 'chars' )
        #       print ( 'before skipping spaces, buffer=' , self.buffer )
        self.skipSpaces()
        ln = len(self.buffer)
        #       print ( "after skip=",ln )
        if ln == 0:
            return None               # nothing left to tokenize

        ## get length of next token and if it has
        ## initial - or +, check for word fragment

#       print ( 'buffer start=' , self.buffer[0] )

        k = 0  # number of chars for next token

        cz = ' ' if ln == 0 else self.buffer[0]
        if cz in [MIN, PLS]:
            # leading sign char: scan from next position for separator
            k = self.findSeparator(1)
        elif cz == APO:
            # apostrophe: take 's as a 2-char token when followed by a
            # separator, otherwise the apostrophe alone
            if ln > 2 and self.buffer[1].lower(
            ) == 's' and self.buffer[2] in separators:
                k = 2
            else:
                k = 1
        elif cz in [COM, DOT, UELP]:  # these can be tokens by themselves
            k = 1
        else:
            #           print ( 'full token extraction' )
            k = self.findSeparator()
            #           print ( 'k=' , k , 'ln=' , ln )
            if k < 0:  # break multi-char token at next separator
                k = ln  # if no separator, go up to end of buffer
            elif k == 0:
                k = 1  # immediate break in scanning
            else:
                # extend the token across hyphens or commas embedded in
                # digit strings (e.g. numbers like 1,000)
                while k < ln:  # look at any separator and following context
                    x = self.buffer[k]
                    if x != MIN and x != COM:
                        break  # no further check if separator not hyphen or comma
                    if k + 1 >= ln or not ellyChar.isDigit(self.buffer[k + 1]):
                        #                       print ( 'x=' , x , 'buf=' , self.buffer[k:] )
                        break  # accept hyphen or comma if NOT followed by digit
                    else:  # otherwise, look for another separator
                        k = self.findSeparator(k + 2)
                        if k < 0:  #
                            k = ln

        ## if token not delimited, take rest of buffer as
        ## will fit into token working area

        if k < 0: k = ln

        #       print ( "take",k,"chars from",len(self.buffer),self.buffer )

        buf = self.extract(k)  # get k characters

        ## special check for hyphen next in buffer after extraction

        if self.match(MIN):  # hyphen immediately following?
            self.skip()  # if so, take it
            if self.atSpace():  # when followed by space
                buf.append(MIN)  # append hyphen to candidate token
                k += 1
            else:
                # hyphen belongs to the NEXT token; restore buffer so the
                # hyphen (or double-hyphen dash) is re-read later with an
                # extra space inserted before it
                if not self.match(MIN):  # when not followed by another hyphen
                    self.prepend(ellyChar.SPC)  # put back a space
                else:
                    self.skip()  # double hyphen = dash
                    self.prepend(ellyChar.SPC)  # put back space after dash
                    self.prepend(MIN)  # put back second hyphen
                self.prepend(MIN)  # put back first
                self.prepend(
                    ellyChar.SPC)  # put extra space before hyphen or dash

        ## fill preallocated token for current position from working area

#       print ( "raw text buf=" , buf )

        to = ellyToken.EllyToken(''.join(buf))

        #       print ( "EllyBuffer token before=" , str(to) )

        ## strip off trailing non-token chars from token and put back in buffer

        km = k - 1
        while km > 0:
            x = buf[km]
            # stop stripping at alphanumeric or at chars that may legitimately
            # end a token (hyphen, plus, underscore per MIN/PLS/UNS)
            if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
                break
#           print ( 'trailing x=' , x )
            if x == APO or x == APX:
                # keep a trailing apostrophe after 's' (possessive form)
                if km > 0 and buf[km - 1] == 's':
                    break
            self.prepend(x)  # return stripped char to buffer for rescanning
            km -= 1
        km += 1
        if km < k:
            to.shortenBy(k - km, both=True)

#       print ( "EllyBuffer token=" , strx(to) )
#       print ( "next in buffer=" , self.buffer )
        return to
Пример #45
0
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
#       print ( "text to span:",text[offs:] )
#       print ( "pat rest=" , patn[mp:] )
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print ( "text at",offs,"maximum wildcard match=",mx )

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print ( "maximum wildcard span=",nm )

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs )
#           print ( 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')' )
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break

#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        if tc == cALL:      # a * wildcard?

#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO: # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print ( "EOS" )
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':               # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)      # maximum match possible

#               print ( 'spanning=' , nm )
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:               # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]          # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print ( "no unwinding" )
            break                   # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None     # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd             # consolidated bindings plus new offset
Пример #46
0
    def getFeatureSet ( self , fs , ty=False ):

        """
        get feature indices associated with given names in given set

        the first char of fs is taken as a feature-set ID and the rest as
        comma-separated feature names; an unknown ID creates a new name
        table preloaded with the reserved '*' names, and any unknown
        non-reserved name is assigned the next free bit index — so this
        method has the side effect of growing self.smindx or self.sxindx

        arguments:
            self  -
            fs    - feature set without enclosing brackets
            ty    - False=syntactic, True=semantic

        returns:
            list of EllyBits [ positive , negative ] on success, None on failure
        """

        if len(fs) < 1: return None
#       print ( 'fs=' , fs )

        bp = ellyBits.EllyBits(FMAX) # all feature bits zeroed
        bn = ellyBits.EllyBits(FMAX) #

        fsx = self.smindx if ty else self.sxindx  # pick semantic or syntactic index
#       print ( '--------  fs=' , fs )
        fid = fs[0]                  # feature set ID
        fnm = fs[1:].split(',')      # feature names
        if not fid in fsx:           # known ID?
#           print ( 'new feature set' )
            d = { }                  # new dictionary of feature names
            if ty:
                d['*c'] = 0          # always define '*c' as semantic  feature
                d['*capital'] = 0    # equivalent to '*c'
            else:
                d['*r'] = 0          # always define '*r' as syntactic feature
                d['*right'] = 0      # equivalent to '*r'
                d['*l'] = 1          # always define '*l'
                d['*left']  = 1      # equivalent to '*l'
                d['*x'] = LAST       # always define '*x'
                d['*u'] = LAST       # always define '*u'
                d['*unique'] = LAST  # equivalent to '*u' and '*x'
            fsx[fid] = d             # make new feature set known
        h = fsx[fid]                 # for hashing of feature names
        # note: str.split always returns at least one element, so this
        # check is defensive and cannot actually trigger
        if len(fnm) == 0:            # check for empty features
            return [ bp , bn ]
        for nm in fnm:
            nm = nm.strip()
            if len(nm) == 0: continue
            if nm[0] == '-':         # negative feature?
                b = bn               # if so, look at negative bits
                nm = nm[1:]
            elif nm[0] == '+':       # positive feature?
                b = bp               # if so, look at positive bits
                nm = nm[1:]
            else:
                b = bp               # positive bits by default

#           print ( '--------  nm=' , nm )
            nmc = nm if nm[0] != '*' else nm[1:]  # strip '*' before validating chars
            for c in nmc:            # check feature name chars
                if not ellyChar.isLetterOrDigit(c):
                    print ( 'bad feature name=' , nm , file=sys.stderr )
                    return None
            if not nm in h:          # new name in feature set?
                if nm[0] == '*':     # user cannot define reserved name
                    print ( 'unknown reserved feature=' , nm , file=sys.stderr )
                    return None
#               print ( 'define new feature' )
                k = len(h)           # yes, this will be next free index
                l = FMAX             # upper limit on feature index
                if ty:               # semantic feature?
                    k -= 1           # if so, adjust for extra name *C
                else:
                    k -= 5           # else,  adjust for *UNIQUE and extra names *L, *R , *U , *X
                    l -= 1           #        adjust upper limit for *UNIQUE
                if k == l:           # overflow check
                    print ( '** ERROR: too many feature names, fid=',fid,'nm=',nm , file=sys.stderr )
                    print ( '**' , end=' ' , file=sys.stderr )
                    print ( h.keys() , file=sys.stderr )
                    return None
                if k < 0:            # index went below zero: malformed set
                    print ( 'bad index=' , k , 'l=' , l , file=sys.stderr )
                    return None
                h[nm] = k            # define new feature
#               print ( 'k=' , k )

#           print ( 'set bit' , h[nm] , 'for' , fid + nm )
            b.set(h[nm])             # set bit for feature
        return [ bp , bn ]
Пример #47
0
    def match(self, segm, tree):
        """
        compare text segment against all FSA patterns from state 0

        the FSA is run nondeterministically: when several transitions are
        possible from a state, the untried alternatives are pushed on a
        stack together with the current match status so they can be
        resumed later; every final (negative next-state) transition adds
        a literal phrase to the parse tree, and the length of the longest
        successful final match is returned

        arguments:
            self  -
            segm  - segment to match against
            tree  - parse tree in which to put leaf nodes for final matches

        returns:
            text length matched by FSA
        """

        #       print 'comparing' , segm

        if len(self.indx) == 0: return 0  # no matches if FSA is empty

        if len(segm) == 0: return 0  # string is empty

        lim = bound(segm)  # get text limit for matching

        mtl = 0  # accumulated match length
        mtls = 0  # saved final match length

        state = 0  # set to mandatory initial state for FSA

        stk = []  # for tracking possible multiple matches

        ls = self.indx[state]  # for state 0!
        ix = 0  # index into current possible transitions
        sg = segm[:lim]  # text subsegment for matching
        #       print 'initial sg=' , sg
        #       print len(ls) , 'transitions from state 0'
        # capitalization of first char, passed along to any phrase created
        capd = False if len(sg) == 0 else ellyChar.isUpperCaseLetter(sg[0])

        while True:  # run FSA to find all possible matches
            #           print 'state=' , state
            #           print 'count=' , mtl , 'matched so far'
            #           print 'links=' , len(ls) , 'ix=' , ix
            nls = len(ls)  # how many links from current state

            if ix == nls:  # if none, then must back up
                if len(stk) == 0: break
                r = stk.pop()  # restore match status
                #               print 'pop r= [' , r[0] , r[1][0].shortcode() , ']'
                state = r[0]  # FSA state
                ls = r[1]  # remaining links to check
                sg = r[2]  # input string
                mtl = r[3]  # total match length
                ix = 0
                #               print 'pop sg=' , sg
                continue

#           print 'substring to match, sg=' , sg , 'nls=' , nls
            m = 0
            while ix < nls:
                lk = ls[ix]  # get next link at current state
                ix += 1  # and increment link index
                #               print '@' , state , 'lk= [' , unicode(lk), ']' , 'ix=' , ix
                #               print 'patn=' , lk.patn
                po = lk.patn[0]
                if po == u'\x00':  # do state change without matching?
                    m = 0  # no match length
                elif po != ellyWildcard.cEND:
                    #                   print 'po=' , po
                    bds = ellyWildcard.match(lk.patn, sg)
                    #                   print 'bds=' , bds
                    if bds == None: continue
                    m = bds[0]  # get match length, ignore wildcard bindings
                elif (len(sg) > 0 and (ellyChar.isLetterOrDigit(sg[0])
                                       or sg[0] == ellyChar.PRME)):
                    #                   print 'unmatched solitary $'
                    continue
                else:
                    #                   print 'matched solitary $, state=' , state
                    m = 0

#               print 'm=' , m

                if lk.nxts < 0:  # final state?
                    if lk.nxts == -2: m = 0  # last part of match not counted
                    #                   print 'state=' , state , unicode(lk)
                    #                   print 'flags=' , lk.synf , '/' , lk.semf
                    if tree.addLiteralPhraseWithSemantics(
                            lk.catg, lk.synf, lk.semf, lk.bias,
                            cap=capd):  # make phrase
                        ml = mtl + m
                        if mtls < ml: mtls = ml  # keep longest final match
                        #                       print 'success!' , 'mtls=' , mtls
                        tree.lastph.lens = mtls  # save its length
#                       print 'match state=' , state , 'length=' , mtls

#               print 'ix=' , ix , 'nls=' , nls
                if ix < nls:  # any links not yet checked?
                    r = [state, ls[ix:], sg, mtl]
                    #                   print 'saved r= ' , state ,
                    #                   print [ x.shortcode() for x in ls[ix:] ]
                    stk.append(r)  # if not, save info for later continuation

                mtl += m  # update match length
                break  # leave loop at this state, go to next state
            else:
                #               print 'no matches'
                continue  # all patterns exhausted for state

            ix = 0
            sg = sg[m:]  # move up in text input
            state = lk.nxts  # next state
            if state < 0:
                ls = []  # final state has no outgoing links
            else:
                ls = self.indx[state]
#           print 'sg=' , sg
#           print 'state=' , state
#           print 'len(ls)=' , len(ls)

#       print 'mtls=' , mtls
        return mtls
Пример #48
0
    def rewrite ( self , ts ):

        """
        check for time expression at current text position and rewrite
        if found

        on success the matched substring at the front of ts is replaced
        in place by a normalized 24-hour form HH:MM:SS plus a time zone
        string, and self._rwl records the length of the replacement

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            True on any rewriting, False otherwise
        """

        lts = len(ts)
        if lts < Lm: return False  # Lm = minimum length for a match (defined elsewhere)

        tz = self._tz      # default

        self._xm = ''      # default

        self._m = u'00'    # defaults
        self._s = u'00'

        c = ts[0]          # first char
        if not ellyChar.isDigit(c):
            return False   # time can never start with a letter
                           # because of number transforms

        # NOTE(review): _matchN presumably parses the numeric HH[:MM[:SS]]
        # part and sets self._hr/_m/_s — defined elsewhere; confirm there
        k = self._matchN(ts)
#       print 'match numeric=' , k

        if k == 0: return False

# #     print 'ts[k:]=' , ts[k:]
        k += self._findAMorPM(ts[k:])  # extend match over AM/PM marker, sets self._xm
#       print 'AM or PM k=' , k

#       print 'hour=' , self._hr
        if   self._xm == 'p' and self._hr <  12: # convert to 24-hour time
            self._hr += 12
        elif self._xm == 'a' and self._hr == 12: #
            self._hr = 0
#       print 'hour=' , self._hr

        t = ts[k:]                 # remainder of text
#       print 'rest t=' , t
        dk = 0                     # skip count
        ns = 0                     # space count
        if len(t) > 0:             # look for time zone
            if t[0] == ' ':        # skip any initial space
                dk += 1
                ns = 1
#           print 't[dk:]=' , t[dk:] , 'dk=' , dk
            dk += self.get(t[dk:]) # extract next token from input
            ss = self.string       #
#           print 'zone=' , ss
            if ss in Zn:           # match to known time zone?
                tz = ss
            elif ns == 0 and ss == u'z': # military ZULU time
                tz = u'gmt'        # translate
            else:
                dk = 0             # no match
        k += dk                    # update match count
        t = t[dk:]                 # advance scan

#       print 't=' , t
        # reject if the match runs straight into an alphanumeric char
        if len(t) > 0 and ellyChar.isLetterOrDigit(t[0]): return False

        for _ in range(k):         # strip matched substring to be rewritten
            ts.pop(0)

        r  = str(self._hr).zfill(2) + u':' + self._m + u':' + self._s + tz
        rr = r[::-1]               # insert chars in reverse so ts starts with r
        for c in rr:               # do rewriting
            ts.insert(0,c)
        self._rwl = len(r)         # remember rewritten length
        return True
Пример #49
0
    def match(self, txt, pnc, ctx):
        """
        compare a punctuation mark and its context with a pattern

        arguments:
            self  -
            txt   - list of text chars leading up to punctuation char
            pnc   - punctuation char
            ctx   - next chars after punctuation

        returns:
            True on match, False otherwise
        """

        #       print ( 'matching for txt=' , txt , 'pnc= [' , pnc , ' ] ctx=' , ctx )

        if matchtoo(txt, pnc, ctx):  # exception by complex match?
            return True
#       print ( 'matchtoo() returned False' )

        sep = ctx[0] if len(ctx) > 0 else ''
        if sep == ellyChar.THS:
            return True
        nxt = ctx[1] if len(ctx) > 1 else ''

        #       print ( 'lstg=' , self.lstg.keys() )
        if not pnc in self.lstg:  # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

        #       print ( len(lp) , 'patterns' )

        ltx = len(txt)  # current length of accumulated text so far
        ntr = 1
        while ntr <= ltx:
            if not ellyChar.isLetterOrDigit(txt[-ntr]):
                break
            ntr += 1
        nrg = ntr
        ntr -= 1  # available trailing chars for  wildcard * match

        while nrg <= ltx:
            c = txt[-nrg]
            if not ellyChar.isLetterOrDigit(
                    c) and not ellyChar.isEmbeddedCombining(c):
                #               print ( 'break at nrg=' , nrg , txt[-nrg] )
                break
            nrg += 1
        nrg -= 1  # end of range for all pattern matching

        #       print ( 'ntr=' , ntr , 'nrg=' , nrg )

        txt = txt[-nrg:]  # reset text to limit for matching
        ltx = len(txt)  # its new length

        #       print ( 'txt= ' + str(txt) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']' )

        for p in lp:  # try matching each listed exception pattern

            if p.left != None and len(p.left) > 0:

                pat = p.left
                star = pat[-1] == ellyWildcard.cALL
                n = len(
                    pat)  # it each pattern element matches one sequence char
                if star:  # except for a final wildcard *
                    #                   print ( 'pattern ending with *' )
                    n -= 1
                    #                   print ( 'ltx=' , ltx , 'n=' , n )
                    if ltx < n:
                        continue  # cannot match pattern properly
                    pat = pat[:-1]
                    t = txt[:n]
                else:
                    if ltx < n:
                        continue  # cannot match pattern properly
                    t = txt[-n:]

                if not ellyWildcard.match(pat, t, 0):
                    #                   print ( 'no possible pattern match' )
                    continue

                k = ltx - n  # extra chars beyond any match
                #               print ( 'k=' , k , 't=' , t )
                #               print ( 'txt=' , txt )
                #               print ( 'pat=' , '[' + ellyWildcard.deconvert(pat) + ']' )
                #               print ( 'matches' , n , 'chars' )
                if not star and k > 0:
                    #                   print ( 'check text before [' , txt[-n] , ']' )
                    if ellyChar.isLetterOrDigit(txt[-n]):
                        c = txt[-n - 1]
                        #                       print ( 'preceding= [', c , ']' )
                        if ellyChar.isLetterOrDigit(c) or c == '&':
                            continue  # because break in text is required

#           print ( 'pat=' , ellyWildcard.deconvert(p.left) )
#           print ( 'n=' , n , 'ltx=' , ltx )
#           print ( 'txt=' , txt )

#           nc = '\\n' if nxt == '\n' else nxt
#           print ( 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']' )
#           print ( 'versus c=' , nc )

            rp = p.right
            if rp == [] or rp[0] == ellyWildcard.cALL:
                return True
            pcx = rp[0]
            if pcx == nxt:  # check for specific char after possible stop )
                #               print ( 'right=' , nxt )
                return True
            elif pcx == ellyWildcard.cALF:  # check for alphabetic
                if ellyChar.isLetter(nxt):
                    #                   print ( 'right is alphabetic=' , nxt )
                    return True
            elif pcx == ellyWildcard.cDIG:  # check for numeric
                if ellyChar.isDigit(nxt):
                    #                   print ( 'right is numeric=' , nxt 0
                    return True
            elif pcx == ellyWildcard.cUPR:  # check for upper case
                if ellyChar.isUpperCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cLWR:  # check for lower case
                if ellyChar.isLowerCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cCAN:  # check for non-alphanumeric
                if ellyChar.isLetter(nxt):
                    #                   print ( 'right is alphabetic=' , nxt )
                    return True

#       print ( "no matches" )
        return False
Пример #50
0
    def getNext ( self ):

        """
        extract next sentence for Elly translation from input stream

        chars are accumulated until stop punctuation with suitable
        context, a newline, or EOF is seen; bracketing (quotes and
        parentheses) is tracked so that stops inside brackets do not end
        the sentence, and stop-punctuation exceptions (self.stpx) can
        suppress a break

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

#       print 'getNext'

        self.resetBracketing()

        sent = [ ]         # list buffer to fill

        x  = self.inp.read()
        if x == SP:        # skip one leading space
            x = self.inp.read()

        if x == END:       # EOF check
            return None

        c  = END           # reset current and previous char trackers
        lc = END

#       print 'x=' , '<' + x + '>' , ord(x)
        self.inp.unread(x,SP)       # put first char back to restore input
#       print '0  <<" , self.inp.buf

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0                     # alphanumeric count in sentence

        while True:

            x = self.inp.read()     # next input char

            if x == END:            # handle any EOF
                break

#           print 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>'
#           print 'sent=' , sent

            # check for table delimiters in text

            if len(sent) == 0:
#               print 'table'
#               print '1  <<' , self.inp.buf

                if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                    while True:                 # scan up to end of current buffering
                        y = self.inp.read()     #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break               #
                    continue                    # ignore everything seen so far

            #########################################
            # accumulate chars and count alphanumeric
            #########################################

            lc = c
            c  = x
            nc = self.inp.peek()    # one char of lookahead for quote normalization
            if ellyChar.isWhiteSpace(nc): nc = SP

#           print 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>'
            if lc == SP or lc == END: # normalize chars for proper bracketing
                if x == SQuo:         #
                    x = LSQm          # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:       #
                    x = LDQm          # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END: #
                if x == SQuo:         # a SQuo followed by a space becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by a space becomes RDQm
                    x = RDQm          #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:         # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm          #

            inBrkt = self.checkBracketing(x)    # do bracket checking with modified chars

#           print 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt

            sent.append(c)                      # but buffer original chars
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue                        # if alphanumeric, just add to sentence

            if c == SP:
                continue                        # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()                      # remove from sentence chars
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            z = self.inp.peek()  # for context of match call

#           print '0  <<' , self.inp.buf

#           print 'sent=' , sent[:-1]
#           print 'punc=' , '<' + c + '>'
#           print 'next=' , '<' + z + '>'
            if c in Stops and self.stpx.match(sent[:-1],c,z):
#               print 'exception MATCH'
                if self.drop:
                    sent.pop()   # remove punctuation char from sentence
                    lc = SP
                continue

#           print 'no stop exception MATCH for' , c

#           print '@1  <<' , self.inp.buf

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c,self.inp)

#           print '@2  <<' , self.inp.buf

            # check for dash

            if c == u'-':
                d = self.inp.read()
                if d == u'-':       # collapse a run of dashes to '--'
#                   print 'dash'
                    while True:
                        d = self.inp.read()
                        if d != u'-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print '@3  c=' , c

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

#               print 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent)

                if not inBrkt:
#                   print sent , 'so far'
                    z = self.inp.read()
                    if self.shortBracketing(sent,z):
                        break
                    self.inp.unread(z)
#                   print 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']'
                    if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                        if nAN > 1:
                            break
                continue

            elif not c in Stops or inBrkt:
                continue            # non-stop punctuation or still inside brackets

            else:
#               print 'check stopping!'
                d = self.inp.read()  # lookahead after the stop char
#               print '@3  <<' , self.inp.buf

                if d == None: d = u'!'
#               print 'stop=' , '<' + c + '> <' + d + '>'

#               print 'ellipsis check'
                if c == u'.' and c == d:
                    if self.inp.peek() != c: # look for third '.' in ellipsis
                        self.inp.unread(c)   # if none, keep only first '.'
                    else:
                        self.inp.skip()      # found ellipsis
                        sent.append(d)       # complete it in sentence buffer
                        sent.append(d)       #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(SP)  # if part of token, put in space as separator
                    continue

                # special check for multiple stops

#               print 'next char d=' , d , ord(d) if d != END else 'NONE'
                if d in Stops:
                    while True:              # consume the whole run of stops
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP               # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent,d): break
                    self.inp.unread(d)
#                   print 'no space after punc'
                    continue                 # no break without following whitespace

                # if no match for lookahead, put back

                elif d != END:
#                   print 'unread d=' , d
                    self.inp.unread(d)

                # final check: is sentence long enough?

#               print '@4  <<' , self.inp.buf
                cx = self.inp.peek()
                if cx == None: cx = u'!!'
#               print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
                if nAN > 1 and not inBrkt:   # need >1 alphanumeric outside brackets
                    break

        # NOTE(review): self.last presumably tracks the previous char read
        # at EOF — defined elsewhere; confirm against class constructor
        if len(sent) > 0 or self.last != END:
            return sent
        else:
            return None
Пример #51
0
    def _store(self, defs, nowarn):
        """
        put macro substitutions into table with indexing by first char of pattern

        arguments:
            self   -
            defs   - macro definitions as strings (readline() interface)
            nowarn - whether to turn expansion warnings off

        exceptions:
            TableFailure on error
        """

        while True:
            l = defs.readline()  # next macro rule
            if len(l) == 0: break  # EOF check
            dl = definitionLine.DefinitionLine(l, False)
            left = dl.left  # pattern to be matched
            tail = dl.tail  # transformation to apply to match
            if left is None or tail is None:
                self._err(l=l)  # report missing part of rule
                continue
            if left.find(' ') >= 0:  # pattern side of macro rule
                ms = 'pattern in macro contains spaces'
                self._err(s=ms, l=l, d=1)  # cannot contain any space chars
                continue

            lefts = list(left)
            nspm = ellyWildcard.numSpaces(lefts)
            pat = ellyWildcard.convert(
                left)  # get pattern with encoded wildcards
            if pat is None:
                self._err('bad wildcards', l)
                continue
            pe = pat[-1]
            if pe not in (
                    ellyWildcard.cALL, ellyWildcard.cEND, ellyWildcard.cSPC
            ):
                pat += ellyWildcard.cEND  # pattern must end in $ if it does not end in * or _
            if not _checkBindings(pat, tail):
                self._err('bad bindings in substitution', l)
                continue
            if not nowarn and not _checkExpansion(pat, tail):
                # non-fatal warning: the rule is still stored
                self._err('substitution may be longer than original string', l,
                          0)

            r = Rule(pat, nspm, tail)

            c = pat[0]  # first char of pattern
            # check type to see how to index rule
            p = pat
            while c == ellyWildcard.cSOS:  # optional sequence?
                if cEOS not in p:
                    break
                k = p.index(cEOS)  # if so, find the end of sequence
                if k == 1: break  # empty sequence, stop
                k += 1
                if k == len(pat): break  # should be something after sequence
                m = ellyChar.toIndex(
                    pat[1])  # index by first char of optional sequence
                self.index[m].append(r)  #   (must be non-wildcard)
                p = p[k:]  # move up in pattern
                c = p[0]  #   but check for another optional sequence

            if c == ellyWildcard.cSOS:
                self._err(l=l)
                continue  # bad sequence, skip this rule

            if ellyChar.isLetterOrDigit(
                    c):  # check effective first char of pattern
                m = ellyChar.toIndex(c)
                self.index[m].append(r)  # add to index under alphanumeric char
            elif ellyChar.isText(c):
                self.index[0].append(r)  # add to index under punctuation
            elif c not in ellyWildcard.Matching:
                if c == ellyWildcard.cEND:
                    print('** macro warning: pattern can have empty match',
                          file=sys.stderr)
                    print('*  at [', l, ']', file=sys.stderr)
                else:
                    dc = '=' + str(ord(c) - ellyWildcard.X)
                    self._err('bad wildcard code', dc)
                continue
            elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
                self.anyWx.append(r)  # under general wildcards
            elif c == ellyWildcard.cCAN:
                self.index[0].append(r)  # under punctuation
            elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
                self.digWx.append(r)  # under digit wildcards
            elif c == ellyWildcard.cSAN:
                self.digWx.append(r)  # under both digit and
                self.letWx.append(r)  #   letter wildcards
            elif c == ellyWildcard.cAPO:  # right single quote or apostrophe
                self.apoWx.append(r)  #
            elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
                self._err('bad wildcard in context', l)
                continue  # wildcards unacceptable here
            else:
                self.letWx.append(r)  # everything else under letter wildcard

            self.count += 1  # count up macro substitution

        if self._errcount > 0:
            print(self._errcount, 'macro errors in all', file=sys.stderr)
            print('macro table definition FAILed', file=sys.stderr)
            raise ellyException.TableFailure
Пример #52
0
    def _store ( self , defs , nowarn ):

        """
        put macro substitutions into table with indexing by first char of pattern

        (Python 2 variant: uses 'print >>' statements for stderr output)

        arguments:
            self   -
            defs   - list of macro definition as strings
            nowarn - whether to turn warnings off

        exceptions:
            TableFailure on error
        """

#       print defs.linecount() , 'lines'
        while True:
            l = defs.readline()               # next macro rule
#           print "rule input=" , l
            if len(l) == 0: break             # EOF check
            dl = definitionLine.DefinitionLine(l,False)
            left = dl.left                    # pattern to be matched
            tail = dl.tail                    # transformation to apply to match
#           print 'dl.left=' , left
            if left == None or tail == None:
                self._err(l=l)                # report missing part of rule
                continue
            if left.find(' ') >= 0:           # pattern side of macro rule
                ms = 'pattern in macro contains spaces'
                self._err(s=ms,l=l,d=1)       # cannot contain any space chars
                continue

            lefts = list(left)
#           print 'left=' , lefts
            nspm = ellyWildcard.numSpaces(lefts)
            pat = ellyWildcard.convert(left)  # get pattern with encoded wildcards
            if pat == None:
                self._err('bad wildcards',l)
                continue
#           print 'pat=' , ellyWildcard.deconvert(pat) , 'len=' , len(pat)
#           print 'pat=' , list(pat)
            pe = pat[-1]
            if not pe in [ ellyWildcard.cALL , ellyWildcard.cEND , ellyWildcard.cSPC ]:
                pat += ellyWildcard.cEND      # pattern must end in $ if it does not end in * or _
            if not _checkBindings(pat,tail):
                self._err('bad bindings in substitution',l)
                continue
            # non-fatal warning: the rule is still stored after reporting
            if not nowarn and not _checkExpansion(pat,tail):
                self._err('substitution may be longer than original string',l,0)

#           print "rule =" , [ left , nspm , tail ]
            # NOTE(review): pat cannot be None here (checked above and only
            #   extended since); this guard appears to be dead code
            if pat == None:
                self._err('no pattern',l)
                continue

            r = Rule( pat , nspm , tail )

            c = pat[0]                        # first char of pattern
                                              # check type to see how to index rule
#           print 'c=' , ellyWildcard.deconvert(c) , ', pat=' , ellyWildcard.deconvert(pat)
            p = pat
            while c == ellyWildcard.cSOS:     # optional sequence?
                if not cEOS in p:
                    break
                # NOTE(review): list.index() never returns a negative value,
                #   so the k < 0 test below can never fire
                k = p.index(cEOS)             # if so, find the end of sequence
                if k < 0 or k == 1: break     # if no end or empty sequence, stop
                k += 1
                if k == len(pat): break       # should be something after sequence
                m = ellyChar.toIndex(pat[1])  # index by first char of optional sequence
                self.index[m].append(r)       #   (must be non-wildcard)
                p = p[k:]                     # move up in pattern
                c = p[0]                      #   but check for another optional sequence

            if c == ellyWildcard.cSOS:
                self._err(l=l)
                continue                      # bad sequence, skip this rule

#           print 'c=' , ord(c)
            if ellyChar.isLetterOrDigit(c):   # check effective first char of pattern
                m = ellyChar.toIndex(c)
                self.index[m].append(r)       # add to index under alphanumeric char
            elif ellyChar.isText(c):
                self.index[0].append(r)       # add to index under punctuation
            elif not c in ellyWildcard.Matching:
                if c == ellyWildcard.cEND:
                    print >> sys.stderr , '** macro warning: pattern can have empty match'
                    print >> sys.stderr , '*  at [' , l , ']'
                else:
                    dc = '=' + str(ord(c) - ellyWildcard.X)
                    self._err('bad wildcard code' , dc)
                continue
            elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
                self.anyWx.append(r)          # under general wildcards
            elif c == ellyWildcard.cCAN:
                self.index[0].append(r)       # under punctuation
            elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
                self.digWx.append(r)          # under digit wildcards
            elif c == ellyWildcard.cSAN:
                self.digWx.append(r)          # under both digit and
                self.letWx.append(r)          #   letter wildcards
            elif c == ellyWildcard.cAPO:      # right single quote or apostrophe
                self.apoWx.append(r)          # under apostrophe wildcard
            elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
                self._err('bad wildcard in context',l)
                continue                      # wildcards unacceptable here
            else:
                self.letWx.append(r)          # everything else under letter wildcard

            self.count += 1                   # count up macro substitution
#           print 'count=' , self.count

        if self._errcount > 0:
            print >> sys.stderr , '**' , self._errcount , 'macro errors in all'
            print >> sys.stderr , 'macro table definition FAILed'
            raise ellyException.TableFailure
Пример #53
0
        [ '-' , "'" , ' ' ] ,
        [ ':' , '-' , ')' ] ,       # emoticon
        [ 'x' , 'x' , 'x' , 'x' , '.' , ' ' , 'Y' ] ,
        [ ' ' , '.' , ' ' ] ,
        [ ' ' , 'm', '.' , ' ' , 'm' , 'o' , 'r' , 'r' , 'e' , 'l' , SQW , 's' , ' ' , 's' , 'a' , 'l' ]
    ]

    # unit-test driver (Python 2): extra test cases may come from the command line
    nlu = len(sys.argv) - 2
    if nlu > 0:                     # add to test cases?
        for a in sys.argv[2:]:
            test.append(list(a.decode('utf8')))
        print 'added' , nlu , 'test case' + ('' if nlu == 1 else 's')
    else:
        print 'no added test cases'
    print '--------'
    print len(test) , 'cases in all'

    for ts in test:
        ku = 0
        lu = len(ts)
        for cu in ts:             # scan input line
            ku += 1
            if cu in stpx.lstg:   # find first candidate stop
                if ku == lu or not ellyChar.isLetterOrDigit(ts[ku]):
                    break         # must not be followed by letter or digit
        else:
            continue              # no candidate stop found; skip this case silently
        # NOTE(review): the Python 3 variant of this driver passes ts[ku:] (a slice)
        #   as the third argument — confirm whether a single char is intended here
        res = stpx.match( ts[:ku-1] , ts[ku-1] , ts[ku] )
        print '[ ' + ''.join(ts) + ' ]' ,
        print 'stop EXCEPTION' if res else 'sentence stopped'
Пример #54
0
    def _getRaw ( self ):

        """
        obtain next raw token from buffer

        handles special leading chars (hyphen, plus, period), splits at
        separators, and pushes trailing non-token chars back into the buffer

        arguments:
            self

        returns:
            EllyToken on success, None otherwise
        """

        self.skipSpaces()
#       print "|",len(self.buffer)
        ln = len(self.buffer)
#       print "|",len(self.buffer)
        if ln == 0:
            return None
#       print "proceed"

        ## get length of next token and if it has
        ## initial - or +, check for word fragment

        k = 0                   # number of chars for next token

        if self.match(MIN):     # check for hyphen
            if self.match(DSH): # it is a dash when doubled
                k = 2
            else:
                k = self.find(separators,1)
        elif self.match(PLS):   # check for elly prefix
            k = self.find(separators,1)
        elif self.match(DOT):   # check for period
            if self.match(ELP): # it is ellipsis when tripled
                k = 3
            else:
                k = 1
        elif not ellyChar.isCombining(self.buffer[0]):
            k = 1               # if next char cannot start a token, take it as a token
        else:
            k = self.find(separators)
            if k < 0:           # break a token at next separator
                k = ln
            while k < ln:       # look at separator if it exists
                x = self.buffer[k]
                if x != MIN and x != COM:
                    break       # a hyphen or comma is not absolute break
                # NOTE(review): if the hyphen/comma is the very last buffered
                #   char (k == ln-1), buffer[k+1] raises IndexError — confirm
                #   that a separator can never end the buffer here
                if not ellyChar.isDigit(self.buffer[k+1]):
                    break       # accept hyphen or comma if NOT followed by digit
                else:           # otherwise, look for another separator
                    k = self.find(separators,k+2)
                    if k < 0:
                        k = ln

        ## if token not delimited, take rest of buffer as
        ## will fit into token working area

        if k < 0: k = ln

#       print "take",k,"chars from",len(self.buffer),self.buffer

        buf = self.extract(k) # get k characters

        ## special check for - next in buffer after extraction

        if self.match(MIN):                    # hyphen immediately following?
            self.skip()                        # if so, take it
            if self.atSpace():                 # when followed by space
                buf.append(MIN)                # append hyphen to candidate token
                k += 1
            else:
                if not self.match(MIN):        # when not followed by another hyphen
                    self.prepend(ellyChar.SPC) # put back a space
                else:
                    self.skip()                # double hyphen = dash
                    self.prepend(ellyChar.SPC) # put back space after dash
                    self.prepend(MIN)          # put back second hyphen
                self.prepend(MIN)              # put back first
                self.prepend(ellyChar.SPC)     # put extra space before hyphen or dash

        ## fill preallocated token for current position from working area

#       print "raw text for token:" , '[' + u''.join(buf).encode('utf8') + ']'
        to = ellyToken.EllyToken(u''.join(buf))

        ## strip off trailing non-token chars from token and put back in buffer

        km = k - 1
        while km > 0:
            x = buf[km]
            if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
                break
            # keep a possessive apostrophe ("...s'") attached to the token
            if x == APO and km > 0 and buf[km - 1] == 's':
                break
            self.prepend(x)
            km -= 1
        km += 1
        if km < k:
            to.shortenBy(k - km,both=True)

        return to
Пример #55
0
    def _aDay(self, ts):
        """
        parse a day number

        accepts 1- or 2-digit days (<= 31), optional day ranges like "3-4"
        or "3-14", and ordinal suffixes (st/nd/rd/th); the matched digits
        are accumulated in self._dy

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched
        """

        #       print 'aDay', ts

        if len(ts) == 0:
            return 0

        k = 0  # running match count
        x = ts[0]
        y = ''
        # a spelled-out number may be rewritten in place as digits first
        if not ellyChar.isDigit(x):
            if not self.rewriteNumber(ts):
                return 0
            else:
                x = ts[0]

#       print 'rewritten ts=' , ts

        ls = len(ts)
        if ls == 1:
            if x == '0': return 0  # cannot have 0 as day
            self._dy.append(x)  # accept at end of input as possible date
            return 1
        elif not ellyChar.isDigit(ts[1]):
            k = 1
        elif x > '3':  # reject first digit bigger than '3'
            return 0
        else:
            y = x  # save first digit
            x = ts[1]  # this will be second digit
            if y == '3' and x > '1':  # reject day > 31
                return 0
            k = 2

        ls -= k
        if k == 2:
            self._dy.append(y)
        self._dy.append(x)
        if ls == 0:
            return k

        z = ts[k]
        if ellyChar.isDigit(z):
            return 0  # reject 3-digit day

        if z == '.' and ls > 1 and ellyChar.isDigit(ts[k + 1]):
            return 0  # reject digit after decimal point

        # allow a day range like "3-4" or "3-14" immediately after the day
        if ls >= 2:  # at least 2 chars to check after day number
            if z == u'-':
                #               print 'hypen ls=' , ls , 'k=' , k
                if ellyChar.isDigit(ts[k + 1]):  # hyphen, digit match
                    #                   print 'digit=' , ts[k+1]
                    self._dy.append(z)
                    self._dy.append(ts[k + 1])
                    if ls == 2:  # only 2 chars to check?
                        k += 2  # add hyphen, digit to day
                    elif ls == 3:  # only 3 chars to check?
                        #                       print 'ts[k]=' , ts[k:]
                        if not ellyChar.isLetterOrDigit(ts[k + 2]):  #
                            k += 2  # add hyphen, digit to day
                        elif ellyChar.isDigit(
                                ts[k + 2]):  # found second digit to add?
                            self._dy.append(ts[k +
                                               2])  # if so, add to day string
                            k += 3
                    elif not ellyChar.isLetterOrDigit(
                            ts[k + 2]):  # more than 3 chars to check?
                        k += 2  # if not, we are done
                    elif ellyChar.isDigit(ts[k + 2]):  # check for second digit
                        #                       print 'k=' , k
                        if ls > 3 and ellyChar.isDigit(ts[k + 3]):
                            return 0
                        if ts[k + 1] > '3':  # check for valid day
                            return 0
                        if ts[k + 1] == '3' and ts[k + 2] > '1':
                            return 0
                        self._dy.append(ts[k + 2])
                        k += 3
                    else:
                        return 0  # no other hyphen allowed in day
                else:
                    return 0  #

        t = ts[k:]
        #       print 'k=' , k , 't=' , t
        if len(t) == 0 or not ellyChar.isLetterOrDigit(t[0]):
            return k

        if ellyChar.isDigit(t[0]) or len(t) < 2:
            return 0
        sx = t[0].lower() + t[1].lower()

        #       print 'y=' , y , 'x=' , x , 'sx=' , sx

        # validate ordinal suffix (st/nd/rd/th) against last digit of day,
        # with the 11th/12th/13th exception handled via the tens digit y
        if x == '1':
            #           print 'end of day=' , y
            if y == '1':
                if sx != 'th': return 0
            elif sx != 'st': return 0
        elif x == '2':
            if sx != 'nd': return 0
        elif x == '3':
            if sx != 'rd': return 0
        else:
            #           print 'default ordinal indicator'
            if sx != 'th': return 0

#       print 'ord k=' , k
        t = t[2:]
        k += 2

        #       print 'k=' , k , 'len=' , len(ts)

        if len(ts) == k:  # check next char in stream
            return k  # if none, match succeeds
        elif ellyChar.isLetterOrDigit(ts[k]):
            #           print 'ts[k]=' , ts[k] , k
            return 0  # otherwise, match fails if next char is alphanumeric
        else:
            #           print 'return k=' , k
            return k  # otherwise succeed
Пример #56
0
    def _matchN(self, ts):
        """
        apply logic for numeric-only date recognition (e.g. 9/11, 9-11-2001,
        2001/9/11)

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched
        """

        lts = len(ts)
        if lts < Lm: return 0  # shortest date is 0/0
        if not ellyChar.isDigit(ts[0]): return 0

        n = Ln  # never scan past the longest possible date
        if n > lts: n = lts

        ss = []  # substring to compare
        ns = 0  # slash count

        k = 0
        while k < n:  # collect digits plus separators
            c = ts[k]
            if c == '/':
                ns += 1
            elif c == '-':
                ns += 1
                c = '/'  # normalize hyphen to slash
            elif not ellyChar.isDigit(c):
                break
            ss.append(c)
            k += 1

        if k < Lm: return 0  # too short to be a date
        if ns != 1 and ns != 2: return 0  # must be m/d or m/d/y

        if k < lts and ellyChar.isLetterOrDigit(ts[k]):
            return 0  # must not run into alphanumeric text

        dt = ''.join(ss).split('/')

        dt0 = dt.pop(0)  # get first two date components
        dt1 = dt.pop(0)  #

        if len(dt0) == 4 or dt0[0] == '0':  # leading year, e.g. 2001/9/11?
            if ns == 1: return 0  # year form needs all three components
            dt.append(dt0)  # put first component at end if it looks like year
            dt0 = dt1  # move month up
            dt1 = dt.pop(0)  # move day up
            # FIX: was dt.pop(), which popped the just-appended YEAR instead
            # of the day, corrupting YYYY/M/D dates

        if dt0 == '': return 0  # guard empty component (e.g. "01//9")
        m = int(dt0)
        if m < 1 or m > 12: return 0  # check validity of month
        if dt1 == '': return 0
        try:
            d = int(dt1)
        except ValueError:
            return 0
        if d < 1 or d > 31: return 0  # check validity of day
        if ns == 2:
            y = dt.pop(0)  # if there is a year, process it also
            ly = len(y)
            if ly == 4:  # 4-digit year?
                s = y[0]
                if s != '1' and s != '2': return 0
                yls = list(y)
            elif ly == 2:  # 2-digit year: infer century from current year
                ix = 0 if y > self.ycur else 1
                yls = list(self.cent[ix] + y)
            else:
                return 0  # fail on any other number of year digits

            self._yr = yls  # handle year

        self._mo = list(dt0.zfill(2))  # handle month
        self._dy = list(dt1.zfill(2))  # handle day
        return k
Пример #57
0
    def _extractToken(self, mnl):
        """
        extract next token from input buffer and look up in grammar table

        arguments:
            self  -
            mnl   - minimum length for any previous match

        returns:
            ellyToken on success, otherwise None

        exceptions:
            ParseOverflow
        """

        d = self.rul  # grammar rule definitions

        tree = self.ptr  # parse tree
        buff = self.sbu  # input source

        try:
            w = buff.getNext()  # extract next token
            ws = ''.join(w.root)
        except ellyException.StemmingError:
            sys.exit(1)  # fatal: cannot continue without a token

        wcapzn = w.isCapitalized()
        wsplit = w.isSplit()

        found = False  # FIX: must be initialized here; it was previously
        #   referenced while unbound whenever wl < mnl (or wl == mnl with
        #   no dictionary hit), raising UnboundLocalError

        wl = len(ws)
        if wl > mnl:
            found = self._simpleTableLookUp(ws, tree, wsplit, wcapzn) > 0

        if wl >= mnl:
            if ws in self.rul.gtb.dctn:  # look up internally
                if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn):
                    found = True

        if found:  # if any success, we are done
            return w
        if mnl > 0:
            return None  # defer to previous lookup

        dvdd = False
        if d.man.analyze(w):  # any morphological analysis possible?
            root = ''.join(w.root)  # if so, get parts of analysis
            tan = w.pres + [root] + w.sufs
            if len(w.sufs) > 0:
                sx = w.sufs[-1]
                dvdd = not ellyChar.isApostrophe(sx[1])
            while len(tan) > 0:  # and put back into input
                x = tan.pop()
                buff.prepend(x)
                buff.prepend(' ')
            w = buff.getNext()  # get token again with stemming and macros

            ws = ''.join(w.root)

            if ws[-1] == '+':  # suffix pattern?
                m = d.ptb.match(w.root, tree)
                if m > 0:
                    tree.lastph.bias = 2
                    found = True

            if len(ws) < mnl: return None  # too short for external lookup
            if self._simpleTableLookUp(ws, tree, False,
                                       wcapzn):  # external lookup
                found = True

            if ws in self.rul.gtb.dctn:  # internal lookup?
                if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn):
                    found = True

        if found:  # if any success, we are done
            w.dvdd = dvdd
            return w

        lws = len(ws)
        if lws > 1:  # special handling of + or -
            if ws[0] == '+' and ws[-1] != '+':
                # marks root with prefixes removed
                if self._simpleTableLookUp(ws[1:], tree) > 0:
                    return w
            if ws[0] == '-':
                w.shortenBy(lws - 1)  # -X not recognized as suffix;
                cn = buff.peek()  #   try processing - separately
                if ellyChar.isLetterOrDigit(cn):
                    buff.prepend(' ')
                buff.prepend(ws[1:])  # put back X for further analysis

        if self.pnc.match(w.root):  # check if next token is punctuation
            if tree.addLiteralPhrase(self.pnc.catg, self.pnc.synf):
                tree.lastph.lens = w.getLength()
                tree.lastph.krnl.semf.combine(self.pnc.semf)
        else:
            tree.createUnknownPhrase(w)  # unknown type as last resort
            tree.lastph.lens = len(ws)

        return w
Пример #58
0
        list('XXX: Boo'),
        list('S.A.F. \u201cA'),
        list('2002. \u201cA'),
        list('2:45 a.m. Friday')
    ]

    # unit-test driver (Python 3): extra test cases may come from the command line
    nlu = len(sys.argv) - 2
    if nlu > 0:  # add to test cases?
        for a in sys.argv[2:]:  # get commandline args to test
            test.append(list(a))
        print('added', nlu, 'test case' + ('' if nlu == 1 else 's'))
    else:
        print('no added test cases')
    print('--------')
    print(len(test), 'cases in all')

    for ts in test:
        ku = 0
        lu = len(ts)
        for cu in ts:  # scan input line
            ku += 1
            if cu in stpx.lstg:  # find first candidate stop
                if ku == lu or not ellyChar.isLetterOrDigit(ts[ku]):
                    break  # must not be followed by letter or digit
        else:
            print(ts, 'SKIPPED')  # no candidate stop found in this case
            continue
        # context before the stop, the stop char itself, and everything after
        res = stpx.match(ts[:ku - 1], ts[ku - 1], ts[ku:])
        print('[ ' + ''.join(ts) + ' ] @', ku - 1, end=' ')
        print('stop EXCEPTION' if res else 'sentence will stop')