Example #1
0
def matchtoo(txt, pnc, ctx):
    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    #   print ( 'nomatch() ln=' , ln , txt )
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
#   print ( 'check' , txt[-3:] )
    if not txt[-1] in ['M', 'm'] or txt[-2] != '.' or not txt[-3] in [
            'P', 'p', 'A', 'a'
    ] or txt[-4] != ' ':
        return False
    ch = txt[-5]
    #   print ( 'ch=' , ch )
    if ellyChar.isDigit(ch):  # only 1 digit will be checked here!
        #       print ( 'ONE DIGIT' )
        return True  # erring on the side of not breaking the sentence
    elif not ellyChar.isLetter(ch):
        return False

#
#   the following code is needed only when number transforms are turned off
#

    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

#   print ( 'nn=' , nn )
    if nn < 3 or nn > 6:
        return False
    elif nn > ln:
        if not txt[-nn] in [' ', '-']:
            return False
    wd = ''.join(txt[:-nn]).lower()

    #   print ( 'wd=' , wd )
    if wd in [
            'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
            'nine', 'ten', 'eleven', 'twelve'
    ]:
        if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
            return False
        else:
            return True
    else:
        return False
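A minimal usage sketch with hypothetical data, assuming matchtoo() and its ellyChar dependency are available in the enclosing module; it only illustrates the char-list calling convention described in the docstring.

# hypothetical inputs shaped as the docstring describes
txt = list('arrive at 5 P.M')    # chars leading up to, but not including, the period
pnc = '.'                        # the punctuation char being classified
ctx = [' ', 't', 'o']            # chars in context after the period
# matchtoo(txt, pnc, ctx) should return True here: the period closes 'P.M.'
# after a digit and is followed by white space, so it is not a sentence break.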
Example #2
0
def matchtoo ( txt , pnc , ctx ):

    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
#   print 'nomatch() ln=' , ln , txt
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
#   print 'check' , txt[-3:]
    if not txt[-1] in ['M','m'] or txt[-2] != '.' or not txt[-3] in ['P','p','A','a'] or txt[-4] != ' ':
        return False
    ch = txt[-5]
#   print 'ch=' , ch
    if ellyChar.isDigit(ch):        # only 1 digit will be checked here!
#       print 'ONE DIGIT'
        return True                 # erring on the side of not breaking the sentence
    elif not ellyChar.isLetter(ch):
        return False

#
#   the following code is needed only when number transforms are turned off
#

    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

#   print 'nn=' , nn
    if nn < 3 or nn > 6:
        return False
    elif nn > ln:
        if not txt[-nn] in [ ' ' , '-' ]:
            return False
    wd = ''.join(txt[:-nn]).lower()

#   print 'wd=' , wd
    if wd in [ 'one' , 'two' , 'three' , 'four' , 'five' , 'six' , 'seven' ,
               'eight' , 'nine' , 'ten' , 'eleven' , 'twelve' ]:
        if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
            return False
        else:
            return True
    else:
        return False
Example #3
0
    def _find ( self , cmpo , smpl=True ):

        """
        lookup method with recursion

        arguments:
            self  -
            cmpo  - simple or compound component
            smpl  - simple flag

        returns:
            -1 or -2 if not found, component type code >= 0 otherwise
        """

#       print '_find:' , cmpo
        lcmp = len(cmpo)
        if lcmp == 0:
            return NON
        if cmpo in self.dictn:               # full name component known?
            return self.dictn[cmpo]
        if lcmp == 1:
            return INI if ellyChar.isLetter(cmpo[0]) else NON

        if cmpo[-1] == '.':                  # component ends in '.'?
            if lcmp == 2:
                if ellyChar.isLetter(cmpo[0]):
                    return INI
            return NON

        if smpl and lcmp > 4:                # check component by parts?
            pre = cmpo[:2]
            suf = cmpo[-2:]
#           print 'pre=' , pre , 'suf=' , suf
            if pre in self.pres:             # if not known, check for prefix match
                for p in self.pres[pre]:
                    x = p[0]
                    n = len(x)
#                   print 'recursion=' , p[2]
                    if (n < lcmp and
                        cmpo[:n] == x):      # prefix match found?
                        if not p[2] or self._find(cmpo[n:]) > 0:
                            return p[1]
            elif suf in self.posts:          # last resort is check for suffix match
                for p in self.posts[suf]:
                    x = p[0]
                    n = len(x)
#                   print 'recursion=' , p[2]
                    if (n < lcmp and
                        cmpo[-n:] == x):     # suffix match found?
                        if not p[2] or self._find(cmpo[:-n]) > 0:
                            return p[1]
        return NON
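The lookup tables consulted above are built by the name-table constructor shown later in example #34; the sketch below only illustrates their apparent shapes, with placeholder strings and type codes.

# hypothetical contents for self.dictn, self.pres, and self.posts
dictn = { 'donald' : 4 }                     # full component -> type code
pres  = { 'mc' : [ [ 'mc' , 4 , True ] ] }   # 2-char key -> [prefix, code, recurse?]
posts = { 'on' : [ [ 'son' , 4 , True ] ] }  # 2-char key -> [suffix, code, recurse?]
# with tables like these, _find('mcdonald') would hit the 'mc' prefix entry
# and, because its flag is True, also require the recursive _find('donald')
# to succeed before returning the entry's type code.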
Example #4
0
    def _find(self, cmpo, smpl=True):
        """
        lookup method with recursion

        arguments:
            self  -
            cmpo  - simple or compound component
            smpl  - simple flag

        returns:
            -1 or -2 if not found, component type code >= 0 otherwise
        """

        #       print '_find:' , cmpo
        lcmp = len(cmpo)
        if lcmp == 0:
            return NON
        if cmpo in self.dictn:  # full name component known?
            return self.dictn[cmpo]
        if lcmp == 1:
            return INI if ellyChar.isLetter(cmpo[0]) else NON

        if cmpo[-1] == '.':  # component ends in '.'?
            if lcmp == 2:
                if ellyChar.isLetter(cmpo[0]):
                    return INI
            return NON

        if smpl and lcmp > 4:  # check component by parts?
            pre = cmpo[:2]
            suf = cmpo[-2:]
            #           print 'pre=' , pre , 'suf=' , suf
            if pre in self.pres:  # if not known, check for prefix match
                for p in self.pres[pre]:
                    x = p[0]
                    n = len(x)
                    #                   print 'recursion=' , p[2]
                    if (n < lcmp and cmpo[:n] == x):  # prefix match found?
                        if not p[2] or self._find(cmpo[n:]) > 0:
                            return p[1]
            elif suf in self.posts:  # last resort is check for suffix match
                for p in self.posts[suf]:
                    x = p[0]
                    n = len(x)
                    #                   print 'recursion=' , p[2]
                    if (n < lcmp and cmpo[-n:] == x):  # suffix match found?
                        if not p[2] or self._find(cmpo[:-n]) > 0:
                            return p[1]
        return NON
Example #5
0
def _planAhead ( buf ):

    """
    check for possible problems in the next scan while context
    is still available and set flags if needed

    arguments:
        buf  - buffer to be scanned
    """

    global _toscan

    nsk = 0                     # total skip count
    lb = len(buf)
    if lb > 4:
        if buf[0] == '(':       # skip initial '('
            nsk += 1
            buf = buf[1:]
        if buf[0] == '"':       # skip initial '"'
            nsk += 1
            buf = buf[1:]
        lb -= nsk

    nix = 0                    # scan count
    if lb > 8:
        for chx in buf:        # go to first non-letter
            if not ellyChar.isLetter(chx):
                if ellyChar.isWhiteSpace(chx):
                    break      # must be space
                return
            nix += 1

        sst = ''.join(buf[:nix]).lower()
        if not sst in _det:
            return            # must find determiner

        nix += 1              # skip space
        if ellyChar.isUpperCaseLetter(buf[nix]):
            nix += 1          # skip first letter
            buf = buf[nix:]
            for ch in buf:    # go to next non-letter
                if not ellyChar.isLetter(ch):
                    if ellyChar.isWhiteSpace(ch):
                        break
                    return
                nix += 1

            _toscan = lb + nsk - nix
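A hedged illustration of the pattern being looked for; whether 'the' is actually in _det is an assumption, since that table is defined elsewhere in the module.

# hypothetical buffer: determiner followed by a capitalized word
buf = list('the Boston office will close')
# _planAhead(buf) returns nothing; if 'the' is in _det, it sets the module
# global _toscan to the char count remaining after 'Boston', and scan()
# (example #31) then declines to start a personal-name match until the
# buffer has been consumed past that capitalized word.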
Example #6
0
def _planAhead ( buf ):

    """
    check for possible problems in the next scan while context
    is still available and set flags if needed

    arguments:
        buf  - buffer to be scanned
    """

    global _toscan

    nsk = 0                     # total skip count
    lb = len(buf)
    if lb > 4:
        if buf[0] == '(':       # skip initial '('
            nsk += 1
            buf = buf[1:]
        if buf[0] == '"':       # skip initial '"'
            nsk += 1
            buf = buf[1:]
        lb -= nsk

    nix = 0                    # scan count
    if lb > 8:
        for chx in buf:        # go to first non-letter
            if not ellyChar.isLetter(chx):
                if ellyChar.isWhiteSpace(chx):
                    break      # must be space
                return
            nix += 1

        sst = ''.join(buf[:nix]).lower()
        if not sst in _det:
            return            # must find determiner

        nix += 1              # skip space
        if ellyChar.isUpperCaseLetter(buf[nix]):
            nix += 1          # skip first letter
            buf = buf[nix:]
            for ch in buf:    # go to next non-letter
                if not ellyChar.isLetter(ch):
                    if ellyChar.isWhiteSpace(ch):
                        break
                    return
                nix += 1

            _toscan = lb + nsk - nix
Example #7
0
def acronym ( buffr ):

    """
    recognize parenthesized introduction of acronym in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Lmax: lb = Lmax
    if lb < Lmin or buffr[0] != '(': return 0

    nu = 0          # uppercase count
    ib = 1
    while ib < lb:
        bc = buffr[ib]
        ib += 1
        if bc == ')':
            break
        if not ellyChar.isLetter(bc): return 0
        if ellyChar.isUpperCaseLetter(bc): nu += 1
    else:
        return 0    # must have enclosing ')'

    if ib < Lmin or ib - 2*nu > 0: return 0
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]): return 0

    return ib
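A small worked example, assuming the module constants Lmin and Lmax (not shown in this excerpt) admit a six-char match.

# hypothetical buffer starting with a parenthesized acronym
buffr = list('(NASA) was founded in 1958')
# acronym(buffr) should return 6, the count through the closing ')';
# '(NaSa)' would fail the uppercase-count test (ib - 2*nu > 0), and
# '(NASA)x' would fail because a letter or digit follows the ')'.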
Example #8
0
    def getRules ( self , a ):

        """
        get appropriate macros for text with specified starting char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of unpacked macro rules to try out
        """

#       print 'getRules(a=' , a , ')'
        if a == '': return [ ]
        if ellyChar.isLetterOrDigit(a):
            k = ellyChar.toIndex(a)
            ls = self.index[k]
#           print 'index a=' , a , 'k=' , k
            ws = self.letWx if ellyChar.isLetter(a) else self.digWx
            uniqueAdd(ls,ws)
            uniqueAdd(ls,self.anyWx)
        elif ellyChar.isApostrophe(a):
            ls = self.apoWx
        else:
            ls = self.index[0]
            uniqueAdd(ls,self.anyWx)
#       print len(ls) , ' rules to check'
        return [ r.unpack() for r in ls ]
Example #9
0
def normalize ( s ):

    """
    convert all non-ASCII nonalphanumeric in sequence to _ and 
    consecutive white spaces to a single space char

    arguments:
        s   - input sequence to operate on
    """

    spaced = False
    k = 0
    n = len(s)
    for i in range(n):
        x = s[i]
        if ellyChar.isLetter(x):
            spaced = False
        elif ellyChar.isWhiteSpace(x):
            if spaced: continue
            x = ' '
            spaced = True
        elif ord(x) > 127:
            x = '_'
            spaced = False
        else:
            spaced = False
        s[k] = x
        k += 1
    del s[k:]               # truncate in place; nothing is returned
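A brief sketch of the in-place effect on a char list; how ellyChar classifies any particular non-ASCII char is an assumption here.

# hypothetical char list with a doubled space and a non-ASCII bullet char
s = list('price \u2022 3  units')
# after normalize(s), the bullet (ord > 127 and presumably not an ellyChar
# letter) becomes '_' and the doubled space collapses, so the normalized
# content reads 'price _ 3 units'; ASCII digits and punctuation pass through.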
Example #10
0
    def normalize(self, s):
        """
        overrides method in parent class to convert all letters to _
        and to eliminate any white space

        arguments:
            self -
            s    - Unicode string or char list to operate on
        returns:
            normalized sequence
        """

        #       print 'ZH normalize'
        n = len(s)
        ns = []
        for i in range(n):
            x = s[i]
            #           print '     x=' , x
            if ellyChar.isLetter(x):
                x = '_'
            elif ellyChar.isWhiteSpace(x):
                continue
#           print 'norm x=' , x
            ns.append(x)
#       print 'norm=' , ns
        return ns
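A quick check of the override's effect, using only ASCII input so the ellyChar classifications are uncontroversial; 'zh' below stands for a hypothetical instance of this class.

out = zh.normalize(list('ab 12'))   # 'zh' is a hypothetical instance
# expected: ['_', '_', '1', '2'] - both letters are masked as '_', the
# space is dropped, and the digits pass through untouched.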
Example #11
0
def normalize ( s ):

    """
    convert all unrecognizable input chars to _ and any
    consecutive white spaces to a single space

    arguments:
        s   - Unicode string or char list to operate on
    returns:
        normalized sequence
    """

    spaced = False
    n = len(s)
    ns = [ ]
    for i in range(n):
        x = s[i]
        if ellyChar.isLetter(x):
            spaced = False
        elif ellyChar.isWhiteSpace(x):
            if spaced: continue
            x = ' '
            spaced = True
        elif not ellyChar.isText(x):
            x = '_'
            spaced = False
        else:
            spaced = False
        ns.append(x)
    return ns
Example #12
0
    def __init__ ( self , symtb , defr ):

        """
        initialization

        arguments:
            self  -
            symtb - symbol table for interpreting syntax
            defr  - definition input string
        """

        self._errcount = 0
#       print ( 'defr=' , defr )
        ru = defr.split(' : ')
        if len(ru) != 2:
            self._err('incomplete template',defr)
            return
        [ elems , defns ] = ru
        rw = elems.split(' ')
        if len(rw) < 2:
            self._err('trivial template',defr)
            return
        le = [ ]
        for w in rw:
#           print ( 'w=' , w )
            x = w.strip()
            lx = len(x)
            if lx == 0:
                self._err('null template element',defr)
                return
            if x[0] == '%':
                if lx > 1 and ellyChar.isLetter(x[1]):
                    if lx > 2:
                        if x[1] != '*':
                            self._err('bad class ID',defr)
                            return
                    x = x.lower()
            le.append(x)
        if self._errcount > 0: return
        self.listing = le

        de = defns.split(' ')
        lde = len(de)
        if lde != 1 and lde != 3:
            self._err('bad template definition',defr)
            return
        syns = de[0]
        sems = de[1] if lde > 1 else None
        try:
            spec = syntaxSpecification.SyntaxSpecification(symtb,syns)
            semf = featureSpecification.FeatureSpecification(symtb,sems,True)
        except ellyException.FormatFailure:
            self._err('bad definition' , defr)
            return

        self.lstg = le
        self.catg = spec.catg
        self.synf = spec.synf.positive
        self.semf = semf.positive
        self.bias = int(de[2]) if lde > 1 else 0
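A sketch of how the definition string is split up; the field values below are placeholders, since the actual specification syntax is handled by SyntaxSpecification and FeatureSpecification, which are not shown here.

# hypothetical layout only; real templates come from definition files
defr = '%a %b : SYNTAX SEMANTICS 0'
elems , defns = defr.split(' : ')   # left: template elements, right: definition
# the left side needs at least two space-separated elements; an element
# starting with '%' must be '%' plus a single letter and is lowercased.
# the right side must split into one field (syntax only) or three fields,
# with the third parsed by int() as the bias.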
Example #13
0
    def __init__ ( self , symtb , defr ):

        """
        initialization

        arguments:
            self  -
            symtb - symbol table for interpreting syntax
            defr  - definition input string
        """

        self._errcount = 0
#       print 'defr=' , defr
        ru = defr.split(' : ')
        if len(ru) != 2:
            self._err('incomplete template',defr)
            return
        [ elems , defns ] = ru
        rw = elems.split(' ')
        if len(rw) < 2:
            self._err('trivial template',defr)
            return
        le = [ ]
        for w in rw:
#           print 'w=' , w
            x = w.strip()
            lx = len(x)
            if lx == 0:
                self._err('null template element',defr)
                return
            if x[0] == '%':
                if lx > 1 and ellyChar.isLetter(x[1]):
                    if lx > 2:
                        if x[1] != '*':
                            self._err('bad class ID',defr)
                            return
                    x = x.lower()
            le.append(x)
        if self._errcount > 0: return
        self.listing = le

        de = defns.split(' ')
        lde = len(de)
        if lde != 1 and lde != 3:
            self._err('bad template definition',defr)
            return
        syns = de[0]
        sems = de[1] if lde > 1 else None
        try:
            spec = syntaxSpecification.SyntaxSpecification(symtb,syns)
            semf = featureSpecification.FeatureSpecification(symtb,sems,True)
        except ellyException.FormatFailure:
            self._err('bad definition' , defr)
            return

        self.lstg = le
        self.catg = spec.catg
        self.synf = spec.synf.positive
        self.semf = semf.positive
        self.bias = int(de[2]) if lde > 1 else 0
Example #14
0
def acronym ( buffr ):

    """
    recognize parenthesized introduction of acronym in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Lmax: lb = Lmax
    if lb < Lmin or buffr[0] != '(': return 0

    nu = 0          # uppercase count
    ib = 1
    while ib < lb:
        bc = buffr[ib]
        ib += 1
        if bc == ')':
            break
        if not ellyChar.isLetter(bc): return 0
        if ellyChar.isUpperCaseLetter(bc): nu += 1
    else:
        return 0    # must have enclosing ')'

    if ib < Lmin or ib - 2*nu > 0: return 0
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]): return 0

    return ib
Example #15
0
    def getRules(self, a):
        """
        get appropriate macros for text with specified starting char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of unpacked macro rules to try out
        """

        #       print ( 'getRules(a=' , a , ')' )
        if a == '': return []
        if ellyChar.isLetterOrDigit(a):
            k = ellyChar.toIndex(a)
            ls = self.index[k]
            #           print ( 'index a=' , a , 'k=' , k )
            ws = self.letWx if ellyChar.isLetter(a) else self.digWx
            uniqueAdd(ls, ws)
            uniqueAdd(ls, self.anyWx)
        elif ellyChar.isApostrophe(a):
            ls = self.apoWx
        else:
            ls = self.index[0]
            uniqueAdd(ls, self.anyWx)
#       print ( len(ls) , ' rules to check' )
        return [r.unpack() for r in ls]
Example #16
0
    def normalize(self, s):
        """
        convert all unrecognizable input chars to _ and any
        consecutive white spaces to a single space

        arguments:
            self -
            s    - Unicode string or char list to operate on
        returns:
            normalized sequence
        """

        #       print ( '__ normalize' )
        spaced = False
        n = len(s)
        ns = []
        for i in range(n):
            x = s[i]
            if ellyChar.isLetter(x):
                spaced = False
            elif ellyChar.isWhiteSpace(x):
                if spaced: continue
                x = ' '
                spaced = True
            elif not ellyChar.isText(x):
                x = '_'
                spaced = False
            else:
                spaced = False
            ns.append(x)
        return ns
Example #17
0
    def divide ( self , word ):

        """
        apply inflectional analysis, including for -'s and -s'

        arguments:
            self  -
            word  - ellyToken

        exceptions:
            StemmingError
        """

#       print "divide" , word
        wl = word.getLength()       # if so, is word long enough to be divided
        if wl < 3:
            return                  # if not, done

        if word.isAffix():          # if term is already product of division, stop
            return

#       print 'word suffixes=' , word.sufs , word.dvdd

        x = word.charAt(wl-1)       # get last two chars of word
        y = word.charAt(wl-2)
#       print 'word= ...' , y , x

        if   x == ESS and (
             y == APO or y == APX   # check for -'S
        ):
#           print "-'s ending"
            word.shortenBy(2)
            self.putSuffixBack(SFX)
#           print 'word=' , word , 'without -\'S'
            return

        elif y == ESS and (
             x == APO or x == APX   # check for implied -'S
        ):
#           print "-s' ending"
            word.shortenBy(1)
            self.putSuffixBack(SFX)
#           print 'word=' , word , 'without -\''
            return

        if ellyChar.isLetter(word.charAt(0)):
#           print 'apply stemmer'
            self.stemmer.apply(word)  # apply any inflectional stemmer
#           print 'word= ' , word
            if word.isSplit():
#               print 'is split'
                sufs = word.getSuffixes()
#               print 'pres=' , word.getPrefixes()
#               print 'sufs=' , sufs
                while len(sufs) > 0:
                    self.putSuffixBack(sufs.pop())
Example #18
0
    def divide(self, word):
        """
        apply inflectional analysis, including for -'s and -s'

        arguments:
            self  -
            word  - ellyToken

        exceptions:
            StemmingError
        """

        #       print ( "divide" , word )
        wl = word.getLength()  # if so, is word long enough to be divided
        if wl < 3:
            return  # if not, done

        if word.isAffix():  # if term is already product of division, stop
            return

#       print ( 'word suffixes=' , word.sufs , word.dvdd )

        x = word.charAt(wl - 1)  # get last two chars of word
        y = word.charAt(wl - 2)
        #       print ( 'word= ...' , y , x )

        if x == ESS and (y == APO or y == APX  # check for -'S
                         ):
            #           print ( "-'s ending" )
            word.shortenBy(2)
            self.putSuffixBack(SFX)
            #           print ( 'word=' , word , 'without -\'S' )
            return

        elif y == ESS and (x == APO or x == APX  # check for implied -'S
                           ):
            #           print ( "-s' ending" )
            word.shortenBy(1)
            self.putSuffixBack(SFX)
            #           print ( 'word=' , word , 'without -\'' )
            return

        if ellyChar.isLetter(word.charAt(0)):
            #           print ( 'apply stemmer' )
            self.stemmer.apply(word)  # apply any inflectional stemmer
            #           print ( 'word= ' , word )
            if word.isSplit():
                #               print ( 'is split' )
                sufs = word.getSuffixes()
                #               print ( 'pres=' , word.getPrefixes() )
                #               print ( 'sufs=' , sufs )
                while len(sufs) > 0:
                    self.putSuffixBack(sufs.pop())
Example #19
0
def isNewRule(s):
    """
    check for start of new rule or procedure in processing input lines

    arguments:
        s   - input line as string

    returns:
        True if new rule or procedure, False otherwise
    """

    return (len(s) > 2 and ellyChar.isLetter(s[0]) and s[1] == ':')
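A few hedged spot checks, assuming this function and ellyChar are importable in the current scope.

for s in ( 'g:np->np pp' , ':xx' , 'g:' ):
    print( s , isNewRule(s) )   # expected: True, then False, then False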
Example #20
0
def isNewRule ( s ):

    """
    check for start of new rule or procedure in processing input lines

    arguments:
        s   - input line as string

    returns:
        True if new rule or procedure, False otherwise
    """

    return (len(s) > 2 and ellyChar.isLetter(s[0]) and s[1] == ':')
Example #21
0
def alphc ( s ):
    """
    check that all chars are letters
    arguments:
        s   - list of chars
    returns:
        True if string is all alphabetic, False otherwise
    """
#   print 'alphc s=' , s
    if len(s) == 0: return False
    for c in s:
#       print 'c=' , c , type(c) , len(c)
        if not ellyChar.isLetter(c): return False
    return True
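Hedged spot checks, again assuming alphc() and ellyChar are in scope.

print( alphc(list('abc')) )   # expected True  - all letters
print( alphc(list('ab1')) )   # expected False - '1' is not a letter
print( alphc([]) )            # expected False - empty input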
Example #22
0
def alphc ( s ):
    """
    check that all chars are letters
    arguments:
        s   - list of chars
    returns:
        True if string is all alphabetic, False otherwise
    """
#   print ( 'alphc s=' , s )
    if len(s) == 0: return False
    for c in s:
#       print ( 'c=' , c , type(c) , len(c) )
        if not ellyChar.isLetter(c): return False
    return True
Example #23
0
def _scan ( buffr ):
    """
    count chars to first non-alphabetic char
    arguments:
        buffr - list of chars
    returns:
        number of letters
    """
    n = 0
    ln = len(buffr)
    while n < ln:
        if not ellyChar.isLetter(buffr[n]):
            break
        n += 1
    return n
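Two hedged spot checks, assuming _scan() and ellyChar are in scope.

print( _scan(list('hello world')) )   # expected 5 - stops at the space
print( _scan(list('123abc')) )        # expected 0 - first char is not a letter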
Example #24
0
def _scan ( buffr ):
    """
    count chars to first non-alphabetic char
    arguments:
        buffr - list of chars
    returns:
        number of letters
    """
    n = 0
    ln = len(buffr)
    while n < ln:
        if not ellyChar.isLetter(buffr[n]):
            break
        n += 1
    return n
Example #25
0
    def divide ( self , word ):

        """
        apply inflectional analysis, including for -'s and -s'

        arguments:
            self  -
            word  - ellyToken

        exceptions:
            StemmingError
        """

#       print "divide" , word
        wl = word.getLength()       # if so, is word long enough to be divided
        if wl < 3:
            return                  # if not, done

        if word.isAffix():          # if term is already product of division, stop
            return

        x = word.charAt(wl-1)       # get last two chars of word
        y = word.charAt(wl-2)
#       print 'word= ...' , y , x

        if x == u's' and y == APO:  # check for -S'
            word.addSuffix(APO+ESS)
            word.shortenBy(2)
#           print 'word=' , word
            return

        elif x == APO and y == ESS: # check for implied -'S
            word.addSuffix(APO+ESS)
            word.shortenBy(1)
#           print 'word=' , word
            return

        if ellyChar.isLetter(word.charAt(0)):
            self.stemmer.apply(word)  # apply any inflectional stemmer
            if word.isSplit():
                sufs = word.getSuffixes()
#               print 'sufs=' , sufs
                while len(sufs) > 0:
                    if self.atToken(): self.prepend(ellyChar.SPC)
                    self.prepend(sufs.pop())
Example #26
0
def timePeriod ( buffr ):

    """
    recognize time period in a day

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

#   print ( 'buffr=' , buffr )
    ln = len(buffr)
    if ln == 0 or not ellyChar.isLetter(buffr[0]):
        return 0
    k = _scan(buffr)
#   print ( '0 k=' , k )
    if k == ln or buffr[k] != ' ':
        return 0
    a = ''.join(buffr[:k]).lower()
#   print ( 'a=' , a )
    if a in _modifier:
        n = k + 1
        buffr = buffr[n:]
    else:
        n = 0
    k = _scan(buffr)
#   print ( '1 k=' , k )
    if k < 6:
        return 0
    b = ''.join(buffr[:k]).lower()
    if not b in _day:
        return 0
    buffr = buffr[k:]
    n += k
    if len(buffr) < 5 or buffr[0] != ' ':
        return n if n > k else 0
    m = _scan(buffr[1:])
#   print ( '2 m=' , m )
    c = ''.join(buffr[1:m+1]).lower()
    if c in _period:
        return n + m + 1
    else:
        return n if n > k else 0
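An illustrative call; whether 'early', 'monday', and 'morning' are actually in _modifier, _day, and _period is assumed, since those tables are defined elsewhere in the module.

# hypothetical buffer starting with modifier + day + period-of-day words
buffr = list('early monday morning the market opened')
# under the stated assumptions, timePeriod(buffr) returns 20, the length of
# 'early monday morning'; an unrecognized day word gives 0, and a bare day
# name with neither modifier nor following period word is also rejected.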
Example #27
0
def timePeriod ( buffr ):

    """
    recognize time period in a day

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

#   print 'buffr=' , buffr
    ln = len(buffr)
    if ln == 0 or not ellyChar.isLetter(buffr[0]):
        return 0
    k = _scan(buffr)
#   print '0 k=' , k
    if k == ln or buffr[k] != ' ':
        return 0
    a = u''.join(buffr[:k]).lower()
#   print 'a=' , a
    if a in _modifier:
        n = k + 1
        buffr = buffr[n:]
    else:
        n = 0
    k = _scan(buffr)
#   print '1 k=' , k
    if k < 6:
        return 0
    b = u''.join(buffr[:k]).lower()
    if not b in _day:
        return 0
    buffr = buffr[k:]
    n += k
    if len(buffr) < 5 or buffr[0] != ' ':
        return n if n > k else 0
    m = _scan(buffr[1:])
#   print '2 m=' , m
    c = u''.join(buffr[1:m+1]).lower()
    if c in _period:
        return n + m + 1
    else:
        return n if n > k else 0
Example #28
0
    def _enstrg(self):
        """
        get chars from auxiliary buffer

        arguments:
            self

        returns:
            chars joined into a string
        """

        self.deleteCharsFromBuffer(100)
        chi = list(self.getDeletion())
        cho = []
        for ch in chi:
            if ellyChar.isLetter(ch):
                cho.append(ellyChar.Unmapping[ellyChar.Mapping[ord(ch)]])
        return "".join(cho)
Example #29
0
    def getRules ( self , a ):

        """
        get appropriate macros for text starting with specified first char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of macro rules to try out
        """

        if a == '': return [ ]
        if ellyChar.isLetterOrDigit(a):
            k = ellyChar.toIndex(a)
            ws = self.letWx if ellyChar.isLetter(a) else self.digWx
            ls = self.index[k] + ws + self.anyWx
        else:
            ls = self.index[0] + self.anyWx
        return ls
Example #30
0
    def build(self, inp):
        """
        build tree logic from definition reader input

        arguments:
            self  -
            inp   - definition text for logic

        exceptions:
            TableFailure on error
        """

        if inp == None:
            return

        nerr = 0  # error count

        # read in affixes and associated actions

        while True:

            line = inp.readline()  # next input line
            if line == '':  # check for EOF
                break

            modf = ''
            elem = line.strip().lower().split(' ')
            #           print ( 'elem=' , elem )
            le = len(elem)
            if le < 4:
                nerr += 1
                print("** affix error: incomplete input", file=sys.stderr)
                print("*  at: [", line, "]", file=sys.stderr)
                continue  # skip incomplete line
            if le > 4:  # affix mod specified?
                modf = elem.pop()  # if so, get it
#               print ( elem[0] , modf )
            do = elem.pop()  # note main action

            # get affix within definition line

            aff = list(elem.pop(0))  # affix as list of chars

            # check for proper form

            aff = self.sequence(aff)  # backward or forward  matching?
            #           print ( 'aff=' , aff )

            c = aff[0]  # get first char to compare with
            aff = aff[1:]

            if (not ellyChar.isLetter(c)
                    and c != '+'):  # affix must start with letter or '+'
                nerr += 1
                print("** affix error: must start with letter or '+'",
                      file=sys.stderr)
                print("*  at: [", line, "]", file=sys.stderr)
                continue  # ignore line

            if not c in self.indx:  # node not already in tree index?
                self.indx[c] = Node()  # add new node

            node = self.indx[c]

            for a in aff:  # now check each successive char in affix
                if a in node.contn:
                    node = node.contn[a]  # go to existing node if found
                else:
                    new = Node()  # otherwise make new node
                    node.contn[a] = new  # and insert into tree
                    node = new  # and move down

            # at final node in tree logic

            node.condn = int(elem.pop(0))  # condition for match

            try:
                nsave = 0 if len(elem) == 0 else int(elem.pop())
            except ValueError as e:
                print(e, file=sys.stderr)
                print("*  at: [", line, "]", file=sys.stderr)
                continue  # ignore line

            resto = [Add]  # set to defaults
            recur = False  #

            mode = do[-1]  # kind of recursion
            rest = do[:-1]  # added chars to fill out root
            #           print ( 'mode=' + '<' + mode + '>' , 'rest=' , rest )
            if mode == '?':
                node.condn = 1
                resto = [Fail]  # will generate fatal error
            elif ellyChar.isDigit(mode):
                nerr += 1
                print("* bad action mode=", mode, file=sys.stderr)
                continue
            else:
                if mode == ',':  # allow recursion?
                    recur = True  # if so, change default
                if len(rest) == 1 and rest[0] == '&':
                    resto = [RestorE]
                else:
                    resto += list(rest)

            if self.addn != None:
                resto.insert(1, self.addn)  # insert AFTER first char of list
#           print ( 'resto=' , resto )

# insert action

            node.actns = Action(self, nsave, resto, recur, modf)
            node.tag()

#           if modf != '': print ( node , node.actns )

        if nerr > 0:
            print("**", nerr, "affix errors in all", file=sys.stderr)
            raise ellyException.TableFailure
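The pops above imply a whitespace-separated line layout roughly as sketched below; the field names are taken from the local variables, and the semantics of the action string are not spelled out in this excerpt.

# hypothetical layout of one input line; the tokens are placeholders,
# not entries from a real affix definition file
line = 'AFFIX CONDITION NSAVE ACTION MODIFIER'
elem = line.strip().lower().split(' ')
# the code pops the optional fifth field as the modifier, the last
# remaining field as the action, the first as the affix, and then the
# integer condition and integer nsave; lines with fewer than four fields
# are reported as incomplete input.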
Example #31
0
def scan ( buffr ):

    """
    recognize personal names in text at current position

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    def doLook ( mth , itm ):

        """
        do lookup with specified method using
        global variables in Python 2.7.*

        arguments:
            mth  - name table method
            itm  - string to look up
        """

        global _typ , _nch            # really need nonlocal
        _typ = mth(itm)
        if _typ < 0 and len(itm) > 3: # if no match, check for final '.'
            if itm[-1] == '.':
                _typ = mth(itm[:-1])
                if _typ >= 0:
                    _nch -= 1         # match without '.'

    global _typ , _nch
    global _toscan

#   print ( 'table=' , _table )
    bln = len(buffr)
    if _table == None or bln < 2: return 0
    if _toscan > 0:
        if bln > _toscan:
            return 0
        else:
            _toscan = 0

    chx = buffr[0]
#   print ( 'scan chx=' , chx )
    if not ellyChar.isLetter(chx) and chx != '(' and chx != '"': return 0

    cmps = [ ]                                 # name components this time
    ncmp = 0                                   # number of components for current name
    ninf = 0                                   # number inferred
    ntyp = len(nameTable.TYP)
    stat = [False]*ntyp                        # define state for getting personal name
    mlen = 0                                   # last match length

    bix = 0                                    # buffer index to advance in scanning
    _typ = -1
    while bix < bln:
        ltyp = -1                              # last match type
        _nch = _limit(buffr[bix:],mlen)        # length of next possible name component
#       print ( 'top _nch=' , _nch )
        if _nch == 0: return 0
        elm = _extract(buffr[bix:],_nch)       # get possible component as string
        sch = buffr[bix]
        enclosed = (sch == '(' or sch == '"')  # type of next element
        doLook(_table.lookUp,elm)              # look it up in saved name table
#       print ( 'lookUp(' , elm , ')=' , _typ )

        if _typ < 0:
            if _typ == nameTable.REJ:
                return 0                       # immediate rejection of any match
            if _typ == nameTable.STP:
                break                          # stop any more matching
            if elm[-1] == '.':                 # drop any trailing '.'
                elm = elm[:-1]
                if not enclosed:
                    _nch -= 1
            if enclosed:                       # enclosed element assumed to be name
                if not elm in _cntxt:
                    _cntxt.append(elm)         # make sure always to save in local context
                    ninf += 1                  # this is inferred!
            if elm in _cntxt:
                _typ = nameTable.XNM           # neutral name type to be noncommital

        if _typ < 0:
            tok = buffr[bix:bix + _nch]        # unknown token to check
#           print ( 'call infer with tok=' , tok )
            if infer(tok):
#               print ( 'digraph test passed' )
                _typ = nameTable.XNM           # neutral name type inferred
                if not _table.checkPhonetic(tok):
                    ninf += 1                  # count inferred component if no phonetic support
#           print ( '_typ=' , _typ )

        if nameTable.starts(_typ) and bix > 0: # if component not at start of name,
            break                              #     must stop name scan

#       print ( 'continuing bix=' , bix )
        while _typ >= 0:                       # continue as long as match is viable
            ncmp += 1                          # count up component
            cmps.append(elm)                   # save component
            bix += _nch                        # move ahead in scan
#           print ( 'bix=' , bix )
            if _typ > 0:
#               print ( '_typ=' , _typ )
                if stat[_typ]:                 # check for duplication of component type
                    if (ltyp >= 0 and
                        ltyp != _typ):         # allowed only if duplicate is consecutive
                        break
                mlen = bix                     # save index on actual match
                ltyp = _typ

            if nameTable.ends(_typ):           # if component marks end of name,
                break                          #    must stop name scan

            stat[_typ] = True                  # update match state
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                       # skip any space to start of next component

            _nch = _limit(buffr[bix:],mlen)    # length of next possible name component
            if _nch == 0: break
            elm = _extract(buffr[bix:],_nch)   # get possible next component as string
            doLook(_table.lookUpMore,elm)      # look it up in saved name table
#           print ( 'lookUpMore(' , elm , ')=' , _typ )

        if _typ < 0:                           # while-loop terminated without break
#           print ( 'ltyp=' , ltyp , 'mlen=' , mlen )
            if ltyp < 0 or mlen == 0: break
            bix = mlen                         # restart at end of last match
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                       # skip any space to start of next component
            continue

        break

#
#
#### additional constraints on acceptable personal name
#
#   print ( 'checking ltyp=' , ltyp )
    if (ltyp == nameTable.CNJ or
        ltyp == nameTable.REL):                # a name cannot end with these types
        mlen -= _nch                           # have to drop them from any match
        if mlen == 0: return 0
        if ellyChar.isWhiteSpace(buffr[mlen-1]):
            mlen -= 1
        ncmp -= 1
        cmps.pop()

#   print ( 'ncmp=' , ncmp )

    if ncmp == 0:                              # nothing matched?
        _planAhead(buffr)                      # check for possible problems in next scan
        return 0

#   print ( 'cmps=' , cmps )
    if ncmp == ninf:
        return 0                               # name cannot be purely inferred

#   print ( 'ncmp=' , ncmp )
    if ncmp == 1:                              # single-component name must be known or contextual
        if (not stat[nameTable.SNG] and
            not cmps[0] in _cntxt):
            return 0

#   print ( 'stat=' , stat[3:7] )
    expl = (stat[nameTable.PNM] or             # name must have a substantial component
            stat[nameTable.SNM] or
            stat[nameTable.XNM] or
            stat[nameTable.SNG])

#   print ( 'expl=' , expl )
    if (not expl and
        not (stat[nameTable.TTL] and           # or it could have just a title
             stat[nameTable.INI])):            #    and an initial
        return 0
#
####

#   print ( 'accepted mlen=' , mlen )
    for cmpo in cmps:                          # if whole name is OK,
        if not cmpo in _cntxt:                 #    remember all components
            _cntxt.append(cmpo)                #    not already listed in context

    return mlen                                # will be > 0 on successful match
Example #32
0
    def simpleDeinflection(self, ss, ssp, ssl, mr):
        """
        handle matching of certain forms of English inflectional endings
        (override this method for other languages)

        arguments:
            self -
            ss   - input string of chars to scan for match
            ssp  - current position in input string
            ssl  - limit of matching in input
            mr   - next chars to look for in input

        returns:
            char count >= 0 on match, -1 otherwise
        """

        self.endg = ''  # null inflection by default
        if len(mr) == 0 and ssp == ssl:
            finAPO(ss, ssp - 1)
            return 0
        if ssp < 2 or ss[ssp - 2] == ' ':
            return -1
        ts = ss[ssp:]  # where to look for inflection
        mc = ss[ssp - 1]  # last char matched
        lm = len(mr)
        #       print ts , 'mc=' , mc , 'mr=' , mr
        if not ellyChar.isLetter(mc):
            return -1
        dss = ssl - ssp  #
        #       print 'dss=' , dss
        if dss == 0:  # must handle special case here
            if lm == 0:
                finAPO(ss, ssp - 1)
                return 0
        elif dss == 1:  # just a single letter left for inflection
            if lm != 0:
                return -1
            elif ts[0].lower() == 's':
                self.endg = '-s'
                finAPO(ss, ssp)
                return 1
            elif mc == 'e' and ts[0].lower() == 'd':
                self.endg = '-ed'
                return 1
        elif dss == 2:  # 2 letters for inflection
            if lm == 0 and ts[0].lower() == 'e':
                if ts[1].lower() == 'd':
                    self.endg = '-ed'
                    return 2
                elif ts[1].lower() == 's':
                    self.endg = '-s'
                    finAPO(ss, ssp + 1)
                    return 2
        elif dss == 3:  # 3 letters for inflection
            #           print 'ts=' , ts , 'mr=' , mr
            if ts[0].lower() == 'i':
                if ts[1].lower() == 'e':
                    if lm == 1 and mr[0] == 'y':
                        if ts[2].lower() == 's':
                            self.endg = '-s'
                            return 3
                        elif ts[2].lower() == 'd':
                            self.endg = '-ed'
                            return 3
                elif ts[1].lower() == 'n' and ts[2].lower() == 'g':
                    if lm == 0 or lm == 1 and mr[0] == 'e':
                        self.endg = '-ing'
                        return 3
            if lm == 0 and ts[0].lower() == mc and ts[1].lower(
            ) == 'e' and ts[2].lower() == 'd':
                self.endg = '-ed'
                return 3
        elif dss == 4:  # 4 letters for inflection
            if lm == 0 and ts[0].lower() == mc and ts[1] == 'i' and ts[
                    2].lower() == 'n' and ts[3].lower() == 'g':
                self.endg = '-ing'
                return 4
            if ts[0].lower() == 'y' and ts[1].lower() == 'i' and ts[2].lower(
            ) == 'n' and ts[3].lower() == 'g':
                if lm == 2 and mr[0] == 'i' and mr[1] == 'e':
                    self.endg = '-ing'
                    return 4

        return -1  # something other than inflection found
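A few hedged traces of the cases handled above, assuming ssp marks the position reached by dictionary matching, ssl the end of the token in ss, and mr the unmatched residue of the dictionary entry.

# hypothetical matches, with ss = list(surface form) in each case:
#   'cats'    vs entry 'cat' : dss=1, trailing 's'           -> endg '-s'  , returns 1
#   'tries'   vs entry 'try' : dss=3, 'ies' with mr == ['y'] -> endg '-s'  , returns 3
#   'hopping' vs entry 'hop' : dss=4, doubled 'p' plus 'ing' -> endg '-ing', returns 4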
Example #33
0
    def simpleDeinflection ( self , ss , ssp , ssl , mr ):

        """
        handle matching of certain forms of English inflectional endings
        (override this method for other languages)

        arguments:
            self -
            ss   - input string of chars to scan for match
            ssp  - current position in input string
            ssl  - limit of matching in input
            mr   - next chars to look for in input

        returns:
            char count >= 0 on match, -1 otherwise
        """

        self.endg = ''       # null inflection by default
        if len(mr) == 0 and ssp == ssl:
            finAPO(ss,ssp-1)
            return 0
        if ssp < 2 or ss[ssp-2] == ' ':
            return -1
        ts = ss[ssp:]        # where to look for inflection
        mc = ss[ssp-1]       # last char matched
        lm = len(mr)
#       print ts , 'mc=' , mc , 'mr=' , mr
        if not ellyChar.isLetter(mc):
            return -1
        dss = ssl - ssp      #
#       print 'dss=' , dss
        if dss == 0:         # must handle special case here
            if lm == 0:
                finAPO(ss,ssp-1)
                return 0
        elif dss == 1:       # just a single letter left for inflection
            if lm != 0:
                return -1
            elif ts[0].lower() == 's':
                self.endg = '-s'
                finAPO(ss,ssp)
                return 1
            elif mc == 'e' and ts[0].lower() == 'd':
                self.endg = '-ed'
                return 1
        elif dss == 2:       # 2 letters for inflection
            if lm == 0 and ts[0].lower() == 'e':
                if ts[1].lower() == 'd':
                    self.endg = '-ed'
                    return 2
                elif ts[1].lower() == 's':
                    self.endg = '-s'
                    finAPO(ss,ssp+1)
                    return 2
        elif dss == 3:       # 3 letters for inflection
#           print 'ts=' , ts , 'mr=' , mr
            if ts[0].lower() == 'i':
                if ts[1].lower() == 'e':
                    if lm == 1 and mr[0] == 'y':
                        if ts[2].lower() == 's':
                            self.endg = '-s'
                            return 3
                        elif ts[2].lower() == 'd':
                            self.endg = '-ed'
                            return 3
                elif ts[1].lower() == 'n' and ts[2].lower() == 'g':
                    if lm == 0 or lm == 1 and mr[0] == 'e':
                        self.endg = '-ing'
                        return 3
            if lm == 0 and ts[0].lower() == mc and ts[1].lower() == 'e' and ts[2].lower() == 'd':
                self.endg = '-ed'
                return 3
        elif dss == 4:       # 4 letters for inflection
            if lm == 0 and ts[0].lower() == mc and ts[1] == 'i' and ts[2].lower() == 'n' and ts[3].lower() == 'g':
                self.endg = '-ing'
                return 4
            if ts[0].lower() == 'y' and ts[1].lower() == 'i' and ts[2].lower() == 'n' and ts[3].lower() == 'g':
                if lm == 2 and mr[0] == 'i' and mr[1] == 'e':
                    self.endg = '-ing'
                    return 4

        return -1    # something other than inflection found
Example #34
0
    def __init__ ( self , inpr ):

        """
        define table from text input

        arguments:
            self  -
            inpr  - EllyDefinitionReader

        throws:
            TableFailure on table definition failure
        """

        self.pres  = { }
        self.posts = { }
        self.dictn = { }
        self.phone = [ ]
        self.compn = ''
        self._nerr = 0

#       print 'TYP=' , TYP

        while True:
            lin = inpr.readline().lower()    # ignore capitalization
            if len(lin) == 0: break

            if lin[0] == '=':                # phonetic entry?
                lin = lin[1:]                # if so, remove marker
                first = ''
                if lin[0] == 'a':            # vowel is first?
                    first = 'a'              # if so, remove it
                    lin = lin[1:]
                pho = first + lin.upper()    # combine any vowel with uppercase rest
                self.phone.append(pho)       # save in phonetic list
                continue

            lins = lin.strip().split(':')

            if len(lins) != 2:               # type definition must have two parts
                self._err(lne=lin)
                continue

            typ = lins[1].strip()            # get component type
            if not typ in TYP:
                self._err('bad name component type',lin)
                continue
            cod = TYP[typ]

            els = lins[0].strip().split(' ') # name component

#           print 'type=' , '"' + typ + '"' , els

            lim = len(els)

            if lim == 1:
                cmpo = els[0]
                chf = cmpo[0]                # first char of component
                chl = cmpo[-1]               # last  char
                if chf == '-' or chf == '+':
                    if not ellyChar.isLetter(chl) or len(cmpo) < 3:
                        self._err('bad end of name',lin)
                        continue
                    dky = cmpo[-2:]          # dictionary key is 2 chars only
                    if not dky in self.posts:
                        self.posts[dky] = [ ]
                    self.posts[dky].append([ cmpo[1:] , cod , (chf == '+') ])
                elif chl == '-' or chl == '+':
                    if not ellyChar.isLetter(chf) or len(cmpo) < 3:
                        self._err('bad start of name',lin)
                        continue
                    dky = cmpo[:2]           # dictionary key is 2 chars only
                    if not dky in self.pres:
                        self.pres[dky] = [ ]
                    self.pres[dky].append([ cmpo[:-1] , cod , (chl == '+') ])
                else:
                    self.dictn[cmpo] = cod
                    if cmpo[-1] == '.':      # if ending with '.' , also save without
                        self.dictn[cmpo[:-1]] = cod
                continue

            Nix = 1
            while Nix <= lim:                # process elements of name component
                cmpo = ' '.join(els[0:Nix])
                Nix += 1
                if cmpo not in self.dictn:   # first Nix elements
                    self.dictn[cmpo] = CND
            if self.dictn[cmpo] != CND:
                self._err('name component redefined',lin)
                continue
            self.dictn[cmpo] = TYP[typ]      # put into table

        if self._nerr > 0:
            print >> sys.stderr , '**' , self._nerr, 'name errors in all'
            print >> sys.stderr , 'name table definition FAILed'
            raise ellyException.TableFailure
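A hedged sketch of the kinds of definition lines parsed above; the words and TYPE are placeholders, and the valid type names come from TYP, which is not shown in this excerpt.

# hypothetical definition lines (placeholder words and TYPE):
#
#     =SMYTH          phonetic entry, marked by the leading '='
#     smith : TYPE    full name component with its type taken from TYP
#     mc+ : TYPE      prefix entry; '+' asks _find (example #3) to also
#                     recognize the remainder of a component
#     -son : TYPE     suffix entry; a '-' marker drops that requirement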
Example #35
0
    def apply(self, token, extra=None):
        """
        apply inflectional stemming logic against token

        arguments:
            self  -
            token - input token
            extra - extra token char for any restoration
        returns:
            status code
        exceptions:
            StemmingError
        """

        last = None  # save last popped letter

        if len(self.table) < 2:  # check for empty table
            return isNOTM  # if so, no match
#       print ( 'at' , self.table[0] , extra )
        it = 0  # stemming logic index
        word = token.root  # list of letters in token word
        m = len(word)  # end of word
        seq = self.table[it]  # suffix to match
        it += 1  #
        n = len(seq)  # ending length to match
        if n >= m:  #
            return isNOTM  # word not long enough for ending
        msh = m - n

        # check that table is right one for word ending

        ew = m  # just past end of token word

        #       print ( "suffix length= ", n, ", word length= ", m )
        if n > 0:
            for ix in range(n):
                ew -= 1
                #               print ( word[ew], " cmp ", seq[ix] )
                if word[ew] != seq[ix]:
                    return isNOTM
            ew -= 1
#           print ( "first char before suffix=" , end=' ')
#           print ( '[' +  ( word[ew] if ew >= 0 else None ) + ']' )

# interpret table logic

        last = seq[-1] if n > 0 else extra
        word = word[:msh]  # copy of word up to removed suffix
        #       print ( 'word=' , word )
        if not ellyChar.isLetter(word[-1]):
            return isNOTM

        while True:  # advance through logic until success or failure

            opcode = self.table[it]  # next operation code to interpret
            it += 1  #
            #           print ( "opcode=", opcode )

            if opcode < 0:  # YE(S) on match with possible modifications

                # word satisfies conditions for ending removal

                word = token.root[:msh]  # word without ending
                #               print ( 'word=', word )
                #               print ( 'add or drop extra chars' )
                nm = YE - opcode  # get removal count from opcode
                #               print ( 'nm=', nm )
                if nm < 0:  # any special restoration?
                    if last == None:
                        print('FATAL stemming logic error', file=sys.stderr)
                        sys.stdout.flush()
                        sys.exit(1)
#                   print ( 'restore' , '[' + last + ']' )
                    word.append(
                        last)  # negative count restores last removed letter
                else:
                    #                   print ( 'drop' , nm , 'from [' , word , ']' )
                    while nm > 0:  # otherwise drop additional letters
                        if len(word) == 0:
                            print('FATAL stemming logic error',
                                  file=sys.stderr)
                            sys.stdout.flush()
                            sys.exit(1)
                        last = word.pop()
                        nm -= 1

#               print ( 'extend=' , self.table[it] ) # append more chars
                word.extend(self.table[it])

                token.root = word  # replace token with stemmed result
                #               print ( 'word=' , word )
                #               print ( 'root=' , token.root )

                return isMTCH  # success flag

            elif opcode == NO:  # no match

                #               print ( "fail!" )
                return isNOTM

            elif opcode == IF:

                # enter logic block if a char sequence matches

                seq = self.table[it + 1]
                #               print ( 'seq=' , seq , 'ew=' , ew , 'word=' , word[:ew] )
                sln = len(seq)
                if sln > len(word):  # enough chars to match?
                    it += self.table[it]  # if not, skip over block of logic
                else:
                    k = 0
                    #                   j = -1
                    #                   print ( 'at' , j , word[] )
                    while k < sln and word[-k - 1] == seq[k]:
                        #                       print ( 'word[' + str(k) + ']=' , word[j] )
                        k += 1
#                       j -= 1
#                   print ( 'k=' , k )
                    if k < sln:  # any characters unmatched?
                        #                       print ( 'IF no match' )
                        it += self.table[it]  # if so, skip over block of logic
                    else:
                        #                       print ( 'IF match' )
                        it += 2  # otherwise, enter logic block
                        word = word[:-sln]  # update index in word

            elif opcode == IS:

                # check whether next character is in a specified set

                if len(word) <= 0:  # any letters left in word?
                    it += self.table[it]  # if not, skip over block
                    continue
                chs = self.table[it + 1]  # get character set
                c = word[-1]
                #               print ( c, ':', chs )
                if chs.find(c) < 0:  # word character in set?
                    it += self.table[it]  # if not, skip block
                else:
                    it += 2  # if so, enter block

            elif opcode < Nlen:

                # check length of word

                k = self.table[it + 1]  # comparison length
                #               print ( "k= ", k, " : m= ", m , 'opcode=' , opcode )
                if opcode == LT:  # set match flag for type of comparison
                    match = (m < k)  #
                elif opcode == GT:  #
                    match = (m > k)  #
                elif opcode == EQ:  #
                    match = (m == k)  #
                elif opcode == NE:  #
                    match = (m != k)  #
                else:
                    return isNOTM

#               print ( "match= ", match )

                if not match:  # if no match, skip block
                    it += self.table[it]
                else:  # otherwise, go into logic of block
                    it += 2  #

            elif opcode == MO:  # continue to another logic table

                token.root = token.root[:msh]
                #               print ( 'for more, set root=' , token.root )
                return doMORE  # let other table figure out what to do

            elif opcode == VO:  # look for CVC pattern at end of stemming
                # and possibly restore -E

                word = token.root[:msh]  # strip ending from end of word
                #               print ( 'vowel check for' , word )

                me = len(word) - 2  # at possible vowel in stemming result
                # last char assumed to be consonant

                #               print ( 'me=' , me )
                if me < 0 or ellyChar.isStrongConsonant(word[me]):
                    token.root = word
                    return isMTCH

                me -= 1  # vowel found; now check for consonant
                #               print ( 'me=' , me )

                if me < 0 or ellyChar.isStrictVowel(word[me]):
                    return isMTCH
                if me <= 0 or word[me] != 'u' or word[me - 1] == 'q':
                    word.append('e')  # put back -E
                token.root = word
                #               print ( 'final word=' , word )
                return isMTCH

            else:

                return isNOTM

        raise ellyException.StemmingError
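
To make the table-driven idea above concrete, here is a minimal, self-contained sketch of conditional suffix removal. The helper strip_suffix, the rule format, and the sample rules are invented for illustration and do not reflect pyelly's actual opcode encoding, which additionally restores removed letters and checks vowel patterns as shown above.

def strip_suffix(word, suffix, min_stem, repl):
    """Remove suffix and append repl if enough stem would remain."""
    if word.endswith(suffix) and len(word) - len(suffix) >= min_stem:
        return word[:-len(suffix)] + repl
    return word

# hypothetical rule table: (suffix, minimum stem length, replacement)
RULES = [ ('ies', 2, 'y') , ('ing', 3, '') , ('ed', 3, '') ]

for w in [ 'parties', 'hopped', 'wanted', 'red' ]:
    for sfx, mn, rp in RULES:
        stemmed = strip_suffix(w, sfx, mn, rp)
        if stemmed != w:
            print(w, '->', stemmed)   # parties -> party, but hopped -> hopp
            break

The hopped -> hopp result is a reminder of why the interpreter above bothers with letter restoration and a final vowel check.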
Example #36
0
def _limit ( buffr , hstry ):

    """
    get length of next possible name component in buffer

    arguments:
        buffr - list of chars
        hstry - how much matched already

    returns:
        number of chars in continuation of last component, 0 for no next component
    """

    lnb = len(buffr)
    if lnb == 0: return 0

    bix = 0
    quot = False                           # indicate component starting with "
    parn = False                           #                             with (
    cmma = False                           #                             with ,
#   print '_limit buffr=' , buffr , 'hstry=' , hstry
    if buffr[0] == ',':                    # handle possible leading comma
        if hstry == 0 or lnb < 4: return 0
        bix += 1
        if ellyChar.isWhiteSpace(buffr[1]):
            bix += 1
        cmma = True
#       print 'for comma, bix=' , bix

    if buffr[bix] == '(':                  # handle short name in parentheses
        bix += 1
        parn = True
    if buffr[bix] == '"':                  # handle short name in double quotes
        bix += 1
        quot = True
#       print 'parn=' , parn , 'quot=' , quot
    if parn or quot:
#       print 'enclosed component from' , buffr[bix:]
        while bix < lnb:                   # collect letters for name
            chx = buffr[bix]
            if ellyChar.isWhiteSpace(chx):
                break
            elif not quot and parn and chx == ')':
                return bix + 1             # add trailing parenthesis
            elif quot and chx == '"':
                if bix + 1 < lnb and parn and buffr[bix+1] == ')':
                    return bix + 2         # add trailing quote and parenthesis
                elif not parn:
                    return bix + 1         # add trailing quote only
                else:
                    return 0               # no match
            elif chx == '.':
                return bix + 1             # add trailing period
            elif not ellyChar.isLetter(chx):
                break                      # unrecognizable char for name
            bix += 1
#       print 'no closure'
        return 0
    else:
#       print 'find component in' , buffr[bix:]
        while bix < lnb:
            chx = buffr[bix]               # collect letters for name
#           print 'chx=' , chx
            if chx == "'":
                if bix + 2 < lnb:
                    chn = buffr[bix+1]
                    if ellyChar.isWhiteSpace(chn):
                        break
                    if chn == 's' and not ellyChar.isLetter(buffr[bix+2]):
                        break
            elif not ellyChar.isLetter(chx):
                if chx == '.':
                    bix += 1
#                   print 'increment bix=' , bix
                break
            bix += 1

        if bix == lnb:

#           print 'ran out of chars'
            return bix                     # running out of chars means match

        else:

#           getting here means that more text follows limit
#           and so we may have to pick up extra chars here

            chx = buffr[bix]
#           print 'next chx=' , chx , 'bix=' , bix
            if ellyChar.isWhiteSpace(chx) or chx == "'":
                return bix                 # component can be terminated by space or (')
            elif chx == ',':
                if cmma:
                    return bix + 1         #     or comma when sequence starts with comma
                else:
                    return bix             #              when there is no starting comma
            elif ellyChar.isLetter(chx):
                return bix                 #     or letter, implying previous char was '.'
            else:
                return 0                   # failure to find name limit
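
To show how such a limit function is typically used, here is a heavily simplified stand-in that handles only the plain-letters case plus an abbreviation period; simple_limit and the sample inputs are invented for illustration and ignore the quote, parenthesis, comma, and apostrophe handling above.

def simple_limit(buffr):
    """Count leading letters plus an optional trailing abbreviation period."""
    n = 0
    while n < len(buffr) and buffr[n].isalpha():
        n += 1
    if 0 < n < len(buffr) and buffr[n] == '.':
        n += 1                        # keep the period with the component
    return n

print(simple_limit(list('Smith, John')))   # 5 -> 'Smith'
print(simple_limit(list('Dr. Jones')))     # 3 -> 'Dr.'
print(simple_limit(list(', and')))         # 0 -> no component here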
Example #37
0
    def match(self, txt, pnc, ctx):
        """
        compare a punctuation mark and its context with a pattern

        arguments:
            self  -
            txt   - list of text chars leading up to punctuation char
            pnc   - punctuation char
            ctx   - next chars after punctuation

        returns:
            True on match, False otherwise
        """

        #       print ( 'matching for txt=' , txt , 'pnc= [' , pnc , ' ] ctx=' , ctx )

        if matchtoo(txt, pnc, ctx):  # exception by complex match?
            return True
#       print ( 'matchtoo() returned False' )

        sep = ctx[0] if len(ctx) > 0 else ''
        if sep == ellyChar.THS:
            return True
        nxt = ctx[1] if len(ctx) > 1 else ''

        #       print ( 'lstg=' , self.lstg.keys() )
        if not pnc in self.lstg:  # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

        #       print ( len(lp) , 'patterns' )

        ltx = len(txt)  # current length of accumulated text so far
        ntr = 1
        while ntr <= ltx:
            if not ellyChar.isLetterOrDigit(txt[-ntr]):
                break
            ntr += 1
        nrg = ntr
        ntr -= 1  # available trailing chars for  wildcard * match

        while nrg <= ltx:
            c = txt[-nrg]
            if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
                #               print ( 'break at nrg=' , nrg , txt[-nrg] )
                break
            nrg += 1
        nrg -= 1  # end of range for all pattern matching

        #       print ( 'ntr=' , ntr , 'nrg=' , nrg )

        txt = txt[-nrg:]  # reset text to limit for matching
        ltx = len(txt)  # its new length

        #       print ( 'txt= ' + str(txt) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']' )

        for p in lp:  # try matching each listed exception pattern

            if p.left != None and len(p.left) > 0:

                pat = p.left
                star = pat[-1] == ellyWildcard.cALL
                n = len(pat)  # each pattern element matches one sequence char
                if star:  # except for a final wildcard *
                    #                   print ( 'pattern ending with *' )
                    n -= 1
                    #                   print ( 'ltx=' , ltx , 'n=' , n )
                    if ltx < n:
                        continue  # cannot match pattern properly
                    pat = pat[:-1]
                    t = txt[:n]
                else:
                    if ltx < n:
                        continue  # cannot match pattern properly
                    t = txt[-n:]

                if not ellyWildcard.match(pat, t, 0):
                    #                   print ( 'no possible pattern match' )
                    continue

                k = ltx - n  # extra chars beyond any match
                #               print ( 'k=' , k , 't=' , t )
                #               print ( 'txt=' , txt )
                #               print ( 'pat=' , '[' + ellyWildcard.deconvert(pat) + ']' )
                #               print ( 'matches' , n , 'chars' )
                if not star and k > 0:
                    #                   print ( 'check text before [' , txt[-n] , ']' )
                    if ellyChar.isLetterOrDigit(txt[-n]):
                        c = txt[-n - 1]
                        #                       print ( 'preceding= [', c , ']' )
                        if ellyChar.isLetterOrDigit(c) or c == '&':
                            continue  # because break in text is required

#           print ( 'pat=' , ellyWildcard.deconvert(p.left) )
#           print ( 'n=' , n , 'ltx=' , ltx )
#           print ( 'txt=' , txt )

#           nc = '\\n' if nxt == '\n' else nxt
#           print ( 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']' )
#           print ( 'versus c=' , nc )

            rp = p.right
            if rp == [] or rp[0] == ellyWildcard.cALL:
                return True
            pcx = rp[0]
            if pcx == nxt:  # check for specific char after possible stop )
                #               print ( 'right=' , nxt )
                return True
            elif pcx == ellyWildcard.cALF:  # check for alphabetic
                if ellyChar.isLetter(nxt):
                    #                   print ( 'right is alphabetic=' , nxt )
                    return True
            elif pcx == ellyWildcard.cDIG:  # check for numeric
                if ellyChar.isDigit(nxt):
                    #                   print ( 'right is numeric=' , nxt )
                    return True
            elif pcx == ellyWildcard.cUPR:  # check for upper case
                if ellyChar.isUpperCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cLWR:  # check for lower case
                if ellyChar.isLowerCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cCAN:  # check for non-alphanumeric
                if ellyChar.isLetter(nxt):
                    #                   print ( 'right is alphabetic=' , nxt )
                    return True

#       print ( "no matches" )
        return False
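
As a rough, self-contained illustration of the same idea — stored exceptions keyed by punctuation and checked against both left and right context — here is a toy version. The EXCEPTIONS table, the 'digit' convention, and is_exception are all invented and much weaker than the wildcard patterns used above.

EXCEPTIONS = { '.': [ ('Mr', ''), ('Dr', ''), ('No', 'digit') ] }

def is_exception(txt, pnc, nxt):
    """Return True if txt + pnc + nxt looks like a stored non-stop exception."""
    for left, right in EXCEPTIONS.get(pnc, []):
        if not txt.endswith(left):
            continue
        before = txt[:-len(left)][-1:]     # char just before the match, if any
        if before.isalnum():
            continue                       # a break in the text is required
        if right == '' or (right == 'digit' and nxt.isdigit()):
            return True
    return False

print(is_exception('Mr', '.', ' '))     # True  - abbreviation, not a stop
print(is_exception('No', '.', '7'))     # True  - "No. 7"
print(is_exception('drum', '.', ' '))   # False - ordinary sentence stop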
Example #38
0
    def match(self, token):
        """
        compare an Elly token against a tree
        and possibly modify that token after a match
        arguments:
            self  -
            token - input Elly token
        returns:
            True on match, False otherwise
        """

        #       print ( token )

        rec = True  # recursion flag
        suc = False  # success   flag

        while rec:  # continue comparisons recursively while flag is True

            #           print ( 'token root=' , token.root )
            if len(token.root) < 3: break  # stop if token root is too short

            dlt = 0 if ellyChar.isLetter(token.root[0]) else -2

            seq = self.sequence(token.root) + [Bound]  # token sequence plus sentinel

            #           print ( type(self).__name__ ,'seq=' , seq )

            chs = seq[0]  # first char in sequence to match
            if not chs in self.indx:
                return suc

            nod = self.indx[chs]  # starting node in tree
            #           print ( 'start nod.id=' , nod.id , '/' , (Node.Ni - 1) )
            lvl = 0  # level in tree
            mst = []  # match stack
            lmt = len(seq) + dlt  # sequence length = maximum possible match
            #           print ( 'lmt=' , lmt , 'seq=' , seq )

            while True:
                #               print ( 'nod=' , nod )
                if nod.actns != None:  # at node with action?
                    mst.append([nod, lvl])  # if so, save it on stack
                lvl += 1
                #               print ( 'lvl=' , lvl )
                if lvl == lmt: break  # continue comparing to end of token
                ch = seq[lvl]
                #               print ( 'ch=' , ch , 'contn=' , nod.contn.keys() )
                if not ch in nod.contn: break  # quit on mismatch
                nod = nod.contn[ch]  # go down to next node in tree

#           for mr in mst:
#               print ( ' |' , mr[0].id , mr[1] )

            while len(mst) > 0:  # match stack empty?

                mr = mst.pop()  # if not, get most recent match
                nod = mr[0]
                nom = mr[1] + 1

                uch = seq[nom] if nom < lmt else '_'  # first unmatched char

                con = nod.condn  # node condition for accepting  match

                nln = lmt - nom
                nln += nod.delta(nln)  # how chars expected after action
                #               print ( 'lmt=' , lmt , 'nom=' , nom )
                #               print ( 'check rule=' , nod.id )
                #               print ( 'con=' , con , 'nln=' , nln )
                #               print ( 'uch=' , uch )

                if con != 0 and nln < 3:  # this must leave at least 2 letters
                    continue  #   plus sentinel!

#               print ( 'condition' )
                if con == 0:  # accept match with no action?
                    #                   print ( '0 condition' )
                    return suc  # if so, done

                elif con == 1:  # unconditionally accept?
                    break  # if so, act on this match

                elif con == 2:  # first unmatched is consonant?
                    if uch != '|' and not ellyChar.isVowel(uch):
                        break  # if so, act on match

                elif con == 3:  # first unmatched is consonant or U?
                    if not ellyChar.isStrictVowel(uch):
                        break  # if so, act on match

            else:  ## if loop NOT terminated by break, no acceptable match
                #               print ( 'suc=' , suc , '@' )
                return suc  # we are done

            suc = True  # note acceptable match

            #           print ( 'nod.id=' , nod.id , '/' , (Node.Ni - 1) )

            #
            # take action for longest accepted match
            #

            #           print ( '1 token=' , token )
            self.rewrite(token, nom, nod)  # take action for node
            rec = nod.actns.recur  # update recursion flag

#           print ( '2 token=' , token )
#           print ( 'rec=' , rec )

#       print ( 'suc=' , suc )
        return suc
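
A compact way to see the match-stack idea is to walk a character trie over the reversed token, remember every node that carries an action, and act on the deepest one found. The nested-dict trie and 'ACTION' key below are hypothetical simplifications of pyelly's Node and Action machinery.

TRIE = { 'g': { 'n': { 'i': { 'ACTION': '-ing' } } },
         'd': { 'e': { 'ACTION': '-ed' } } }

def longest_suffix(word):
    """Return the longest registered suffix ending word, or None."""
    node, found = TRIE, None
    for ch in reversed(word):
        if ch not in node:
            break
        node = node[ch]
        if 'ACTION' in node:
            found = node['ACTION']     # deepest action seen so far wins
    return found

print(longest_suffix('walking'))       # -ing
print(longest_suffix('tied'))          # -ed
print(longest_suffix('walk'))          # None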
Example #39
0
def _limit ( buffr , hstry ):

    """
    get length of next possible name component in buffer

    arguments:
        buffr - list of chars
        hstry - how much matched already

    returns:
        number of chars in continuation of last component, 0 for no next component
    """

    lnb = len(buffr)
    if lnb == 0: return 0

    bix = 0
    quot = False                           # indicate component starting with "
    parn = False                           #                             with (
    cmma = False                           #                             with ,
#   print ( '_limit buffr=' , buffr , 'hstry=' , hstry )
    if buffr[0] == ',':                    # handle possible leading comma
        if hstry == 0 or lnb < 4: return 0
        bix += 1
        if ellyChar.isWhiteSpace(buffr[1]):
            bix += 1
        cmma = True
#       print ( 'for comma, bix=' , bix )

    if buffr[bix] == '(':                  # handle short name in parentheses
        bix += 1
        parn = True
    if buffr[bix] == '"':                  # handle short name in double quotes
        bix += 1
        quot = True
#       print ( 'parn=' , parn , 'quot=' , quot )
    if parn or quot:
#       print ( 'enclosed component from' , buffr[bix:] )
        while bix < lnb:                   # collect letters for name
            chx = buffr[bix]
            if ellyChar.isWhiteSpace(chx):
                break
            elif not quot and parn and chx == ')':
                return bix + 1             # add trailing parenthesis
            elif quot and chx == '"':
                if bix + 1 < lnb and parn and buffr[bix+1] == ')':
                    return bix + 2         # add trailing quote and parenthesis
                elif not parn:
                    return bix + 1         # add trailing quote only
                else:
                    return 0               # no match
            elif chx == '.':
                return bix + 1             # add trailing period
            elif not ellyChar.isLetter(chx):
                break                      # unrecognizable char for name
            bix += 1
#       print ( 'no closure' )
        return 0
    else:
#       print ( 'find component in' , buffr[bix:] )
        while bix < lnb:
            chx = buffr[bix]               # collect letters for name
#           print ( 'chx=' , chx )
            if chx == "'":
                if bix + 2 < lnb:
                    chn = buffr[bix+1]
                    if ellyChar.isWhiteSpace(chn):
                        break
                    if chn == 's' and not ellyChar.isLetter(buffr[bix+2]):
                        break
            elif not ellyChar.isLetter(chx):
                if chx == '.':
                    bix += 1
#                   print ( 'increment bix=' , bix )
                break
            bix += 1

        if bix == lnb:

#           print ( 'ran out of chars' )
            return bix                     # running out of chars means match

        else:

#           getting here means that more text follows limit
#           and so we may have to pick up extra chars here

            chx = buffr[bix]
#           print ( 'next chx=' , chx , 'bix=' , bix )
            if ellyChar.isWhiteSpace(chx) or chx == "'":
                return bix                 # component can be terminated by space or (')
            elif chx == ',':
                if cmma:
                    return bix + 1         #     or comma when sequence starts with comma
                else:
                    return bix             #              when there is no starting comma
            elif ellyChar.isLetter(chx):
                return bix                 #     or letter, implying previous char was '.'
            else:
                return 0                   # failure to find name limit
Example #40
0
    def build ( self , inp ):

        """
        build tree logic from definition reader input

        arguments:
            self  -
            inp   - definition text for logic

        exceptions:
            TableFailure on error
        """

        if inp == None:
            return

        nerr = 0                   # error count

        # read in affixes and associated actions

        while True:

            line = inp.readline()  # next input line
            if line == u'':        # check for EOF
                break

            modf = ''
            elem = line.strip().lower().split(' ')
#           print 'elem=' , elem
            le = len(elem)
            if le < 4:
                nerr += 1
                print >> sys.stderr , "** affix error: incomplete input"
                print >> sys.stderr , "*  at: [" , line , "]"
                continue                  # skip incomplete line
            if le > 4:                    # affix mod specified?
                modf = elem.pop()         # if so, get it
#               print elem[0] , modf
            do = elem.pop()               # note main action 

            # get affix within definition line

            aff = list(elem.pop(0))       # affix as list of chars

            # check for proper form

            aff = self.sequence(aff)      # backward or forward  matching?
#           print 'aff=' , aff

            c = aff[0]                    # get first char to compare with
            aff = aff[1:]

            if not ellyChar.isLetter(c):  # affix starts with letter?
                nerr += 1
                print >> sys.stderr , "** affix error: must start with letter"
                print >> sys.stderr , "*  at: [" , line , "]"
                continue                  # ignore line

            if not c in self.indx:        # node not already in tree index?
                self.indx[c] = Node()     # add new node

            node = self.indx[c]

            for a in aff:                 # now check each successive char in affix
                if a in node.contn:
                    node = node.contn[a]  # go to existing node if found
                else:
                    new = Node()          # otherwise make new node
                    node.contn[a] = new   # and insert into tree
                    node = new            # and move down

            # at final node in tree logic

            node.condn = int(elem.pop(0)) # condition for match

            try:
                nsave = 0 if len(elem) == 0 else int(elem.pop())
            except ValueError , e:
                print >> sys.stderr , e
                print >> sys.stderr , "*  at: [" , line , "]"
                continue                  # ignore line
             
            resto = [ Add ]               # set to defaults
            recur = False                 #

            mode  = do[-1]                # kind of recursion
            rest  = do[:-1]               # added chars to fill out root
#           print 'mode=' + '<' + mode + '>' , 'rest=' , rest
            if mode == u'?':
                node.condn = 1
                resto = [ Fail ]          # will generate fatal error 
            else:
                if mode == ',':           # allow recursion?
                    recur = True          # if so, change default
                if len(rest) == 1 and rest[0] == '&':
                    resto = [ RestorE ]
                else:
                    resto += list(rest)

            if self.addn != None:
                resto.insert(1,self.addn) # insert AFTER first char of list
#           print 'resto=' , resto

            # insert action

            node.actns = Action(self,nsave,resto,recur,modf)
            node.tag()
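
The heart of the build step is ordinary trie insertion: one node per affix character, with the action data attached to the final node. Here is a minimal sketch with invented field names; it builds the same kind of nested-dict trie that the longest_suffix sketch after Example #38 walks.

def insert(root, affix, action):
    """Insert affix chars into a nested-dict trie and attach action at the end."""
    node = root
    for ch in affix:
        node = node.setdefault(ch, {})    # reuse or create the next level
    node['ACTION'] = action               # final node says what to do on a match

tree = { }
insert(tree, 'gni', { 'drop': 3, 'add': '' })   # '-ing', stored in matching order
insert(tree, 'de',  { 'drop': 2, 'add': '' })   # '-ed'
print(sorted(tree))                             # ['d', 'g']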
Example #41
0
    def _lookUpNext(self):
        """
        look up possible next segments in input buffer by various means,
        keeping tokens only for the LONGEST segment

        arguments:
            self

        returns:
            True on successful lookup, False otherwise

        exceptions:
            ParseOverflow
        """

        self.sbu.skipSpaces()  # skip leading spaces
        s = self.sbu.buffer
        #       print ( '_lookUp@0 buffer=' , s )

        if len(s) == 0:  # check for end of input
            return False  # if so, done

#       print ( 'in =' , str(self.sbu) )
        if self.trs != None:  # preanalysis of number expressions
            self.trs.rewriteNumber(s)

#       print ( '_lookUp@1 buffer=' , self.sbu.buffer )
#       print ( 'macro expansion s[0]=' , s[0] )
        self.sbu.expand()  # apply macro substitutions
        #       print ( 'macro expanded  s[0]=' , s[0] )
        #       print ( '_lookUp@2 buffer=' , self.sbu.buffer )

        s = self.sbu.buffer

        #       print ( 'expanded len=' , len(s) )
        if len(s) == 0: return True  # macros can empty out buffer

        k = self.sbu.findBreak()  # find extent of first component for lookup
        if k == 0:
            k = 1  # must have at least one char in token

#       print ( 'break at k=' , k )
        kl = len(s)
        if k + 1 < kl and s[k] == '+' and s[k + 1] == ' ':
            k += 1  # recognize possible prefix

#       print ( 'len(s)=' , kl , 'k=' , k , 's=', s )

#       print ( '_lookUp@3 buffer=' , self.sbu.buffer )
        mr = self._scanText(k)  # text matching in various ways
        mx = mr[0]  # overall maximum match length
        chs = mr[1]  # any vocabulary element matched
        suf = mr[2]  # any suffix removed in matching
        #       print ( '_lookUp@4 buffer=' , self.sbu.buffer )
        s = self.sbu.buffer
        #       print ( 'k=' , k )
        #       print ( 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf )
        #       print ( 'len(s)=' , len(s) , 's=' , s )

        if k < mx or (k == mx and suf != ''):  # next word cannot produce token as long as already seen?

            #           print ( 'queue:' , len(self.ptr.queue) )
            #           print ( 'chs=' , chs )
            if len(chs) > 0:  # any vocabulary matches?
                #               print ( 'put back' , suf , mx , s )
                self.sbu.skip(mx)  # if so, they supersede
                if suf != '':  # handle any suffix removal
                    self.sbu.prepend(list(suf))
#                   print ( 'suf=' , suf )
            else:
                chs = self.sbu.extract(mx)

#           print ( 'extract chs=' , chs )
            to = ellyToken.EllyToken(chs)
            #           print ( 'token=' , str(to) )
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True  # must note suffix removal for token!
#           print ( 'only queue:' , len(self.ptr.queue) )
            return True

#       print ( 'mx=' , mx )
#       print ( 'plus queue:' , len(self.ptr.queue) )
        wsk = self.sbu.buffer[:k]
        cap = ellyChar.isUpperCaseLetter(wsk[0])
        #       print ( 'wsk=' , wsk )
        rws = ''.join(wsk)
        found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
        if not found:
            if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
                k -= 1
                rws = rws[:-1]
                found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
#       print ( rws , 'found in dictionary=' , found )
        if found or mx > 0:  # match found in dictionary or by text scan
            if not found:
                k = mx  # if by text scan, must make token longer
                rws = rws[:k]  # if mx > k
            self.sbu.skip(k)
            #           print ( 'next=' , self.sbu.buffer[self.sbu.index:] )
            #           print ( 'queue after =' , len(self.ptr.queue) )
            to = ellyToken.EllyToken(rws[:k])
            if len(suf) > 1:  # change token to show suffix properly
                #               print ( 'suf=' , suf )
                cs = suf[1]  # first char in suffix after '-'
                rt = to.root  # this is a list!
                lk = -1  # start at last char in token
                while rt[lk] != cs:
                    lk -= 1
                sn = len(rt) + lk  # where to divide suffix from root
                #               print ( 'sn=' , sn , rt )
                to.root = rt[:sn]  # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            else:  # no suffix
                chx = self.sbu.peek()  # look at next char after match
                if chx == '-':  # if hyphen, need to separate it
                    self.sbu.skip()
                    if ellyChar.isLetter(self.sbu.peek()):
                        self.sbu.prepend(' ')
                    self.sbu.prepend('-')
#           print ( 'add' , str(to) )
            self.ctx.addTokenToListing(to)  # add token to listing for sentence
            return True

#       print ( '[' + rws + ']' , 'still unrecognized' )

        chx = rws[0]  # special hyphen check
        if chx == '-' and k > 1:
            #           print ( 'look in  internal dictionary' )
            if self.ptr.createPhrasesFromDictionary(chx, False, False):
                #               print ( 'found!' )
                to = ellyToken.EllyToken(chx)  # treat hyphen as token
                self.ctx.addTokenToListing(to)  # add it to token list
                self.sbu.skip()  # remove from input
                return True

        to = self._extractToken(mx)  # single-word matching with analysis and lookup

        #       print ( 'extracted to=' , str(to) )
        if to == None:  # if no match, we are done and will return
            #           print ( 'mx=' , mx )
            return False if mx == 0 else True  # still success if _scanText() found something
        if self.ptr.lastph != None:
            self.ptr.lastph.lens = to.getLength()

#       print ( 'to=' , str(to) , 'len(s)=' , len(s) , s )
#       posn = self.ctx.countTokensInListing()
#       print ( 'at', posn , 'in token list' )
        self.ctx.addTokenToListing(to)  # add token to listing for sentence
        #       tol = self.ctx.getNthTokenInListing(-1)
        #       print ( 'last token root=' , tol.root )
        return True  # successful lookup
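
The policy of keeping only the LONGEST segment can be shown in isolation: several lookup methods each propose a match length at the front of the buffer, the longest proposal wins, and that many chars become the next token. The matcher functions below are made-up stand-ins for the vocabulary, pattern, and entity lookups used above.

def digit_run(b):
    """Length of the leading run of digits in char list b."""
    return next((i for i, c in enumerate(b) if not c.isdigit()), len(b))

def letter_run(b):
    """Length of the leading run of letters in char list b."""
    return next((i for i, c in enumerate(b) if not c.isalpha()), len(b))

def next_token(buffr, matchers):
    """Return (token, rest) using the longest match proposed by any matcher."""
    best = max((m(buffr) for m in matchers), default=0)
    if best == 0:
        best = 1                       # always consume at least one char
    return ''.join(buffr[:best]), buffr[best:]

buf = list('42nd street')
tok, buf = next_token(buf, [digit_run, letter_run])
print(tok)                             # 42 - the digit run is the longer proposal
tok, buf = next_token(buf, [digit_run, letter_run])
print(tok)                             # nd - then the letter run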
Example #42
0
File: ellyBase.py  Project: prohippo/pyelly
    def _lookUpNext ( self ):

        """
        look up possible next segments in input buffer by various means,
        keeping tokens only for the LONGEST segment

        arguments:
            self

        returns:
            True on successful lookup, False otherwise

        exceptions:
            ParseOverflow
        """

        self.sbu.skipSpaces()          # skip leading spaces
        s = self.sbu.buffer
#       print '_lookUp@0 buffer=' , s

        if len(s) == 0:                # check for end of input
            return False               # if so, done

#       print 'in =' , unicode(self.sbu)
        if self.trs != None:           # preanalysis of number expressions
            self.trs.rewriteNumber(s)

#       print '_lookUp@1 buffer=' , self.sbu.buffer
#       print 'macro expansion s[0]=' , s[0]
        self.sbu.expand()              # apply macro substitutions
#       print 'macro expanded  s[0]=' , s[0]
#       print '_lookUp@2 buffer=' , self.sbu.buffer

        s = self.sbu.buffer

#       print 'expanded len=' , len(s)
        if len(s) == 0: return True    # macros can empty out buffer

        k = self.sbu.findBreak()       # find extent of first component for lookup
        if k == 0:
            k = 1                      # must have at least one char in token

#       print 'break at k=' , k
        kl = len(s)
        if  k + 1 < kl and s[k] == '+' and s[k+1] == ' ':
            k += 1                     # recognize possible prefix

#       print 'len(s)=' , kl , 'k=' , k , 's=', s

#       print '_lookUp@3 buffer=' , self.sbu.buffer
        mr = self._scanText(k)         # text matching in various ways
        mx  = mr[0]                    # overall maximum match length
        chs = mr[1]                    # any vocabulary element matched
        suf = mr[2]                    # any suffix removed in matching
#       print '_lookUp@4 buffer=' , self.sbu.buffer
        s = self.sbu.buffer
#       print 'k=' , k
#       print 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf
#       print 'len(s)=' , len(s) , 's=' , s

        if ( k < mx or
             k == mx and suf != '' ):  # next word cannot produce token as long as already seen?

#           print 'queue:' , len(self.ptr.queue)
#           print 'chs=' , chs
            if len(chs) > 0:           # any vocabulary matches?
#               print 'put back' , suf , mx , s
                self.sbu.skip(mx)      # if so, they supersede
                if suf != '':          # handle any suffix removal
                    self.sbu.prepend(list(suf))
#                   print 'suf=' , suf
            else:
                chs = self.sbu.extract(mx)
#               print 'extracted chs=' , chs
#           print 'token chs=' , chs
            to = ellyToken.EllyToken(chs)
#           print 'long token=' , unicode(to)
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True     # must note suffix removal for token!
#           print 'only queue:' , len(self.ptr.queue)
            return True

#       print 'mx=' , mx
#       print 'plus queue:' , len(self.ptr.queue)
        wsk = self.sbu.buffer[:k]
        cap = ellyChar.isUpperCaseLetter(wsk[0])
#       print 'wsk=' , wsk
        rws = u''.join(wsk)
        found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
        if not found:
#           print 'not found, k=' , k
            if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
                k -= 1
                rws = rws[:-1]
                found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
#       print 'found in dictionary=' , found
        if found or mx > 0:            # match found in dictionary or by text scan
            if not found:
                k = mx                 # if by text scan, must make token longer
                rws = rws[:k]          # if mx > k
            self.sbu.skip(k)
#           print 'next=' , self.sbu.buffer[self.sbu.index:]
#           print 'queue after =' , len(self.ptr.queue)
            to = ellyToken.EllyToken(rws[:k])
            if len(suf) > 1:           # change token to show suffix properly
#               print 'suf=' , suf
                cs = suf[1]            # first char in suffix after '-'
                rt = to.root           # this is a list!
                lk = -1                # start at last char in token
                while rt[lk] != cs: lk -= 1
                sn = len(rt) + lk      # where to divide suffix from root
#               print 'sn=' , sn , rt
                to.root = rt[:sn]      # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            else:                      # no suffix
                chx = self.sbu.peek()  # look at next char after match
                if chx == '-':         # if hyphen, need to separate it
                    self.sbu.skip()
                    if ellyChar.isLetter(self.sbu.peek()):
                        self.sbu.prepend(' ')
                    self.sbu.prepend('-')
#           print 'add' , unicode(to)
            self.ctx.addTokenToListing(to)  # add token to listing for sentence
            return True

#       print '[' + rws + ']' , 'still unrecognized'

        chx = rws[0]                   # special hyphen check
        if chx == '-' and k > 1:
#           print 'look in  internal dictionary'
            if self.ptr.createPhrasesFromDictionary(chx,False,False):
#               print 'found!'
                to = ellyToken.EllyToken(chx)  # treat hyphen as token
                self.ctx.addTokenToListing(to) # add it to token list
                self.sbu.skip()                # remove from input
                return True

        to = self._extractToken(mx)    # single-word matching with analysis and lookup

#       print 'extracted to=' , unicode(to)
        if to == None:                 # if no match, we are done and will return
#           print 'mx=' , mx
            return False if mx == 0 else True  # still success if _scanText() found something
        if self.ptr.lastph != None:
            self.ptr.lastph.lens = to.getLength()

#       print 'to=' , unicode(to) , 'len(s)=' , len(s) , s
#       posn = self.ctx.countTokensInListing()
#       print 'at', posn , 'in token list'
        self.ctx.addTokenToListing(to) # add token to listing for sentence
#       tol = self.ctx.getNthTokenInListing(-1)
#       print 'last token root=' , tol.root
        return True                    # successful lookup
Example #43
0
    def match ( self , token ):
        """
        compare an Elly token against a tree
        and possibly modify that token after a match
        arguments:
            self  -
            token - input Elly token
        returns:
            True on match, False otherwise
        """

#       print token

        rec = True      # recursion flag
        suc = False     # success   flag

        while rec:      # continue comparisons recursively while flag is True

#           print 'token root=' , token.root
            if len(token.root) < 3: break               # stop if token root is too short

            dlt = 0 if ellyChar.isLetter(token.root[0]) else -2

            seq = self.sequence(token.root) + [ Bound ] # token sequence plus sentinel

#           print type(self).__name__ ,'seq=' , seq

            chs = seq[0]         # first char in sequence to match
            if not chs in self.indx:
                return suc

            nod = self.indx[chs] # starting node in tree
#           print 'start nod.id=' , nod.id , '/' , (Node.Ni - 1)
            lvl = 0              # level in tree
            mst = [ ]            # match stack
            lmt = len(seq) + dlt # sequence length = maximum possible match
#           print 'lmt=' , lmt , 'seq=' , seq

            while True:
#               print 'nod=' , nod
                if nod.actns != None:          # at node with action?
                    mst.append([ nod , lvl ])  # if so, save it on stack
                lvl += 1
#               print 'lvl=' , lvl
                if lvl == lmt: break           # continue comparing to end of token
                ch = seq[lvl]
#               print 'ch=' , ch , 'contn=' , nod.contn.keys()
                if not ch in nod.contn: break  # quit on mismatch
                nod = nod.contn[ch]            # go down to next node in tree

#           for mr in mst:
#               print ' |' , mr[0].id , mr[1]

            while len(mst) > 0:                # match stack empty?

                mr = mst.pop()                 # if not, get most recent match
                nod = mr[0]
                nom = mr[1] + 1

                uch = seq[nom] if nom < lmt else u'_' # first unmatched char

                con = nod.condn                # node condition for accepting  match

                nln = lmt - nom
                nln += nod.delta(nln)          # how chars expected after action
#               print 'lmt=' , lmt , 'nom=' , nom
#               print 'check rule=' , nod.id
#               print 'con=' , con , 'nln=' , nln
#               print 'uch=' , uch

                if con != 0 and nln < 3:       # this must leave at least 2 letters
                    continue                   #   plus sentinel!

#               print 'condition'
                if con == 0:                   # accept match with no action?
#                   print '0 condition'
                    return suc                 # if so, done

                elif con == 1:                 # unconditionally accept?
                    break                      # if so, act on this match

                elif con == 2:                 # first unmatched is consonant?
                    if uch != '|' and not ellyChar.isVowel(uch):
                        break                  # if so, act on match

                elif con == 3:                 # first unmatched is consonant or U?
                    if not ellyChar.isStrictVowel(uch)  :
                        break                  # if so, act on match

            else: ## if loop NOT terminated by break, no acceptable match
#               print 'suc=' , suc , '@'
                return suc                     # we are done

            suc = True                         # note acceptable match

#           print 'nod.id=' , nod.id , '/' , (Node.Ni - 1)

            #
            # take action for longest accepted match
            #

#           print '1 token=' , token
            self.rewrite(token,nom,nod)        # take action for node
            rec = nod.actns.recur              # update recursion flag

#           print '2 token=' , token
#           print 'rec=' , rec

#       print 'suc=' , suc
        return suc
Example #44
0
    def simpleDeinflection(self, ss, ssp, ssl, mr):
        """
        handle matching of certain forms of English inflectional endings
        (override this method appropriately for other languages)

        arguments:
            self -
            ss   - input string of chars to scan for match
            ssp  - current position in input string
            ssl  - limit of matching in input
            mr   - list of chars to look for next in input

        returns:
            inflection char count >= 0 on match, -1 otherwise
        """

        #       print ( 'simpleDeinflection' , 'ssp=' , ssp , 'ssl=' , ssl )
        self.endg = ''  # null inflection by default
        if len(mr) == 0 and ssp == ssl:
            return 0
        if ssp < 2 or ss[ssp - 2] == ' ':
            return -1
        ts = ss[ssp:]  # where to look for inflection
        mc = ss[ssp - 1]  # last char matched
        lm = len(mr)
        #       print ( ts , 'mc=' , mc , 'mr=' , mr )
        if not ellyChar.isLetter(mc):
            return -1
        dss = ssl - ssp  # count up extra input chars
        #       print ( 'dss=' , dss )
        if dss == 0:  # no more chars in input
            if lm == 0:  # check for exact match
                return 0
        elif dss == 1:  # just one char left in input
            if lm != 0:  # make sure all of pattern matched
                return -1
            elif ts[0] in APOs:
                if mc == 's':  # case of S'
                    self.endg = "-'s"
                    return 1
                else:
                    return 0
            elif ts[0].lower() == 's':
                self.endg = '-s'  # assume extra input S is for plural
                return 1
            elif mc == 'e' and ts[0].lower() == 'd':
                self.endg = '-ed'  # an E was last matched char
                return 1
            elif ts[0] == '.':
                return 0  # but no inflection
        elif dss == 2:  # 2 extra chars
            #           print ( 'ts=' , ts )
            if lm == 0:
                if ts[0].lower() == 'e':
                    if ts[1].lower() == 'd':
                        self.endg = '-ed'  # E and D must be inflection
                        return 2
                    elif ts[1].lower() == 's':
                        self.endg = '-s'  # assume E is extra
                        return 2
                elif ts[0] in APOs and ts[1].lower() == 's':
                    #                   print ( "ending -'s" )
                    ss[ssp] = "'"  # normalization just in case
                    self.endg = "-'s"
                    return 2
                elif ts[1] in APOs and ts[0].lower() == 's':
                    #                   print ( "endings -s and -'s" )
                    ss[ssp] = "'"  # reverse letters in next input
                    ss[ssp + 1] = "s"  #
                    self.endg = '-s'
                    return 0
        elif dss == 3:  # 3 extra chars
            #           print ( 'ts=' , ts , 'mr=' , mr )
            if ts[0].lower() == 'i':
                if ts[1].lower() == 'e':
                    if lm == 1 and mr[0] == 'y':
                        if ts[2].lower() == 's':
                            self.endg = '-s'
                            return 3
                        elif ts[2].lower() == 'd':
                            self.endg = '-ed'
                            return 3
                elif ts[1].lower() == 'n' and ts[2].lower() == 'g':
                    if lm == 0 or lm == 1 and mr[0] == 'e':
                        self.endg = '-ing'
                        return 3
            if lm == 0 and ts[0].lower() == mc and ts[1].lower() == 'e' and ts[2].lower() == 'd':
                self.endg = '-ed'
                return 3
        elif dss == 4:  # 4 extra chars
            if lm == 0 and ts[0].lower() == mc and ts[1] == 'i' and ts[2].lower() == 'n' and ts[3].lower() == 'g':
                self.endg = '-ing'
                return 4
            if ts[0].lower() == 'y' and ts[1].lower() == 'i' and ts[2].lower() == 'n' and ts[3].lower() == 'g':
                if lm == 2 and mr[0] == 'i' and mr[1] == 'e':
                    self.endg = '-ing'
                    return 4

        return -1  # extra chars not inflection
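
A much simpler, self-contained version of the same decision: given the chars left over after a dictionary match, classify them as one of a few English inflections or reject them. classify_tail is invented for illustration and covers none of the spelling-change cases (-ies, doubled consonants, apostrophes) handled above.

def classify_tail(tail):
    """Return an inflection tag for the unmatched tail, or None."""
    t = tail.lower()
    if t in ('', '.'):
        return ''                      # exact match, no inflection
    if t in ('s', 'es'):
        return '-s'
    if t in ('d', 'ed'):
        return '-ed'
    if t == 'ing':
        return '-ing'
    return None                        # leftover chars are not an inflection

for root, tail in [ ('cat', 's'), ('bake', 'd'), ('walk', 'ing'), ('ca', 'rd') ]:
    print(root, tail, '->', classify_tail(tail))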
Example #45
0
def scan ( buffr ):

    """
    recognize personal names in text at current position

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    def doLook ( mth , itm ):

        """
        do lookup with specified method using
        global variables in Python 2.7.*

        arguments:
            mth  - name table method
            itm  - string to look up
        """

        global _typ , _nch            # really need nonlocal
        _typ = mth(itm)
        if _typ < 0 and len(itm) > 3: # if no match, check for final '.'
            if itm[-1] == '.':
                _typ = mth(itm[:-1])
                if _typ >= 0:
                    _nch -= 1         # match without '.'

    global _typ , _nch
    global _toscan

#   print 'table=' , _table
    bln = len(buffr)
    if _table == None or bln < 2: return 0
    if _toscan > 0:
        if bln > _toscan:
            return 0
        else:
            _toscan = 0

    chx = buffr[0]
#   print 'scan chx=' , chx
    if not ellyChar.isLetter(chx) and chx != '(' and chx != '"': return 0

    cmps = [ ]                                 # name components this time
    ncmp = 0                                   # number of components for current name
    ninf = 0                                   # number inferred
    ntyp = len(nameTable.TYP)
    stat = [False]*ntyp                        # define state for getting personal name
    mlen = 0                                   # last match length

    bix = 0                                    # buffer index to advance in scanning
    _typ = -1
    while bix < bln:
        ltyp = -1                              # last match type
        _nch = _limit(buffr[bix:],mlen)        # length of next possible name component
#       print 'top _nch=' , _nch
        if _nch == 0: return 0
        elm = _extract(buffr[bix:],_nch)       # get possible component as string
        sch = buffr[bix]
        enclosed = (sch == '(' or sch == '"')  # type of next element
        doLook(_table.lookUp,elm)              # look it up in saved name table
#       print 'lookUp(' , elm , ')=' , _typ

        if _typ < 0:
            if _typ == nameTable.REJ:
                return 0                       # immediate rejection of any match
            if _typ == nameTable.STP:
                break                          # stop any more matching
            if elm[-1] == '.':                 # drop any trailing '.'
                elm = elm[:-1]
                if not enclosed:
                    _nch -= 1
            if enclosed:                       # enclosed element assumed to be name
                if not elm in _cntxt:
                    _cntxt.append(elm)         # make sure always to save in local context
                    ninf += 1                  # this is inferred!
            if elm in _cntxt:
                _typ = nameTable.XNM           # neutral name type to be noncommital

        if _typ < 0:
            tok = buffr[bix:bix + _nch]        # unknown token to check
#           print 'call infer with tok=' , tok
            if infer(tok):
#               print 'digraph test passed'
                _typ = nameTable.XNM           # neutral name type inferred
                if not _table.checkPhonetic(tok):
                    ninf += 1                  # count inferred component if no phonetic support
#           print '_typ=' , _typ

        if nameTable.starts(_typ) and bix > 0: # if component not at start of name,
            break                              #     must stop name scan

#       print 'continuing bix=' , bix
        while _typ >= 0:                       # continue as long as match is viable
            ncmp += 1                          # count up component
            cmps.append(elm)                   # save component
            bix += _nch                        # move ahead in scan
#           print 'bix=' , bix
            if _typ > 0:
#               print '_typ=' , _typ
                if stat[_typ]:                 # check for duplication of component type
                    if (ltyp >= 0 and
                        ltyp != _typ):         # allowed only if duplicate is consecutive
                        break
                mlen = bix                     # save index on actual match
                ltyp = _typ

            if nameTable.ends(_typ):           # if component marks end of name,
                break                          #    must stop name scan

            stat[_typ] = True                  # update match state
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                       # skip any space to start of next component

            _nch = _limit(buffr[bix:],mlen)    # length of next possible name component
            if _nch == 0: break
            elm = _extract(buffr[bix:],_nch)   # get possible next component as string
            doLook(_table.lookUpMore,elm)      # look it up in saved name table
#           print 'lookUpMore(' , elm , ')=' , _typ

        if _typ < 0:                           # while-loop terminated without break
#           print 'ltyp=' , ltyp , 'mlen=' , mlen
            if ltyp < 0 or mlen == 0: break
            bix = mlen                         # restart at end of last match
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                       # skip any space to start of next component
            continue

        break

#
#
#### additional constraints on acceptable personal name
#
#   print 'checking ltyp=' , ltyp
    if (ltyp == nameTable.CNJ or
        ltyp == nameTable.REL):                # a name cannot end with these types
        mlen -= _nch                           # have to drop them from any match
        if mlen == 0: return 0
        if ellyChar.isWhiteSpace(buffr[mlen-1]):
            mlen -= 1
        ncmp -= 1
        cmps.pop()

#   print 'ncmp=' , ncmp

    if ncmp == 0:                              # nothing matched?
        _planAhead(buffr)                      # check for possible problems in next scan
        return 0

#   print 'cmps=' , cmps
    if ncmp == ninf:
        return 0                               # name cannot be purely inferred

#   print 'ncmp=' , ncmp
    if ncmp == 1:                              # single-component name must be known or contextual
        if (not stat[nameTable.SNG] and
            not cmps[0] in _cntxt):
            return 0

#   print 'stat=' , stat[3:7]
    expl = (stat[nameTable.PNM] or             # name must have a substantial component
            stat[nameTable.SNM] or
            stat[nameTable.XNM] or
            stat[nameTable.SNG])

#   print 'expl=' , expl
    if (not expl and
        not (stat[nameTable.TTL] and           # or it could have just a title
             stat[nameTable.INI])):            #    and an initial
        return 0
#
####

#   print 'accepted mlen=' , mlen
    for cmpo in cmps:                          # if whole name is OK,
        if not cmpo in _cntxt:                 #    remember all components
            _cntxt.append(cmpo)                #    not already listed in context

    return mlen                                # will be > 0 on successful match
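
Reduced to its core, the scan loop classifies successive components against a name table and accepts only when the result includes something substantial, or at least a title plus an initial. The TABLE, type codes, and scan_name below are invented for illustration; pyelly's nameTable, context tracking, and phonetic inference are far richer.

TABLE = { 'dr.': 'TTL', 'mr.': 'TTL', 'john': 'PNM', 'smith': 'SNM' }

def scan_name(words):
    """Return how many leading words look like a personal name, else 0."""
    count, seen = 0, set()
    for w in words:
        typ = TABLE.get(w.lower())
        if typ is None:
            if len(w) == 2 and w[1] == '.':
                typ = 'INI'            # treat "J."-style tokens as initials
            else:
                break                  # unknown word ends the scan
        seen.add(typ)
        count += 1
    ok = seen & { 'PNM', 'SNM' } or seen >= { 'TTL', 'INI' }
    return count if ok else 0

print(scan_name(['Dr.', 'John', 'Smith', 'went']))   # 3
print(scan_name(['Dr.', 'J.', 'and']))               # 2
print(scan_name(['Dr.', 'who']))                     # 0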
Example #46
0
    def _matchAN ( self , ts ):

        """
        apply logic for alphanumeric date recognition

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched
        """

#       print 'ALPHANUMERIC'

        t = ts
        tl = len(ts)
        k = self._aMonth(t)            # look for month to start date string
        comma = False
#       print 'month len=' , k
        if k > 0:
            if k == tl: return 0
            if not ellyChar.isWhiteSpace(t[k]): return 0
            k += 1                     # skip space after month
            if k == tl: return 0
            t = t[k:]
            k = self._aDay(t)          # look for day of month
#           print 'day len=' , k
            if k == 0:
                self._dy = [ ]
                k = self._aYear(t)     # look for year immediately following
                if k > 0:
                    return tl - len(t) + k
                else:
                    return 0
#           print 'ts=' , ts
            tl = len(t)                # _aDay may have rewritten alphabetic day
            t = t[k:]
            if len(t) == 0:
#               print 'no year tl=' , tl , 'k=' , k , t
                return len(ts) - tl + k
            if t[0] == u',':           # look for comma after day
                t = t[1:]             # if found, remove and note
                comma = True
            if len(t) == 0: return tl
            if ellyChar.isWhiteSpace(t[0]): t = t[1:]
            if len(t) == 0: return tl
            k = self._aYear(t)         # look for year
#           print 'year len=' , k
            lnt = len(t)
            if comma and k < lnt and t[k] == ',':
                k += 1                 # remove comma after year if paired
#           print 'len(ts)=' , len(ts) , 'len(t)=' , len(t) , t
            return len(ts) - len(t) + k

        k = self._aDay(t)              # look for day of month to start date string
#       print 'start day len=' , k
        if k == 0:
            self._dy = [ ]
        elif k > 0 and k < tl:         # cannot be just bare number by itself
            tl = len(ts)               # _aDay may have rewritten alphabetic day
            t = t[k:]
#           print 'new t=' , t
            if (k > 2 and len(t) > 2 and
                t[0] == u' ' and
                t[1].upper() == 'O' and
                t[2].upper() == 'F'):
                t = t[3:]              # to handle day reference like '4th of'
            if len(t) == 0: return 0
            if not ellyChar.isWhiteSpace(t[0]): return 0
            t = t[1:]
            k = self._aMonth(t)        # look for month
            if k == 0: return 0
            t = t[k:]
            if len(t) == 0: return tl
            ntl = tl - len(t)
#           print 'ntl=' , ntl
            nd = 0
            if t[0] == u',':           # look for comma after month
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
                comma = True
            if ellyChar.isWhiteSpace(t[0]):
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
            k = self._aYear(t)         # look for year
            if k > 0:
                if comma and k < len(t) and t[k] == ',': k += 1
                return ntl + k + nd    # full date found
            else:
                return ntl - nd        # only month and day of date found

#       print 'look for year only in' , t
        k = self._aYear(t)
        if k > 0:
            if k == tl:
                return k
            elif not ellyChar.isLetter(t[k]) and t[k] != '-':
                return k

        return 0                       # nothing found
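
For comparison, the same recognition task can be approximated with a regular expression that matches a "Month Day, Year" or "Day Month Year" prefix and reports how many chars it covers. This is a stand-in technique, not the char-by-char logic above, and the pattern is only a rough sketch.

import re

MONTH = r'(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?'
MDY = MONTH + r'\s+\d{1,2}(?:st|nd|rd|th)?(?:\s*,\s*\d{4})?'               # May 4, 2019
DMY = r'\d{1,2}(?:st|nd|rd|th)?\s+(?:of\s+)?' + MONTH + r'(?:\s+\d{4})?'   # 4th of May 2019
DATE = re.compile('(?:%s|%s)' % (MDY, DMY), re.IGNORECASE)

def match_date(text):
    """Return the number of leading chars recognized as a date, else 0."""
    m = DATE.match(text)
    return m.end() if m else 0

print(match_date('May 4, 2019 was a Saturday'))    # 11
print(match_date('4th of May 2019, a Saturday'))   # 15
print(match_date('maybe later'))                   # 0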
Example #47
0
    def __init__(self, inpr):
        """
        define table from text input

        arguments:
            self  -
            inpr  - EllyDefinitionReader

        throws:
            TableFailure on table definition failure
        """

        self.pres = {}
        self.posts = {}
        self.dictn = {}
        self.phone = []
        self.compn = ''
        self._nerr = 0

        #       print 'TYP=' , TYP

        while True:
            lin = inpr.readline().lower()  # ignore capitalization
            if len(lin) == 0: break

            if lin[0] == '=':  # phonetic entry?
                lin = lin[1:]  # if so, remove marker
                first = ''
                if lin[0] == 'a':  # vowel is first?
                    first = 'a'  # if so, remove it
                    lin = lin[1:]
                pho = first + lin.upper()  # combine any vowel with uppercase rest
                self.phone.append(pho)  # save in phonetic list
                continue

            lins = lin.strip().split(':')

            if len(lins) != 2:  # type definition must have two parts
                self._err(lne=lin)
                continue

            typ = lins[1].strip()  # get component type
            if not typ in TYP:
                self._err('bad name component type', lin)
                continue
            cod = TYP[typ]

            els = lins[0].strip().split(' ')  # name component

            #           print 'type=' , '"' + typ + '"' , els

            lim = len(els)

            if lim == 1:
                cmpo = els[0]
                chf = cmpo[0]  # first char of component
                chl = cmpo[-1]  # last  char
                if chf == '-' or chf == '+':
                    if not ellyChar.isLetter(chl) or len(cmpo) < 3:
                        self._err('bad end of name', lin)
                        continue
                    dky = cmpo[-2:]  # dictionary key is 2 chars only
                    if not dky in self.posts:
                        self.posts[dky] = []
                    self.posts[dky].append([cmpo[1:], cod, (chf == '+')])
                elif chl == '-' or chl == '+':
                    if not ellyChar.isLetter(chf) or len(cmpo) < 3:
                        self._err('bad start of name', lin)
                        continue
                    dky = cmpo[:2]  # dictionary key is 2 chars only
                    if not dky in self.pres:
                        self.pres[dky] = []
                    self.pres[dky].append([cmpo[:-1], cod, (chl == '+')])
                else:
                    self.dictn[cmpo] = cod
                    if cmpo[-1] == '.':  # if ending with '.' , also save without
                        self.dictn[cmpo[:-1]] = cod
                continue

            Nix = 1
            while Nix <= lim:  # process elements of name component
                cmpo = ' '.join(els[0:Nix])
                Nix += 1
                if cmpo not in self.dictn:  # first Nix elements
                    self.dictn[cmpo] = CND
            if self.dictn[cmpo] != CND:
                self._err('name component redefined', lin)
                continue
            self.dictn[cmpo] = TYP[typ]  # put into table

        if self._nerr > 0:
            print >> sys.stderr, '**', self._nerr, 'name errors in all'
            print >> sys.stderr, 'name table definition FAILed'
            raise ellyException.TableFailure
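
To show the table format this constructor reads, here is a minimal sketch that parses the same 'component : type' lines from a plain list of strings instead of an EllyDefinitionReader; TOY_TYP and load_name_table are invented placeholders, and the '+'/'-' prefix and suffix bookkeeping of the real class is omitted.

TOY_TYP = {'title': 0, 'first': 1, 'last': 2}   # placeholder type codes

def load_name_table(lines):
    dictn, phone = {}, []
    for lin in (l.lower() for l in lines):      # ignore capitalization, as above
        if not lin:
            continue
        if lin[0] == '=':                       # phonetic entry
            phone.append(lin[1:].upper())
            continue
        parts = lin.strip().split(':')
        if len(parts) != 2:                     # must have two parts
            continue
        cmpo, typ = parts[0].strip(), parts[1].strip()
        if typ in TOY_TYP:
            dictn[cmpo] = TOY_TYP[typ]
    return dictn, phone

# d, p = load_name_table(['Dr. : title', 'smith : last', '=SMYTH'])
# d -> {'dr.': 0, 'smith': 2} , p -> ['SMYTH']
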
Example #48
0
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  +
                     ',ns=' + unicode(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw)
#       print "_span: txt @",offs,"pat @",mp,"nsp=",nsp
#       print "text to span:",text[offs:]
#       print "pat rest=" , patn[mp:]
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print "exclude=",k,"chars from possible span for rest of pattern"

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print mx,"chars available to scan"
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print 'span c=' , c
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print 'starting match, limt=',limt,text[offs:limt],":",patn
#   print 'nsps=' , nsps

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print '---- loop mp=' , mp , 'ml=' , ml
        while mp < ml:
            if offs >= limt:
#               print "offs=",offs,"limt=",limt
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print 'patn=' , patn
            mc = patn[mp]
#           print 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs
#           print 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')'
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print 'hyphen special matching, limt=', limt , 'offs=' , offs
#                       print 'text[offs:]=' , text[offs:]
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print 'no special matching of hyphen'
                        break

#           print 'matched @mp=' , mp
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat @',mp,"<",ml
#       print "txt @",offs,'<',limt,'last=',last
#       print '@',offs,text[offs:]

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",'{:04x}'.format(ord(tc)),deconvert(tc)

        if tc == cALL:      # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print "offs=",offs,'nm=',nm
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print "ANY:",last,offs
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print 'at cCAN'
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print "UPR:",last,'@',offs
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print "LWR:",last,'@',offs
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:","["+last+"]"
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print 'NO space'

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print 'spanning wildcard, offs=' , offs , 'last=(' + last + ')'
            if last != '':               # still more to match?
                offs -= 1
#               print 'nsps=' , nsps
#               print '@' , offs , text
                nm = _span(tc,nsps)      # maximum match possible

#               print 'spanning=' , nm
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print 'offs=' , offs
                    last = text[offs] if offs < limt else ''
                    continue
#           print 'fail tc=' , deconvert(tc)

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print "fail - unwinding" , unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted
#       print 'cnt=' , uf.count , 'off=' , offs

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating consecutive bindings"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    return mbd             # consolidated bindings plus new offset
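
The last stage of match() merges bindings whose character ranges are contiguous so that later substitutions see one span instead of several. Below is a self-contained sketch of just that merge step; consolidate is an illustrative name, and the special tagged and optional-match records that the full routine also handles are left out.

def consolidate(bindings):
    """merge contiguous [start, end] pairs, e.g. [[2,4],[4,7],[9,10]] -> [[2,7],[9,10]]"""
    merged = []
    for b in bindings:
        if merged and merged[-1][1] == b[0]:   # continuous with previous binding?
            merged[-1][1] = b[1]               # if so, extend the previous span
        else:
            merged.append(list(b))             # otherwise start a new span
    return merged

# print(consolidate([[2, 4], [4, 7], [9, 10]]))  # -> [[2, 7], [9, 10]]
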
Example #49
0
    def match ( self , txt , pnc , ctx ):

        """
        compare a punctuation mark and its context with a pattern

        arguments:
            self  -
            txt   - list of text chars leading up to punctuation char
            pnc   - punctuation char
            ctx   - next chars after punctuation

        returns:
            True on match, False otherwise
        """

#       print 'matching for txt=' , txt , 'pnc= [' , pnc , ' ] ctx=' , ctx

        if matchtoo(txt,pnc,ctx):     # exception by complex match?
            return True
#       print 'matchtoo() returned False'

        sep = ctx[0] if len(ctx) > 0 else ''
        if sep == ellyChar.THS:
            return True
        nxt = ctx[1] if len(ctx) > 1 else ''

#       print 'lstg=' , self.lstg.keys()
        if not pnc in self.lstg:     # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

#       print len(lp) , 'patterns'

        ltx = len(txt)               # current length of accumulated text so far
        ntr = 1
        while ntr <= ltx:
            if not ellyChar.isLetterOrDigit(txt[-ntr]):
                break
            ntr += 1
        nrg = ntr
        ntr -= 1                     # available trailing chars for  wildcard * match

        while nrg <= ltx:
            c = txt[-nrg]
            if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
#               print 'break at nrg=' , nrg , txt[-nrg]
                break
            nrg += 1
        nrg -= 1                     # end of range for all pattern matching

#       print 'ntr=' , ntr , 'nrg=' , nrg

        txt = txt[-nrg:]             # reset text to limit for matching
        ltx = len(txt)               # its new length

#       print 'txt= ' + unicode(txt) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']'

        for p in lp:                 # try matching each listed exception pattern

            if p.left != None and len(p.left) > 0:

                pat = p.left
                star = pat[-1] == ellyWildcard.cALL
                n = len(pat)         # since each pattern element matches one sequence char
                if star:             # except for a final wildcard *
#                   print 'pattern ending with *'
                    n -= 1
#                   print 'ltx=' , ltx , 'n=' , n
                    if ltx < n:
                        continue     # cannot match pattern properly
                    pat = pat[:-1]
                    t = txt[:n]
                else:
                    if ltx < n:
                        continue     # cannot match pattern properly
                    t = txt[-n:]

                if not ellyWildcard.match(pat,t,0):
#                   print 'no possible pattern match'
                    continue

                k = ltx - n          # extra chars beyond any match
#               print 'k=' , k , 't=' , t
#               print 'txt=' , txt
#               print 'pat=' , '[' + ellyWildcard.deconvert(pat) + ']'
#               print 'matches' , n , 'chars'
                if not star and k > 0:
#                   print 'check text before [' , txt[-n] , ']'
                    if ellyChar.isLetterOrDigit(txt[-n]):
                        c = txt[-n-1]
#                       print 'preceding= [', c , ']'
                        if ellyChar.isLetterOrDigit(c) or c == '&':
                            continue # because break in text is required

#           print 'pat=' , ellyWildcard.deconvert(p.left)
#           print 'n=' , n , 'ltx=' , ltx
#           print 'txt=' , txt

#           nc = '\\n' if nxt == '\n' else nxt
#           print 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']'
#           print 'versus c=' , nc

            rp = p.right
            if rp == [] or rp[0] == ellyWildcard.cALL:
                return True
            pcx = rp[0]
            if pcx == nxt:                     # check for specific char after possible stop
#               print 'right=' , nxt
                return True
            elif pcx == ellyWildcard.cALF:     # check for alphabetic
                if ellyChar.isLetter(nxt):
#                   print 'right is alphabetic=' , nxt
                    return True
            elif pcx == ellyWildcard.cDIG:     # check for numeric
                if ellyChar.isDigit(nxt):
#                   print 'right is numeric=' , nxt
                    return True
            elif pcx == ellyWildcard.cUPR:     # check for upper case
                if ellyChar.isUpperCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cLWR:     # check for lower case
                if ellyChar.isLowerCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cCAN:     # check for non-alphanumeric
                if ellyChar.isLetter(nxt):
#                   print 'right is alphabetic=' , nxt
                    return True

#       print "no matches"
        return False
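
For comparison, here is a toy version of the same kind of stop-exception test: given the text before a period and the character after it, it reports True (meaning do not break the sentence) when the period ends a known abbreviation and what follows is not an uppercase letter. The ABBREVIATIONS set and is_stop_exception are illustrative only; the class above matches compiled wildcard patterns instead of a fixed word list.

ABBREVIATIONS = {'dr', 'mr', 'mrs', 'ms', 'etc', 'e.g', 'i.e'}

def is_stop_exception(txt, pnc, nxt):
    """True = treat punctuation as an exception, i.e. not a sentence stop"""
    if pnc != '.':
        return False
    word = ''.join(txt).rsplit(' ', 1)[-1].lower()   # last word before the period
    return word in ABBREVIATIONS and not nxt.isupper()

# print(is_stop_exception(list('see Dr'), '.', ' '))   # -> True  (no break)
# print(is_stop_exception(list('the end'), '.', 'T'))  # -> False (real stop)
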
Example #50
0
    def _matchAN ( self , ts ):

        """
        apply logic for alphanumeric date recognition

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched
        """

#       print 'ALPHANUMERIC'

        t = ts
        tl = len(ts)
        k = self._aMonth(t)            # look for month to start date string
        if k > 0:
            if k == tl: return 0
            if not ellyChar.isWhiteSpace(t[k]): return 0
            k += 1                     # skip space after month
            if k == tl: return 0
            t = t[k:]
            k = self._aDay(t)          # look for day of month
            if k == 0: return 0
            tl = len(ts)               # _aDay may have rewritten alphabetic day
            t = t[k:]
            if len(t) == 0: return 0
            if t[0] == u',': t = t[1:] # look for comma after day
            if len(t) == 0: return tl
            if ellyChar.isWhiteSpace(t[0]): t = t[1:]
            if len(t) == 0: return tl
            k = self._aYear(t)         # look for year
            return tl - len(t) + k

        k = self._aDay(t)              # look for day of month to start date string
        if k > 0 and k < tl:           # cannot be just bare number by itself
            tl = len(ts)               # _aDay may have rewritten alphabetic day
            t = t[k:]
#           print 'new t=' , t
            if (k > 2 and len(t) > 2 and
                t[0] == u' ' and
                t[1].upper() == 'O' and
                t[2].upper() == 'F'):
                t = t[3:]              # to handle day reference like '4th of'
            if len(t) == 0: return 0
            if not ellyChar.isWhiteSpace(t[0]): return 0
            t = t[1:]
            k = self._aMonth(t)        # look for month
            if k == 0: return 0
            t = t[k:]
            if len(t) == 0: return tl
            ntl = tl - len(t)
#           print 'ntl=' , ntl
            nd = 0
            if t[0] == u',':           # look for comma after month
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
            if ellyChar.isWhiteSpace(t[0]):
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
            k = self._aYear(t)         # look for year
            if k > 0:
                return ntl + k + nd    # full date found
            else:
                return ntl - nd        # only month and day of date found

#       print 'look for year only in' , t
        k = self._aYear(t)
        if k > 0:
            if k == tl:
                return k
            elif not ellyChar.isLetter(t[k]) and t[k] != '-':
                return k

        return 0                       # nothing found
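
As a companion to the day-first branch above (forms like '4 July 1776' or '4th of July 1776'), here is a regular-expression approximation that returns the number of leading characters recognized as a date, or 0; match_day_first and its pattern are stand-ins for the _aDay/_aMonth/_aYear helpers, not the library's own logic.

import re

_DAY_FIRST = re.compile(
    r'\d{1,2}(st|nd|rd|th)?(\s+of)?\s+'
    r'(January|February|March|April|May|June|July|August|September|October|'
    r'November|December)(\s+\d{4})?',
    re.IGNORECASE)

def match_day_first(ts):
    """return count of leading chars of char list ts recognized as a day-first date, else 0"""
    m = _DAY_FIRST.match(''.join(ts))
    return m.end() if m else 0

# print(match_day_first(list('4th of July 1776, a Thursday')))  # -> 16
# print(match_day_first(list('40 acres and a mule')))           # -> 0
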
Example #51
0
    def _matchAN(self, ts):
        """
        apply logic for alphanumeric date recognition

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched
        """

        #       print 'ALPHANUMERIC'

        t = ts
        tl = len(ts)
        k = self._aMonth(t)  # look for month to start date string
        comma = False
        #       print 'month len=' , k
        if k > 0:
            if k == tl: return 0
            if not ellyChar.isWhiteSpace(t[k]): return 0
            k += 1  # skip space after month
            if k == tl: return 0
            t = t[k:]
            k = self._aDay(t)  # look for day of month
            #           print 'day len=' , k
            if k == 0:
                self._dy = []
                k = self._aYear(t)  # look for year immediately following
                if k > 0:
                    return tl - len(t) + k
                else:
                    return 0
#           print 'ts=' , ts
            tl = len(t)  # _aDay may have rewritten alphabetic day
            t = t[k:]
            if len(t) == 0:
                #               print 'no year tl=' , tl , 'k=' , k , t
                return len(ts) - tl + k
            if t[0] == u',':  # look for comma after day
                t = t[1:]  # if found, remove and note
                comma = True
            if len(t) == 0: return tl
            if ellyChar.isWhiteSpace(t[0]): t = t[1:]
            if len(t) == 0: return tl
            k = self._aYear(t)  # look for year
            #           print 'year len=' , k
            lnt = len(t)
            if comma and k < lnt and t[k] == ',':
                k += 1  # remove comma after year if paired
#           print 'len(ts)=' , len(ts) , 'len(t)=' , len(t) , t
            return len(ts) - len(t) + k

        k = self._aDay(t)  # look for day of month to start date string
        #       print 'start day len=' , k
        if k == 0:
            self._dy = []
        elif k > 0 and k < tl:  # cannot be just bare number by itself
            tl = len(ts)  # _aDay may have rewritten alphabetic day
            t = t[k:]
            #           print 'new t=' , t
            if (k > 2 and len(t) > 2 and t[0] == u' ' and t[1].upper() == 'O'
                    and t[2].upper() == 'F'):
                t = t[3:]  # to handle day reference like '4th of'
            if len(t) == 0: return 0
            if not ellyChar.isWhiteSpace(t[0]): return 0
            t = t[1:]
            k = self._aMonth(t)  # look for month
            if k == 0: return 0
            t = t[k:]
            if len(t) == 0: return tl
            ntl = tl - len(t)
            #           print 'ntl=' , ntl
            nd = 0
            if t[0] == u',':  # look for comma after month
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
                comma = True
            if ellyChar.isWhiteSpace(t[0]):
                t = t[1:]
                if len(t) == 0: return tl
                nd += 1
            k = self._aYear(t)  # look for year
            if k > 0:
                if comma and k < len(t) and t[k] == ',': k += 1
                return ntl + k + nd  # full date found
            else:
                return ntl - nd  # only month and day of date found

#       print 'look for year only in' , t
        k = self._aYear(t)
        if k > 0:
            if k == tl:
                return k
            elif not ellyChar.isLetter(t[k]) and t[k] != '-':
                return k

        return 0  # nothing found
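
Either version returns a character count rather than the date itself, so a caller peels that many characters off the stream. A minimal sketch of that calling pattern follows; extract_date and the inline recognizer are hypothetical, shown only to illustrate how the count is meant to be used.

def extract_date(chars, recognizer):
    """split a char list into (date text, remaining chars) using a count-returning recognizer"""
    n = recognizer(chars)
    if n == 0:
        return None, chars                    # nothing recognized, stream unchanged
    return ''.join(chars[:n]), chars[n:]      # date text, rest of stream

# date, rest = extract_date(list('May 1, 2020 meeting'),
#                           lambda ts: 11 if ''.join(ts).startswith('May 1, 2020') else 0)
# date -> 'May 1, 2020' ; ''.join(rest) -> ' meeting'
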
Example #52
0
    def lookUp ( self , chrs , keyl ):

        """
        look for terms in vocabulary at current text position

        arguments:
            self  -
            chrs  - text char list
            keyl  - number of initial chars to use as DB key

        returns:
            list of tuples [  VocabularyElement , Result ], possibly empty
        """

        res = [ ]                     # result list initially empty
        rln = 0

        if len(chrs) == 0:
            return res                # empty list at this point

#       print 'chrs=' , type(chrs) , type(chrs[0])

        if keyl < 1:
            return res                # still empty list

        strg = toKey(chrs[:keyl])

#       print 'vocab first word=' , list(strg) , type(strg)

        vs = self._getDB(strg)        # look up first word in vocabulary table

        if vs == None or len(vs) == 0:
            return res

#       print len(vs) , 'raw entries found'

        lm = len(chrs)                # total length of text for lookup

        for v in vs:                  # look at possible vocabulary matches

#           print 'entry=' , v

            ln = v.length()           # total possible match length for vocabulary entry

#           print 'rln=' , rln , 'ln=' , ln , 'lm=' , lm

            if ln  > lm:              # must be enough text to match entry
                continue

            k = ln
            while k < lm:
                chrsk = chrs[k]
                if not ellyChar.isLetter(chrsk) and chrsk != '\'': break
                k += 1
#           print 'k=' , k
#           print v.chs , ':' , chrs[:k]
            nm = self.doMatchUp(v.chs,chrs)
            if nm == 0 or nm < rln: continue

#           print 'rln=' , rln , 'ln=' , ln
            if rln < nm:              # longer match than before?
#               print 'new list'
                res = [ ]             # if so, start new result list for longer matches
                rln = nm              # set new minimum match length

#           print 'returning' , v.chs , nm , self.endg
            rs = Result(v,nm,self.endg)   # new result object to be returned
            res.append(rs)            # add to current result list

        return res                    # return surviving matches
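
The loop above enforces a longest-match policy: whenever a longer vocabulary match turns up, the result list is restarted so that only matches of the maximum length survive. A self-contained sketch of that policy over simple (term, length) candidates; keep_longest is an illustrative name, not part of the vocabulary table class.

def keep_longest(candidates):
    """keep only the candidates whose match length equals the maximum seen"""
    best, res = 0, []
    for term, n in candidates:
        if n == 0 or n < best:
            continue                  # shorter than current best, discard
        if n > best:
            best, res = n, []         # longer match found, restart result list
        res.append(term)              # keep ties as well as the new best
    return res

# print(keep_longest([('new', 3), ('new york', 8), ('new york city', 13)]))
# -> ['new york city']
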
Example #53
0
def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # three private functions using local variables of match()
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])  # calculate min char count to match rest of pattern

#       print "exclude=",k,"@",offs

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findBreak(text,offs) - k # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # char type matching a wildcard

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit

#   print text[offs:limt],":",list(patn)

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            else:
                last = text[offs].lower()
                offs += 1
#           print 'matching last=' , last , 'at' , offs
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat',mp,"<",ml
#       print "txt @",offs

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",ord(tc)

        if tc == cALL:   # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1  # get new binding record
            bf[0] = offs              # bind from current offset
            offs += nm                # move offset past end of span
            bf[1] = offs              # bind to   new     offset
#           print "offs=",offs
            uf = _mark(1); unj += 1   # get new unwinding record
            uf.count = nm             # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:"
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1   # dummy record to block
            mf[0] = -1                #   later binding consolidation
            if last != '':
                offs -= 1             # try for rematch
            m = mp                    # find corresponding EOS
            while m < ml:             #
                if patn[m] == cEOS: break
                m += 1
            else:                     # no EOS?
                m -= 1                # if so, pretend there is one anyway
            uf = _mark(0); unj += 1   # for unwinding on any later match failure
            uf.pats = m + 1           # i.e. one char past next EOS
            uf.txts = offs            # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1             # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
            if last != '':            # still more to match?
                offs -= 1
                nm = _span(tc)        # maximum match possible
#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch

#       print "fail - unwinding",unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    return mbd             # consolidated bindings plus new offset
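
Inside _span(), each wildcard code selects a character predicate from the Matching table, and the span is how many consecutive input characters satisfy that predicate. The sketch below mimics that dispatch with plain built-in predicates; the '#', '@' and '~' codes, TOY_MATCHING and toy_span are stand-ins for the encoded wildcard constants and tables of ellyWildcard.

TOY_MATCHING = {
    '#': str.isdigit,                  # digit-spanning wildcard
    '@': str.isalpha,                  # letter-spanning wildcard
    '~': str.isalnum,                  # alphanumeric-spanning wildcard
}

def toy_span(typw, text, offs=0):
    """count consecutive chars from offs matching the wildcard's predicate"""
    tfn = TOY_MATCHING[typw]
    nm = 0
    while offs + nm < len(text) and tfn(text[offs + nm]):
        nm += 1
    return nm

# print(toy_span('#', '1234abc'))   # -> 4
# print(toy_span('@', '1234abc'))   # -> 0
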
Example #54
0
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
#       print ( "text to span:",text[offs:] )
#       print ( "pat rest=" , patn[mp:] )
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print ( "text at",offs,"maximum wildcard match=",mx )

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print ( "maximum wildcard span=",nm )

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs )
#           print ( 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')' )
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break

#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        if tc == cALL:      # a * wildcard?

#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO: # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print ( "EOS" )
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':               # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)      # maximum match possible

#               print ( 'spanning=' , nm )
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:               # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]          # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print ( "no unwinding" )
            break                   # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None     # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd             # consolidated bindings plus new offset
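
# The consolidation pass above merges bindings whose end and start offsets
# touch. The following standalone sketch (illustrative only, not part of
# Elly; names are invented) shows the same merging idea on plain
# [start, end] pairs:

def consolidate(bindings):
    """Merge [start, end] bindings whose boundaries touch."""
    merged = []
    for bd in bindings:
        if merged and merged[-1][1] == bd[0]:
            merged[-1][1] = bd[1]      # extend previous binding
        else:
            merged.append(list(bd))    # start a new binding
    return merged

print(consolidate([[0, 2], [2, 5], [7, 9]]))   # [[0, 5], [7, 9]]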
Example #55
0
    def lookUp(self, chrs, keyl):
        """
        look for terms in vocabulary at current text position

        arguments:
            self  -
            chrs  - text char list
            keyl  - number of initial chars to use as DB key

        returns:
            list of tuples [  VocabularyElement , Result ], possibly empty
        """

        #       print ( 'lookUp: chrs=' , chrs , 'keyl=' , keyl )

        res = []  # result list initially empty
        rln = 0

        if len(chrs) == 0:
            return res  # empty list at this point

#       print ( 'chrs=' , type(chrs) , type(chrs[0]) )

        if keyl < 1:
            return res  # still empty list

        if ellyConfiguration.language == 'ZH':
            strg = toKeyZH(chrs[0])
        else:
            strg = toKey(chrs[:keyl])

#       print ( 'vocab search key=' , list(strg) , type(strg) )
#       print ( '0 endg=' , self.endg )

#       print ( listDBKeys(self.cdb) )

        vs = self._getDB(strg)  # look up first word in vocabulary table
        #       print ( '1 endg=' , self.endg )

        if vs is None or len(vs) == 0:
            return res

#       print ( len(vs) , 'raw entries found' )

        lm = len(chrs)  # total length of text for lookup

        for v in vs:  # look at possible vocabulary matches

            #           print ( 'entry=' , v )

            ln = v.length()  # total possible match length for vocabulary entry

            #           print ( 'rln=' , rln , 'ln=' , ln , 'lm=' , lm )

            if ln > lm:  # entry longer than available text; cannot match
                continue

            k = ln
            while k < lm:
                chrsk = chrs[k]
                if not ellyChar.isLetter(chrsk) and chrsk != '\'': break
                k += 1
#           print ( 'k=' , k )
#           print ( v.chs , ':' , chrs[:k] )
            nm = self.doMatchUp(v.chs, chrs)
            if nm == 0 or nm < rln: continue
            #           print ( '2 endg=' , self.endg )

            #           print ( 'rln=' , rln , 'ln=' , ln )
            if rln < nm:  # longer match than before?
                #               print ( 'new list' )
                res = []  # if so, start new result list for longer matches
                rln = nm  # set new minimum match length


#           print ( 'returning' , v.chs , nm , '<' + self.endg + '>' )
            rs = Result(v, nm, self.endg)  # new result object to be returned
            #           print ( 'rs=' , rs )
            res.append(rs)  # add to current result list

        rem = []
        #       print ( 'len(res)=' , len(res) )
        if len(res) > 1:  # check for special case where term with and
            for re in res:  # without inflection is in vocabulary
                #               print ( 're=' , re )
                if len(re.suffx) == 0:
                    rem.append(re)
        if len(rem) > 0:
            return rem  # if so, keep only full matches
        else:
            return res  # return surviving matches
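
# As a rough illustration of the selection policy in lookUp() above -- keep
# only the longest matches and, when both inflected and uninflected entries
# survive, prefer the uninflected ones -- here is a self-contained sketch
# using invented (term, matched length, suffix) tuples rather than the real
# VocabularyElement and Result objects:

def select_matches(candidates):
    """candidates: list of (term, matched_length, suffix) tuples."""
    if not candidates:
        return []
    best = max(n for _, n, _ in candidates)          # longest match length
    longest = [c for c in candidates if c[1] == best]
    exact = [c for c in longest if c[2] == '']       # no inflectional suffix
    return exact if exact else longest

print(select_matches([('run', 3, ''), ('running', 7, ''), ('run', 7, 'ning')]))
# [('running', 7, '')]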
Example #56
0
    def apply ( self , token , extra=None ):

        """
        apply inflectional stemming logic against token

        arguments:
            self  -
            token - input token
            extra - extra token char for any restoration
        returns:
            status code
        exceptions:
            StemmingError
        """

        last = None             # save last popped letter

        if len(self.table) < 2: # check for empty table
            return isNOTM       # if so, no match
#       print 'at' , self.table[0] , extra
        it = 0                  # stemming logic index
        word = token.root       # list of letters in token word
        m  = len(word)          # end of word
        seq = self.table[it]    # suffix to match
        it += 1                 #
        n  = len(seq)           # ending length to match
        if n >= m:              # 
            return isNOTM       # word not long enough for ending
        msh = m - n

        # check that table is right one for word ending

        ew = m                  # just past end of token word

#       print "suffix length= ", n, ", word length= ", m
        if n > 0:
            for ix in range(n):
                ew -= 1
#               print word[ew], " cmp ", seq[ix]
                if word[ew] != seq[ix]:
                    return isNOTM
            ew -= 1
#           print "first char before suffix=" ,
#           print '[' +  ( word[ew] if ew >= 0 else None ) + ']'
        
        # interpret table logic

        last = seq[-1] if n > 0 else extra
        word = word[:msh]                # copy of word up to removed suffix
#       print 'word=' , word
        if not ellyChar.isLetter(word[-1]):
            return isNOTM

        while True:                      # advance through logic until success or failure

            opcode = self.table[it]      # next operation code to interpret
            it += 1                      #
#           print "opcode=", opcode

            if opcode < 0:               # YE(S) on match with possible modifications

                # word satisfies conditions for ending removal

                word = token.root[:msh]             # word without ending
#               print 'word=', word
#               print 'add or drop extra chars'
                nm = YE - opcode                    # get removal count from opcode
#               print 'nm=', nm
                if nm < 0:                          # any special restoration?
                    if last is None:
                        print('FATAL stemming logic error', file=sys.stderr)
                        sys.stdout.flush()
                        sys.exit(1)
#                   print 'restore' , '[' + last + ']'
                    word.append(last)               # negative count restores last removed letter
                else:
#                   print 'drop' , nm , 'from [' , word , ']'
                    while nm > 0:                   # otherwise drop additional letters
                        if len(word) == 0:
                            print('FATAL stemming logic error', file=sys.stderr)
                            sys.stdout.flush()
                            sys.exit(1)
                        last = word.pop()
                        nm -= 1

#               print 'extend=' , self.table[it]    # append more chars
                word.extend(self.table[it])

                token.root = word        # replace token with stemmed result
#               print 'word=' , word
#               print 'root=' , token.root
                        
                return isMTCH            # success flag

            elif opcode == NO:           # no match

#               print "fail!"
                return isNOTM

            elif opcode == IF:

                # enter logic block if a char sequence matches

                seq = self.table[it+1]
#               print 'seq=' , seq , 'ew=' , ew , 'word=' , word[:ew]
                sln = len(seq)
                if sln > len(word):           # enough chars to match?
                    it += self.table[it]      # if not, skip over block of logic
                else:
                    k = 0
#                   j = -1
#                   print 'at' , j , word[]
                    while k < sln and word[-k-1] == seq[k]:
#                       print 'word[' + str(k) + ']=' , word[j]
                        k += 1
#                       j -= 1
#                   print 'k=' , k
                    if k < sln:               # any characters unmatched?
#                       print 'IF no match'
                        it  += self.table[it] # if so, skip over block of logic
                    else:
#                       print 'IF match'
                        it += 2               # otherwise, enter logic block
                        word = word[:-sln]    # remove matched chars from end of word
                        
            elif opcode == IS:

                # check whether next character is in a specified set

                if len(word) <= 0:            # any letters left in word?
                    it += self.table[it]      # if not, skip over block
                    continue
                chs = self.table[it+1]        # get character set
                c = word[-1]
#               print c, ':', chs
                if chs.find(c) < 0:           # word character in set?

                    it += self.table[it]      # if not, skip block
                else:
                    it += 2                   # if so, enter block

            elif opcode < Nlen:

                # check length of word

                k = self.table[it+1]          # comparison length
#               print "k= ", k, " : m= ", m , 'opcode=' , opcode
                if   opcode == LT:            # set match flag for type of comparison
                    match = (m <  k)          #
                elif opcode == GT:            #
                    match = (m >  k)          #
                elif opcode == EQ:            #
                    match = (m == k)          #
                elif opcode == NE:            #
                    match = (m != k)          #
                else:
                    return isNOTM

#               print "match= ", match

                if not match:                 # if no match, skip block
                    it += self.table[it]
                else:                         # otherwise, go into logic of block
                    it += 2                   #

            elif opcode == MO:                # continue to another logic table

                token.root = token.root[:msh]
#               print 'for more, set root=' , token.root
                return doMORE                 # let other table figure out what to do

            elif opcode == VO:                # look for CVC pattern at end of stemming
                                              # and possibly restore -E

                word = token.root[:msh]       # strip ending from end of word
#               print 'vowel check for' , word

                me = len(word) - 2            # at possible vowel in stemming result
                                              # last char assumed to be consonant

#               print 'me=' , me
                if me < 0 or ellyChar.isStrongConsonant(word[me]):
                    token.root = word
                    return isMTCH

                me -= 1                       # vowel found; now check for consonant
#               print 'me=' , me

                if me < 0 or ellyChar.isStrictVowel(word[me]):
                    return isMTCH
                if me <= 0 or word[me] != u'u' or word[me - 1] == u'q':
                    word.append(u'e')         # put back -E
                token.root = word
#               print 'final word=' , word
                return isMTCH

            else:

                return isNOTM

        raise ellyException.StemmingError
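
# The YE action above strips a matched ending, optionally drops further
# letters or restores the last one removed, and then appends replacement
# characters. Below is a minimal standalone sketch of that single action;
# the helper name and arguments are hypothetical and do not reproduce the
# real Elly table encoding:

def apply_ending_rule(word, suffix, drop=0, restore=None, extend=''):
    """Strip suffix, then drop extra chars or restore one, then extend."""
    if not word.endswith(suffix) or len(word) <= len(suffix):
        return None                              # rule does not apply
    stem = word[:len(word) - len(suffix)]
    if restore is not None:
        stem += restore                          # restoration case (negative count in apply())
    elif drop > 0:
        stem = stem[:len(stem) - drop]           # drop additional letters
    return stem + extend

print(apply_ending_rule('hopping', 'ing', drop=1))       # hop
print(apply_ending_rule('making', 'ing', extend='e'))    # make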